In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import skew, kurtosis

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    if paths:
        print('\n'.join(paths))

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first 7 rows of the raw dataset
data.head(7)

In [None]:
# The id feature already looks unnecessary; it is dropped further down in the notebook.
# Show column dtypes and non-null counts.
data.info()

In [None]:
# Count missing values per column, largest first (bmi is the column with gaps)
missing_counts = data.isnull().sum().sort_values(ascending=False)
print(missing_counts)
print("Proportion of missing values for bmi: ", missing_counts['bmi']*100/len(data),'%')

So far it can be seen that the only null values are in the BMI column, and they number only 201 out of 5110 samples (about 3.9%). This is a low proportion, so we can likely drop these samples or impute them without losing too much variance.

In [None]:
# Summary statistics for the numeric columns
data.describe()

In [None]:
# Show only the columns with a numeric dtype
data.select_dtypes(include='number')

In [None]:
# Some numeric columns are really discrete/binary flags; the number of
# distinct values per numeric column makes those easy to spot
numeric_cols = data.select_dtypes(include='number')
numeric_cols.nunique()

In [None]:
# There are three numeric binary columns: hypertension, heart_disease and stroke (the target).
# Keep the continuous features and the categorical features in separate lists.
cont_features = ['age', 'avg_glucose_level', 'bmi']
# Categorical = the two binary numeric flags plus every non-numeric column
cat_features = ['hypertension', 'heart_disease'] + data.select_dtypes(exclude=np.number).columns.tolist()

In [None]:
# Display the categorical feature names collected above
cat_features

In [None]:
# Display the continuous feature names collected above
cont_features

In [None]:
# Sanity check: every column is accounted for once id and the label (the +2) are added back
n_listed_features = len(cat_features) + len(cont_features)
len(data.columns) == n_listed_features + 2

So far we have checked which features have missing values (which isn't a big deal) and which features are categorical and which are continuous. Now for some Exploratory Data Analysis.

In [None]:
# One histogram per continuous feature, split by stroke outcome
sns.set_style("white")
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(13, 11))
for ax, feat in zip(axes.flat, cont_features):
    sns.histplot(data=data, x=feat, hue='stroke', ax=ax)

The plots are not very conclusive about which features point to a higher risk of stroke: the first subplot does suggest that a person is at a higher risk if above the age of 40, and this goes up dramatically as s/he approaches 80.

Also note that two graphs look rather skewed (both positively skewed).

In [None]:
# Report the sample skewness of each continuous feature (NaNs dropped first)
for feat in cont_features:
    feat_skew = skew(data[feat].dropna())
    print(f"Skewness of {feat}= {feat_skew}")

Make a mental note to:
- Impute values for BMI NaNs
- Standardize/Logarithmically scale the BMI and avg_glucose_level features

In [None]:
# Compare skewness before and after a log1p transform for the two skewed features.
# NaNs are dropped from bmi first; avg_glucose_level is used as-is.
bmi_values = data['bmi'].dropna()
glucose_values = data['avg_glucose_level']

print("Skewness of bmi feature BEFORE logarithmic scaling:")
print(skew(bmi_values))
print("Skewness of bmi feature AFTER logarithmic scaling:")
print(skew(np.log1p(bmi_values)))
print('-'*120)
print("Skewness of avg_glucose_level feature BEFORE logarithmic scaling:")
print(skew(glucose_values))
print("Skewness of avg_glucose_level feature AFTER logarithmic scaling:")
print(skew(np.log1p(glucose_values)))

The Skewness of these features does seem to go down considerably by applying logarithmic scaling. We need to address avg_glucose_level a bit more seriously though since any skewness above 0.5 is not very healthy.

In [None]:
# Histogram of log1p(bmi), using only rows without missing values
np.log1p(data.dropna()['bmi']).plot.hist()

In [None]:
# Histogram of log1p(avg_glucose_level)
np.log1p(data['avg_glucose_level']).plot.hist()

Now that we have played with the continuous features' distributions, let's check out those of the categorical features.

In [None]:
# Count plot per categorical feature, split by stroke outcome, on a 4x2 grid
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(18, 14))
for ax, feat in zip(axes.flat, cat_features):
    sns.countplot(x=feat, hue='stroke', data=data, ax=ax)
# The bottom-right panel shows the distribution of the target itself
sns.countplot(x='stroke', data=data, ax=axes[3, 1])

- hypertension and heart_disease happen to be the best indicators out of the lot for predicting strokes (obviously).  
- smoking_status seems to be a fairly decent indicator as well.
- Unfortunately, it seems our target variable has an extremely unbalanced distribution so we cannot expect our models to be fantastic right off the bat.

In [None]:
# Check proportion of samples suffering strokes.
# The positive class is stroke == 1; counting stroke == 0 here would report
# the share of people WITHOUT a stroke under a "suffering from a stroke" label.
stroke_pct = len(data[data['stroke']==1]) * 100 / len(data)
print("Percentage of People in this dataset suffering from a stroke: {}%".format(stroke_pct))
print("Percentage of People in this dataset NOT suffering from a stroke: {}%".format(100 - stroke_pct))

Over 95% of our samples are not positive instances- the dataset is not likely to make it easy for the algorithms to find patterns.

Now that we have explored the data, we can start some feature engineering and some transformations:
- get rid of the residence_type variable since that shows absolutely no patterns with the stroke
- apply One-Hot Encoding to the categorical variables and StandardScaler to everything else; DONT forget to impute values for bmi
- finalize our pipeline with ColumnTransformer

In [None]:
# Remove the Residence_type column; it showed no pattern with stroke and only slows training
data = data.drop(columns=['Residence_type'])

In [None]:
# The id column is not used as a feature, so remove it as well
data = data.drop(columns=['id'])

In [None]:
# Show the rows whose gender is 'Other'; with only a single such sample the
# class can be folded away to make things easier for the algorithms
is_other_gender = data['gender'] == 'Other'
data.loc[is_other_gender]

In [None]:
# Work on a copy first so the replacement can be tried out without touching data
cop = data.copy()

In [None]:
# Fold the 'Other' gender into 'Male'. Assign the result back instead of calling
# replace(..., inplace=True) on the column selection: the chained in-place form
# acts on a temporary object and does not reliably update the frame under
# pandas Copy-on-Write.
cop['gender'] = cop['gender'].replace('Other', 'Male')

In [None]:
# Count the distinct gender values remaining in the copy
cop['gender'].nunique()

In [None]:
# Apply this logic to the main dataframe.
# Assign the result back rather than using replace(..., inplace=True) on the
# column selection, which does not reliably write through to the frame under
# pandas Copy-on-Write.
data['gender'] = data['gender'].replace('Other', 'Male')

In [None]:
# Print the number of distinct values in every remaining column
for col, n_unique in data.nunique().items():
    print(col, n_unique)

In [None]:
# Rebuild the categorical and continuous feature lists for the transformations below
cat_features = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'smoking_status']
cont_features = ['age', 'avg_glucose_level', 'bmi']

# Sanity check: the listed features plus the label (the +1) cover every column
len(data.columns) == len(cat_features) + len(cont_features) + 1

In [None]:
# Impute missing bmi values with the column mean
bmi_mean = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(bmi_mean)

In [None]:
# Transformation final steps

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# Separate the features from the target
X = data.drop('stroke',axis=1)
y = data['stroke']

# Scale the continuous features and one-hot encode the categorical ones.
# handle_unknown="ignore" keeps transform() from raising if the validation
# split contains a category that was absent from the training split.
pipeline = ColumnTransformer([
    ("num_scaler",StandardScaler(),cont_features),
    ("encoder",OneHotEncoder(handle_unknown="ignore"),cat_features)
])

# Split BEFORE fitting the transformers so that the scaler statistics and the
# encoder categories are learned from the training rows only (no leakage from
# the validation rows). stratify=y keeps the rare positive class at the same
# proportion in both splits.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
X_train = pipeline.fit_transform(X_train_raw)
X_val = pipeline.transform(X_val_raw)

# Keep X bound to the transformed full feature matrix for later inspection
X = pipeline.transform(X)

In [None]:
# Inspect the transformed feature matrix produced by the ColumnTransformer
X

In [None]:
# import models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

In [None]:
# Candidate classifiers (seeded where a random_state is passed)
log = LogisticRegression()
mlp = MLPClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)
rfc = RandomForestClassifier(n_estimators=90, random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
svc = LinearSVC(random_state=42)
xgb = XGBClassifier()

estimators = [log, mlp, knn, rfc, gbc, ada, svc, xgb]

# Fit each model on the training split and report its score on the
# held-out validation split
for estimator in estimators:
    print(f"Training {estimator}")
    estimator.fit(X_train, y_train)
    val_score = estimator.score(X_val, y_val)
    print(f"Score: {val_score}")

Note that most of our classifiers look to perform extremely well, but in fact this is not true. Recall that the distribution of the target variable is extremely unbalanced: over 95% of the instances have negative labels (no stroke), hence we could have gotten a similar or even higher accuracy with a classifier that simply predicted '0' each and every time.

In [None]:
from sklearn.base import BaseEstimator

class Always0Classifier(BaseEstimator):
    """Baseline classifier that ignores its input and always predicts class 0.

    Useful as a reference point on an imbalanced target: any real model
    should at least beat this accuracy.
    """

    def fit(self,X,y=None):
        """Nothing to learn; returns self so the estimator can be chained."""
        return self

    def predict(self,X,y=None):
        """Return a 1-D integer array of zeros with one entry per sample in X.

        The sample count is read from ``X.shape[0]`` when available because
        ``len()`` raises on scipy sparse matrices; plain sequences fall back
        to ``len(X)``.
        """
        n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
        # 1-D integer labels, matching the usual predict() output shape
        return np.zeros(n_samples, dtype=int)

In [None]:
# Mean 6-fold cross-validated accuracy of the always-0 baseline on the training split
dummy = Always0Classifier()
dummy_scores = cross_val_score(dummy, X_train, y_train, cv=6, scoring="accuracy")
dummy_scores.mean()

It is rather funny to see us getting a higher accuracy simply off always saying the person has less risk of a stroke- this is taking advantage of the dataset so don't do this lol.

In [None]:
# Make an ensemble just to see if we can do better
named_estimators = [
    ("log",log),("mlp",mlp),("knn",knn),("rfc",rfc),("gbc",gbc),("ada",ada),("svc",svc),("xgb",xgb)
]

voting_clf = VotingClassifier(named_estimators)

In [None]:
# Fit the ensemble on the training split and score it on the validation split
print("Training")
voting_clf.fit(X_train, y_train)
ensemble_score = voting_clf.score(X_val, y_val)
print(f"Done. Score: {ensemble_score}")

While none of our estimators technically performed badly, they did not perform better than a dummy classifier that always predicts the same label. The hard-voting classifier did not outperform its best constituent classifier either, so:
- If we wanted a legit submission, we would use the XGB Classifier here
- If we were naughty, we would use the Always0Classifier since that has the best technical accuracy here

In [None]:
# Once again, checking the metrics for xgb
from sklearn.metrics import confusion_matrix, precision_score, recall_score


predictions = xgb.predict(X_val)

# sklearn metric functions take (y_true, y_pred) in that order; with the
# arguments swapped the confusion matrix comes out transposed.
# Rows are the true classes, columns are the predicted classes.
print(confusion_matrix(y_val, predictions))
print("Accuracy: ",xgb.score(X_val,y_val))
# Use the default binary averaging so recall and precision describe the
# positive (stroke == 1) class. With average="micro" on a binary target both
# numbers collapse to the accuracy and say nothing about the minority class.
print("Recall: ",recall_score(y_val,predictions))
print("Precision: ",precision_score(y_val,predictions))

Life is weird when you have an unbalanced dataset :(