In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Data

In [None]:
# Read the stroke dataset from the Kaggle input directory; the `id` column is
# used as the row index instead of being kept as a regular column.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path, index_col='id')
df

In [None]:
# Names of the columns to label-encode, selected by position in the loaded
# frame (presumably the text-valued columns — verify against the CSV header).
encode_positions = [0, 4, 5, 6, 9]
df_to_enc_list = list(df.columns[encode_positions])

In [None]:
# Overwrite each selected column with integer codes. A fresh LabelEncoder is
# fitted for every column, so the codes of different columns are unrelated.
for column_name in df_to_enc_list:
    encoded_values = LabelEncoder().fit_transform(df[column_name])
    df[column_name] = encoded_values

In [None]:
# Print the column dtypes and non-null counts of the frame after encoding.
df.info()

The `bmi` column has around 200 missing values; rows with missing values are dropped in the next cell.

In [None]:
# Drop rows containing any missing value once, then separate the target
# column from the feature columns.
df_complete = df.dropna()
target = df_complete.stroke
feat = df_complete.drop(columns=['stroke'])

# NOTE(review): the scaler is fitted on every remaining row, i.e. before the
# train/test split performed later, so held-out rows influence the scaling.
# Fit it on the training split only if a leakage-free evaluation is required.
scaler = RobustScaler()

# The scaler returns a bare ndarray; pass the original row index back in so
# that `feat` stays label-aligned with `target` instead of silently getting a
# fresh 0..n-1 index.
feat = pd.DataFrame(scaler.fit_transform(feat),
                    columns=feat.columns.values.tolist(),
                    index=feat.index)

In [None]:
# Histogram of the target values.
dist_fig, dist_ax = plt.subplots()
dist_ax.hist(target)
dist_ax.set_title('Stroke distribution')
plt.show()

## Feature Importances

In [None]:
# Fit three tree-based classifiers on all rows to compare how each one ranks
# the features. A fixed seed (the same value used for the train/test split)
# makes the importances - and the conclusions drawn from them - reproducible
# across kernel restarts.
clfDTC = DecisionTreeClassifier(random_state=42).fit(feat, target)
clfRFC = RandomForestClassifier(random_state=42).fit(feat, target)
clfGBC = GradientBoostingClassifier(random_state=42).fit(feat, target)

# One importance value per feature column, in the column order of `feat`.
im_DTC = clfDTC.feature_importances_
im_RFC = clfRFC.feature_importances_
im_GBC = clfGBC.feature_importances_

fig, ax = plt.subplots(1,3,sharey=True,figsize=(16,6))
fig.suptitle('Feature Importances')

feature_names = feat.columns.tolist()
importance_panels = [
    ('Decision Tree Classifier', im_DTC),
    ('Random Forest Classifier', im_RFC),
    ('Gradient Boosting Classifier', im_GBC),
]

# One single-column heatmap per model; the importances are expanded to shape
# (n_features, 1) because heatmap expects 2-D data.
for panel_ax, (panel_title, importances) in zip(ax, importance_panels):
    sns.heatmap(ax=panel_ax, data=np.expand_dims(importances, axis=1),
                annot=True, robust=True,
                xticklabels=[None], yticklabels=feature_names)
    panel_ax.set_title(panel_title)

plt.show()

* *Age*, *avg_glucose_level* and *bmi* are the most important features, together accounting for roughly 70-80% of the total feature importance.

Choosing Gradient Boosting Classifier:

* *Gender*, *Residence_type* and *Ever_married* are pure noise.

# Model Prediction

## Gradient Boosting Classifier

In [None]:
# Rank the features by their Gradient Boosting importance (largest first) and
# accumulate a running total alongside each feature name.
var_exp = sorted(
    zip(feat.columns.values.tolist(), im_GBC),
    key=lambda pair: pair[1],
    reverse=True,
)
cum_var_exp = []
cum_var = 0
for feature_name, importance in var_exp:
    cum_var += importance
    cum_var_exp.append([feature_name, cum_var])
print('Cummulative variance explained:')    
cum_var_exp

In [None]:
# Keep every feature whose running importance total is at most 0.98.
predictor_feat = [
    feature_name
    for feature_name, running_total in cum_var_exp
    if running_total <= 0.98
]

print('Predictor features')
predictor_feat

In [None]:
# Stratified, seeded 75/25 split restricted to the selected predictor columns.
selected_feat = feat[predictor_feat]
X_tr, X_ts, Y_tr, Y_ts = train_test_split(
    selected_feat,
    target,
    test_size=0.25,
    stratify=target,
    shuffle=True,
    random_state=42,
)

In [None]:
# Seed the estimator so the fitted model, and therefore the reported score
# and confusion matrix, are identical on every run.
clf = GradientBoostingClassifier(random_state=42).fit(X_tr,Y_tr)

Y_predGBC = clf.predict(X_ts)

# NOTE(review): if the stroke classes are heavily imbalanced, accuracy alone
# can look good for a model that rarely predicts 'YES' - read it together
# with the confusion matrix.
score = accuracy_score(Y_ts,Y_predGBC)
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(Y_ts, Y_predGBC)

cm_fig, cm_ax = plt.subplots()

sns.heatmap(data = conf_mat, annot=True, robust=True, fmt='.4g',
            xticklabels=['NO','YES'],yticklabels=['NO','YES'], ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Accuracy Score: {:.3f}'.format(score))

plt.show()

## Random Forest Classifier

In [None]:
# Rank the features by their Random Forest importance (largest first) and
# accumulate a running total alongside each feature name.
var_exp = sorted(
    zip(feat.columns.values.tolist(), im_RFC),
    key=lambda pair: pair[1],
    reverse=True,
)
cum_var_exp = []
cum_var = 0
for feature_name, importance in var_exp:
    cum_var += importance
    cum_var_exp.append([feature_name, cum_var])
print('Cummulative variance explained:')    
cum_var_exp

In [None]:
# Keep every feature whose running importance total is at most 0.98.
predictor_feat = [
    feature_name
    for feature_name, running_total in cum_var_exp
    if running_total <= 0.98
]

print('Predictor features')
predictor_feat

In [None]:
# Stratified, seeded 75/25 split restricted to the selected predictor columns.
X_tr, X_ts, Y_tr, Y_ts = train_test_split(feat[predictor_feat], target, test_size=0.25,stratify=target,
                                         shuffle=True, random_state=42)

# Seed the forest as well: without it the bootstrap samples and feature
# subsets differ between runs, so the score below would not be reproducible.
clf = RandomForestClassifier(random_state=42).fit(X_tr,Y_tr)

Y_predRFC = clf.predict(X_ts)

# NOTE(review): if the stroke classes are heavily imbalanced, accuracy alone
# can look good for a model that rarely predicts 'YES' - read it together
# with the confusion matrix.
score = accuracy_score(Y_ts,Y_predRFC)
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(Y_ts, Y_predRFC)

cm_fig, cm_ax = plt.subplots()

sns.heatmap(data = conf_mat, annot=True, robust=True, fmt='.4g',
            xticklabels=['NO','YES'],yticklabels=['NO','YES'], ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Accuracy Score: {:.3f}'.format(score))

plt.show()

## Decision Tree Classifier

In [None]:
# Rank the features by their Decision Tree importance (largest first) and
# accumulate a running total alongside each feature name.
var_exp = sorted(
    zip(feat.columns.values.tolist(), im_DTC),
    key=lambda pair: pair[1],
    reverse=True,
)
cum_var_exp = []
cum_var = 0
for feature_name, importance in var_exp:
    cum_var += importance
    cum_var_exp.append([feature_name, cum_var])
print('Cummulative variance explained:')    
cum_var_exp

In [None]:
# Keep every feature whose running importance total is at most 0.98.
predictor_feat = [
    feature_name
    for feature_name, running_total in cum_var_exp
    if running_total <= 0.98
]

print('Predictor features')
predictor_feat

In [None]:
# Stratified, seeded 75/25 split restricted to the selected predictor columns.
X_tr, X_ts, Y_tr, Y_ts = train_test_split(feat[predictor_feat], target, test_size=0.25,stratify=target,
                                         shuffle=True, random_state=42)

# Seed the tree so that ties between equally good splits are broken the same
# way on every run and the reported score is reproducible.
clf = DecisionTreeClassifier(random_state=42).fit(X_tr,Y_tr)

Y_predDTC = clf.predict(X_ts)

# NOTE(review): if the stroke classes are heavily imbalanced, accuracy alone
# can look good for a model that rarely predicts 'YES' - read it together
# with the confusion matrix.
score = accuracy_score(Y_ts,Y_predDTC)
# Rows are the true classes, columns the predicted classes.
conf_mat = confusion_matrix(Y_ts, Y_predDTC)

cm_fig, cm_ax = plt.subplots()

sns.heatmap(data = conf_mat, annot=True, robust=True, fmt='.4g',
            xticklabels=['NO','YES'],yticklabels=['NO','YES'], ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Accuracy Score: {:.3f}'.format(score))

plt.show()

# Solution

In [None]:
# Side-by-side test-set predictions of the three models, one column per model,
# indexed like the most recent test split.
predictions_by_model = {
    'Gradient Boosting': Y_predGBC,
    'Random Forest': Y_predRFC,
    'Decision Tree': Y_predDTC,
}
pd.DataFrame(predictions_by_model, index=X_ts.index)