In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# NOTE: plot_partial_dependence and plot_confusion_matrix were removed in
# scikit-learn 1.2; these imports only resolve on older releases.
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Location of the stroke dataset inside the Kaggle input mount.
data_path = Path('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

df = pd.read_csv(data_path)
df.head()

> # Data Visualization

In [None]:
# Number of records per distinct age; `id` is only used as the column to count.
age = (
    df.groupby('age')['id']
    .count()
    .reset_index()
    .assign(count=lambda counts: counts['id'])
)

fig = px.scatter(age, x="age", y="count", title='age distribution')
fig.show()

In [None]:
# Glucose level against age, one colour per gender.
fig = px.scatter(
    df,
    x="age",
    y="avg_glucose_level",
    color="gender",
)
fig.show()

In [None]:
# Per smoking status: number of strokes, number of records, and stroke rate.
# Aggregating only the columns we need avoids summing string columns.
sm_st = (
    df.groupby('smoking_status')
    .agg(stroke=('stroke', 'sum'), total=('id', 'count'))
    .reset_index()
)
sm_st['no stroke'] = sm_st['total'] - sm_st['stroke']
sm_st['% of strokes'] = (sm_st['stroke'] / sm_st['total'] * 100).round(1)

# The bars are stacked, so the lower segment must be the no-stroke count
# (not the total); otherwise each bar would be total + stroke high.
fig = go.Figure(data=[
    go.Bar(name='no stroke', x=sm_st.smoking_status, y=sm_st['no stroke'].tolist()),
    go.Bar(name='stroke', x=sm_st.smoking_status, y=sm_st.stroke.tolist(),
           text=sm_st['% of strokes'], texttemplate='%{text}%', textposition='outside')
])
fig.update_layout(barmode='stack', title='Stroke ratio relative to smoking status')
fig.show()

In [None]:
# Encode ever_married as 1 ('Yes') / 0 (anything else).
# Accepting an already-encoded 1 keeps the cell idempotent: re-running it no
# longer turns the whole column into zeros.
df['ever_married'] = df['ever_married'].isin(['Yes', 1]).astype(int)

binary_cols = ['hypertension', 'heart_disease', 'ever_married']

fig = make_subplots(rows=1, cols=len(binary_cols),
                    specs=[[{"type": "pie"} for _ in binary_cols]],
                    shared_xaxes=True, shared_yaxes=False, vertical_spacing=0.01)

# One donut per binary column; value_counts is computed once per column.
for position, binary_col in enumerate(binary_cols, start=1):
    counts = df[binary_col].value_counts()
    fig.add_trace(go.Pie(labels=counts.index, values=counts.values, title=binary_col),
                  1, position)
fig.update_traces(hole=.5)

fig.show()

In [None]:
# Record count per work type, computed once and reused for height and label.
work_type_counts = df.work_type.value_counts()

work_type_bar = go.Bar(
    x=work_type_counts.index,
    y=work_type_counts.values,
    text=work_type_counts.values,
)
fig = go.Figure([work_type_bar])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title='work type')
fig.show()

In [None]:
def pie_plot(col):
    """Return a 500x500 pie chart of the value counts of ``df[col]``.

    Reads the notebook-level ``df``; the column name is also used as the
    chart title.
    """
    counts = df[col].value_counts()
    pie = go.Pie(labels=counts.index, values=counts.values, title=col)
    fig = go.Figure(data=[pie])
    fig.update_layout(autosize=False, width=500, height=500)
    return fig

In [None]:
# Share of each value in the gender column.
pie_plot('gender')

Remove the rare `Other` category from the `gender` column.

In [None]:
# Keep every row whose gender is not 'Other', then renumber the index from 0.
df = df.loc[df['gender'].ne('Other')].reset_index(drop=True)

In [None]:
# Share of each value in the Residence_type column.
pie_plot('Residence_type')

In [None]:
# Share of each value in the target column (stroke).
pie_plot('stroke')

In [None]:
def box_plot(x, y):
    """Return an 800x500 box plot of ``df[y]`` grouped by ``df[x]``.

    Reads the notebook-level ``df``. Every observation is drawn next to its
    box, and the title is the two column names joined by ' & '.
    """
    plot_options = {
        'points': 'all',
        'title': ' & '.join((x, y)),
        'width': 800,
        'height': 500,
    }
    return px.box(df, x=x, y=y, **plot_options)

In [None]:
# BMI distribution for each value of stroke.
box_plot('stroke','bmi')

In [None]:
# Age distribution for each value of stroke.
box_plot('stroke','age')

There are stroke cases at ages 1.3 and 14, which are outliers.

In [None]:
# Age distribution restricted to the rows where stroke == 1.
px.box(df[df['stroke']==1]['age'])


Let's remove those outliers (stroke cases with an age below 15).


In [None]:
# Rows that are stroke cases AND younger than 15, selected with a single mask.
is_young_stroke = df['stroke'].eq(1) & df['age'].lt(15)
idx_to_drop = df.index[is_young_stroke]

# Drop them and renumber the index from 0.
df = df.drop(index=idx_to_drop).reset_index(drop=True)

In [None]:
# Average glucose level distribution for each value of stroke.
box_plot('stroke','avg_glucose_level')

In [None]:
# Average glucose level against BMI, one point per record.
fig = px.scatter(df, x="bmi", y="avg_glucose_level")
fig.show()

In [None]:
# Pairwise correlation of the numeric columns (id excluded).
# Selecting numeric columns explicitly is required on pandas >= 2.0, where
# DataFrame.corr() raises on string columns instead of silently skipping them.
corr = df.drop('id', axis=1).select_dtypes(include='number').corr()
data = go.Heatmap(z=corr.values,
                  y=corr.index.values,
                  x=corr.columns.values,
                  colorscale='RdBu')
fig = go.Figure(data=data)
fig.update_layout(title='Correlation between numeric features')
fig.show()

In [None]:
# Correlation of every numeric feature with the target.
# `id` and `stroke` itself are removed by name rather than by slicing off the
# first and last sorted entries, which only worked if `id` happened to have the
# lowest correlation. Numeric columns are selected explicitly because
# DataFrame.corr() raises on string columns on pandas >= 2.0.
(
    df.select_dtypes(include='number')
    .corr()['stroke']
    .drop(['id', 'stroke'], errors='ignore')
    .sort_values()
    .plot(kind='barh', title='Correlation with stroke')
)

> # Data Preparation

In [None]:
# `id` is a record identifier, not a feature. errors='ignore' makes the cell
# safe to re-run after the column has already been removed.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Single stratified 67/33 split that preserves the stroke ratio in both parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=666)
train_index, test_index = next(split.split(df, df['stroke']))

# split() yields positional indices, so select with .iloc; .loc only gave the
# same rows because the index had just been reset to 0..n-1.
train = df.iloc[train_index]
test = df.iloc[test_index]

In [None]:
# Separate the features from the `stroke` target for both partitions.
X_train, y_train = train.drop(columns='stroke'), train['stroke']
X_test, y_test = test.drop(columns='stroke'), test['stroke']

In [None]:
# Object-dtype columns are treated as categorical; everything else is numeric.
cols_to_encode = [name for name in X_train.columns if X_train[name].dtype == 'O']
num_col = [name for name in X_train.columns if name not in cols_to_encode]

print(f'categorical cols: {cols_to_encode}', f'\nnumerical cols: {num_col}')

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class Encoder(BaseEstimator, TransformerMixin):
    """Impute missing ``bmi`` values and one-hot encode categorical columns.

    Parameters
    ----------
    num_col : list of str, optional
        Columns passed through unchanged. When omitted, every column that is
        not categorical is used.
    cat_col : list of str, optional
        Columns to one-hot encode. When omitted, every object-dtype column of
        the frame seen by ``fit`` is used.

    Attributes set by ``fit``
    -------------------------
    num_col_, cat_col_ : resolved column lists.
    bmi_median_ : median of ``bmi`` in the fitting data, used for imputation.
    dummy_columns_ : one-hot column names produced from the fitting data.

    Notes
    -----
    The imputation value and the dummy columns are learned once and reused,
    so a later frame (e.g. the test split) is imputed with the training median
    and always gets the same output columns: categories absent from it become
    all-zero columns, and categories unseen during ``fit`` are dropped.
    """

    def __init__(self, num_col=None, cat_col=None):
        # Store parameters untouched, as BaseEstimator.get_params expects.
        self.num_col = num_col
        self.cat_col = cat_col

    def fit(self, X, y=None):
        """Learn the column lists, the ``bmi`` median and the dummy columns from ``X``."""
        cat_cols = self.cat_col
        if cat_cols is None:
            cat_cols = [c for c in X.columns if X[c].dtype == 'O']
        num_cols = self.num_col
        if num_cols is None:
            num_cols = [c for c in X.columns if c not in cat_cols]

        self.cat_col_ = list(cat_cols)
        self.num_col_ = list(num_cols)
        self.bmi_median_ = X['bmi'].median()
        self.dummy_columns_ = list(self._one_hot(X).columns)
        return self

    def _one_hot(self, X):
        """Return the one-hot columns for the categorical columns of ``X``."""
        dummies = [pd.get_dummies(X[column], prefix=column) for column in self.cat_col_]
        if not dummies:
            return pd.DataFrame(index=X.index)
        return pd.concat(dummies, axis=1)

    def transform(self, X):
        """Return a new frame: numeric columns followed by the one-hot columns.

        ``X`` itself is never modified.
        """
        # Callers that invoke transform() directly on the training frame
        # without fit() keep working: the first frame seen is used for fitting.
        if not hasattr(self, 'dummy_columns_'):
            self.fit(X)

        # assign() returns a copy, so the caller's frame is left untouched.
        X = X.assign(bmi=X['bmi'].fillna(self.bmi_median_))
        num = X[self.num_col_]
        # Align to the columns learned in fit() so train and test always match.
        cat = self._one_hot(X).reindex(columns=self.dummy_columns_, fill_value=0)

        return pd.concat([num, cat], axis=1)

In [None]:
# Fit on the training split and encode it in one step (TransformerMixin
# provides fit_transform), instead of calling transform() on an unfitted
# transformer.
encoder = Encoder()
X_train = encoder.fit_transform(X_train)

> # Training

In [None]:
# Baseline forest on all encoded features. The seed makes the feature
# importances (and therefore the feature selection below) reproducible.
rfc = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5,
                             n_jobs=-1, oob_score=True, random_state=666)
rfc.fit(X_train, y_train)

In [None]:
# Importance of every encoded feature according to the fitted forest,
# ordered from most to least important.
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rfc.feature_importances_})
    .sort_values('Importance', ascending=False)
)

In [None]:
fig = go.Figure([go.Bar(
    x=feature_importance.Feature,
    y=feature_importance.Importance,
    text=feature_importance.Importance)])
# Importances are fractions below 1: the SI format '.2s' would label 0.35 as
# '350m', so use a fixed-point format instead.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(title='Feature importance')
fig.show()

In [None]:
# Names of the features whose importance exceeds 0.02.
to_keep = feature_importance.loc[feature_importance['Importance'].gt(0.02), 'Feature'].values

In [None]:
# Keep only the selected features in the training matrix.
X_train = X_train[to_keep]

In [None]:
# Refit on the reduced feature set; seeded so repeated runs give the same model.
rfc = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.4,
                             n_jobs=-1, oob_score=True, random_state=666)
rfc.fit(X_train, y_train)

In [None]:
param_grid = [
    {'n_estimators': [40, 60, 90, 120], 'max_features': [0.4, 0.5, 0.7], 'min_samples_leaf': [2, 3, 4, 5]},
  ]

# Tune with a classification metric. 'neg_mean_squared_error' is a regression
# scorer; on 0/1 predictions it reduces to the error rate, which on a heavily
# imbalanced target rewards always predicting the majority class. ROC AUC is
# the metric this notebook reports for the final model.
grid_search = GridSearchCV(rfc, param_grid, cv=3,
                           scoring='roc_auc',
                           return_train_score=True)

grid_search.fit(X_train, y_train)

In [None]:
# Final model with the hyper-parameters selected by the grid search
# (n_estimators, min_samples_leaf, max_features). Seeded for reproducibility.
rfc = RandomForestClassifier(**grid_search.best_params_,
                             n_jobs=-1,
                             oob_score=True,
                             random_state=666)
rfc.fit(X_train, y_train)

> # Predictions

In [None]:
# Encode the test split, then align it to the selected training features.
# reindex() adds an all-zero column for any selected dummy that does not occur
# in the test split, where plain column selection would raise a KeyError.
X_test = encoder.transform(X_test)
X_test = X_test.reindex(columns=to_keep, fill_value=0)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display from an explicit confusion matrix gives the same plot
# and works on both old and current releases.
cm = confusion_matrix(y_test, rfc.predict(X_test), labels=rfc.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rfc.classes_).plot()

The confusion matrix shows that the model fails to identify stroke cases (the positive class), so it needs improvement.

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score

In [None]:
# Hard class predictions for the test split.
preds = rfc.predict(X_test)
# Second column of predict_proba, used below as the score for ROC AUC.
prob = rfc.predict_proba(X_test)[:,1]

In [None]:
# Per-class precision / recall / F1 computed from the hard predictions.
print(classification_report(y_test, preds))
# ROC AUC is computed from the predicted scores, not the hard labels.
print(f'ROC AUC score: {roc_auc_score(y_test, prob)}')
print('Accuracy Score: ',accuracy_score(y_test, preds))
print('F1 Score: ',f1_score(y_test, preds))
print('Recall: ', recall_score(y_test, preds))

In [None]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, prob)
# Area under the plotted curve, shown in the legend.
roc_auc = auc(false_positive_rate, true_positive_rate)

fig = go.Figure()

fig.add_trace(go.Scatter(x=false_positive_rate, y=true_positive_rate,
                         mode='lines', name=f'ROC (AUC = {roc_auc:.3f})'))
# Chance diagonal from (0, 0) to (1, 1); the previous arange stopped at 0.99
# and the trace had no legend name.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                         name='chance', line=dict(dash='dash')))

fig.update_layout(
        title='ROC curve',
        xaxis_title='False positive rate',
        yaxis_title='True positive rate',
        autosize=False,
        width=600,
        height=600)
fig.show()

---------