<div class="alert alert-info" 
     style="background-color:#008a79; color:white; padding:0px 10px; border-radius:10px;">
    <h1 style='margin:10px 5px'>Stroke: EDA | Predictions</h1>
</div>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, roc_auc_score, plot_roc_curve 
from imblearn.over_sampling import SMOTE 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the stroke dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded frame for a first look at its columns and values.
df

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

In [None]:
# Impute missing BMI values with the column mean.
# Only the `bmi` column is touched: a frame-wide fillna would also write the BMI mean
# into any other column that happens to contain NaN.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Count the stroke cases and express them as a percentage of ALL patients.
# (Dividing by the no-stroke count and omitting the factor 100 reported a ratio, not a percentage.)
had_stroke = df['stroke'] == 1
n_stroke = int(had_stroke.sum())
pct_stroke = 100 * had_stroke.mean()

print('How many people have suffer a stroke in the dataset?')
f'{n_stroke} or the {pct_stroke:.2f}%'

#### There is a strong class imbalance, so I will try to deal with it using SMOTE later on.

<div class="alert alert-info" 
     style="background-color:#008a79; color:white;  padding:0px 10px; border-radius:10px;">     <h2 style='margin:15px 5px'>EDA</h2>
</div>

In [None]:
# Stroke counts per gender, including row and column totals.
pd.crosstab(index=df['gender'], columns=df['stroke'], margins=True)

In [None]:
# Same table expressed as fractions of the whole dataset, rounded to two decimals.
gender_stroke_share = pd.crosstab(index=df['gender'], columns=df['stroke'], margins=True, normalize=True)
gender_stroke_share.round(2)

In [None]:
# Stroke counts per smoking status, including row and column totals.
pd.crosstab(index=df['smoking_status'], columns=df['stroke'], margins=True)

In [None]:
# Same table expressed as fractions of the whole dataset, rounded to two decimals.
smoking_stroke_share = pd.crosstab(index=df['smoking_status'], columns=df['stroke'], margins=True, normalize=True)
smoking_stroke_share.round(2)

In [None]:
# Stroke counts per hypertension flag, including row and column totals.
pd.crosstab(index=df['hypertension'], columns=df['stroke'], margins=True)

In [None]:
# Same table expressed as fractions of the whole dataset, rounded to two decimals.
hypertension_stroke_share = pd.crosstab(index=df['hypertension'], columns=df['stroke'], margins=True, normalize=True)
hypertension_stroke_share.round(2)

In [None]:
# Stroke counts per marital status, including row and column totals.
pd.crosstab(index=df['ever_married'], columns=df['stroke'], margins=True)

In [None]:
# Same table expressed as fractions of the whole dataset, rounded to two decimals.
married_stroke_share = pd.crosstab(index=df['ever_married'], columns=df['stroke'], margins=True, normalize=True)
married_stroke_share.round(2)

In [None]:
# Age vs BMI, coloured by stroke outcome.
# plt.subplots returns (figure, axes); unpack both and draw on the explicit Axes.
fig, ax = plt.subplots(figsize=(15, 15))
points = ax.scatter(df['age'], df['bmi'], c=df['stroke'], alpha=0.4)
# Label the axes with what they actually show and explain the colours in a legend
# instead of using the x-axis label as a colour caption.
ax.set(title='Stroke cluster compared to Age and BMI', xlabel='Age', ylabel='BMI')
ax.legend(*points.legend_elements(), title='stroke');

In [None]:
no_stroke = df[df['stroke'] == 0]
yes_stroke = df[df['stroke'] == 1]

# Overlay both groups on one explicit Axes and label them so the colours can be told apart.
fig, ax = plt.subplots(figsize=(15, 8))
no_stroke['age'].plot(kind='hist', color='green', alpha=0.2, edgecolor='b', label='No stroke', ax=ax)
yes_stroke['age'].plot(kind='hist', color='red', alpha=0.2, edgecolor='b', label='Stroke', ax=ax)
ax.set(title='Frequencies of Stroke vs No Stroke by Age', xlabel='Age')
ax.legend();

In [None]:
no_stroke = df[df['stroke'] == 0]
yes_stroke = df[df['stroke'] == 1]

# Overlay both groups on one explicit Axes and label them so the colours can be told apart.
fig, ax = plt.subplots(figsize=(15, 8))
no_stroke['avg_glucose_level'].plot(kind='hist', color='green', alpha=0.2, edgecolor='b', label='No stroke', ax=ax)
yes_stroke['avg_glucose_level'].plot(kind='hist', color='red', alpha=0.2, edgecolor='b', label='Stroke', ax=ax)
ax.set(title='Frequencies of Stroke vs No Stroke by Avg glucose level', xlabel='Average glucose level')
ax.legend();

In [None]:
no_stroke = df[df['stroke'] == 0]
yes_stroke = df[df['stroke'] == 1]

# Overlay both groups on one explicit Axes and label them so the colours can be told apart.
fig, ax = plt.subplots(figsize=(15, 8))
no_stroke['bmi'].plot(kind='hist', bins=8, color='green', alpha=0.2, edgecolor='b', label='No stroke', ax=ax)
yes_stroke['bmi'].plot(kind='hist', bins=8, color='red', alpha=0.2, edgecolor='b', label='Stroke', ax=ax)
ax.set(title='Frequencies of Stroke vs No Stroke by BMI', xlabel='BMI')
ax.legend();

<div class="alert alert-info" 
     style="background-color:#008a79; color:white;  padding:0px 10px; border-radius:10px;">     <h2 style='margin:15px 5px'>Data transformation</h2>
</div>

In [None]:
# Fit one LabelEncoder per categorical column.
# `fit` returns the encoder itself, so its result must not be written into the frame;
# the encoded values are produced by `transform` below.
gender_enc = LabelEncoder().fit(df['gender'])
ever_married_enc = LabelEncoder().fit(df['ever_married'])
work_type_enc = LabelEncoder().fit(df['work_type'])
Residence_type_enc = LabelEncoder().fit(df['Residence_type'])
smoking_status_enc = LabelEncoder().fit(df['smoking_status'])


def transform(dataset):
    """Return a label-encoded copy of `dataset` without the raw categorical columns.

    Adds one `<column>_enc` column per categorical feature using the encoders fitted
    above, then drops `id` and the original categorical columns. The input frame is
    left untouched so the cell can be re-run safely.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing `id`, `gender`, `ever_married`, `work_type`,
        `Residence_type` and `smoking_status` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns appended after the remaining columns.
    """
    encoded = dataset.copy()
    encoded['gender_enc'] = gender_enc.transform(encoded['gender'])
    encoded['ever_married_enc'] = ever_married_enc.transform(encoded['ever_married'])
    encoded['work_type_enc'] = work_type_enc.transform(encoded['work_type'])
    encoded['Residence_type_enc'] = Residence_type_enc.transform(encoded['Residence_type'])
    encoded['smoking_status_enc'] = smoking_status_enc.transform(encoded['smoking_status'])
    return encoded.drop(['id', 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], axis=1)


df_enc = transform(df)

# Move the target to the last position so that all feature columns come first.
df_enc = df_enc[[col for col in df_enc.columns if col != 'stroke'] + ['stroke']]
df_enc

In [None]:
# Pairwise correlations between the encoded features and the target, shown as an annotated heatmap.
corr_matrix = df_enc.corr()

fig, ax = plt.subplots(figsize=(15, 6))
heatmap_options = dict(annot=True, linewidth=0.5, fmt="2.2f", cmap='YlGnBu')
ax = sns.heatmap(corr_matrix, **heatmap_options);

<div class="alert alert-info" 
     style="background-color:#008a79; color:white;  padding:0px 10px; border-radius:10px;">     <h2 style='margin:15px 5px'>Model</h2>
</div>

In [None]:
# Separate the target vector from the feature matrix.
target_column = 'stroke'
y = df_enc[target_column]
X = df_enc.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; stratify so both splits keep the stroke ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Baseline random forest on the original (imbalanced) training data.
# A fixed random_state makes the forest, and every score derived from it, reproducible
# under Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# cross_val_score fits clones of the estimator on each fold, so the fit above is unaffected.
print(cross_val_score(model, X_train, y_train, cv=3))

In [None]:
# Mean accuracy of the baseline model on the held-out test set.
test_accuracy = model.score(X_test, y_test)
print('Score on the Test set: ', test_accuracy)

In [None]:
# Predict for one hand-built patient.
# Naming every feature and re-ordering to the training columns removes the silent
# dependency on column order and avoids scikit-learn's missing-feature-names warning.
sample_patient = pd.DataFrame(
    [{
        'age': 38,
        'hypertension': 0,
        'heart_disease': 0,
        'avg_glucose_level': df.avg_glucose_level.mean(),
        'bmi': 24,
        'gender_enc': 1,
        'ever_married_enc': 0,
        'work_type_enc': 2,
        'Residence_type_enc': 1,
        'smoking_status_enc': 1,
    }],
    columns=X_train.columns,
)
model.predict(sample_patient)

In [None]:
# Raw importance scores of the baseline forest, in the column order of the training features.
model.feature_importances_

In [None]:
# Label each importance with its feature name and plot them from least to most important.
feature_names = df_enc.columns.drop('stroke')
feat = pd.Series(data=model.feature_importances_, index=feature_names)
feat.sort_values().plot.barh(title='Feature Importance');

In [None]:
# Class predictions plus the probability of the second class (index 1) for every test row.
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_pred_prob = [row[1] for row in class_probabilities]

print("Y predicted: ", y_pred)
print("Y probability predicted: ", y_pred_prob[:5])

In [None]:
# Confusion matrix of the baseline model on the held-out test set.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer environments need ConfusionMatrixDisplay.from_estimator — confirm the installed version.
print("Confusion Matrix:")
plot_confusion_matrix(model, X_test, y_test);

In [None]:
def evaluation_metric(model, X_test, y_test, y_pred, y_pred_prob):
    """Print accuracy, ROC AUC and the classification report, then draw the ROC curve.

    Parameters
    ----------
    model : fitted estimator passed to `plot_roc_curve`.
    X_test, y_test : test features and true labels.
    y_pred : predicted labels for `X_test`.
    y_pred_prob : scores passed to `roc_auc_score` for `X_test`.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy Score: ", accuracy)

    auc = roc_auc_score(y_test, y_pred_prob)
    print("AUC Score: ", auc)

    report = classification_report(y_test, y_pred)
    print("\n Classification Report: \n\n", report)

    print("\n ROC curve: \n")
    plot_roc_curve(model, X_test, y_test);


evaluation_metric(model, X_test, y_test, y_pred, y_pred_prob)

##### I am going to use SMOTE to try to improve the AUC.

In [None]:
# Oversample the minority class of the TRAINING split only; the test split stays untouched.
# A fixed random_state makes the synthetic samples (and everything trained on them) reproducible.
sm = SMOTE(sampling_strategy='auto', k_neighbors=8, random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# Split the resampled training data again, keeping 80% for fitting the resampled model.
# NOTE(review): X_res_test / y_res_test are not used by any later cell visible here — confirm
# whether this second hold-out is needed at all.
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest trained on the SMOTE-resampled data.
# A fixed random_state makes the forest and the reported scores reproducible.
model_res = RandomForestClassifier(random_state=42)
model_res.fit(X_res_train, y_res_train)
# NOTE(review): these CV folds are cut from already-resampled data, so validation folds contain
# synthetic samples; treat this score as optimistic compared with the test-set score below.
print('Train cv score resampled: ', cross_val_score(model_res, X_res_train, y_res_train, cv=3))
print('Test set score resampled: ', model_res.score(X_test, y_test))

In [None]:
# Class predictions plus the probability of the second class (index 1) from the resampled model.
# These overwrite the baseline model's y_pred / y_pred_prob.
y_pred = model_res.predict(X_test)
class_probabilities = model_res.predict_proba(X_test)
y_pred_prob = [row[1] for row in class_probabilities]

print("Y predicted: ", y_pred)
print("Y probability predicted: ", y_pred_prob[:5])

In [None]:
# Confusion matrix of the resampled model on the original (non-resampled) test set.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer environments need ConfusionMatrixDisplay.from_estimator — confirm the installed version.
print("Confusion Matrix:")
plot_confusion_matrix(model_res, X_test, y_test);

In [None]:
# Evaluate the resampled model with the evaluation_metric helper defined in the earlier
# evaluation cell; redefining an identical function here only shadowed the first one.
# y_pred and y_pred_prob are the resampled model's predictions computed above.
evaluation_metric(model_res, X_test, y_test, y_pred, y_pred_prob)

##### Indeed the resampled model improved.

In [None]:
# Raw importance scores of the resampled forest, in the column order of the training features.
model_res.feature_importances_

In [None]:
# Label each importance of the resampled model with its feature name and plot them in ascending order.
feature_names = df_enc.columns.drop('stroke')
feat = pd.Series(data=model_res.feature_importances_, index=feature_names)
feat.sort_values().plot.barh(title='Feature Importance');

In [None]:
import shap

In [None]:
# SHAP values of the baseline forest for every row of the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

In [None]:
# Bar summary of the SHAP values per feature for the baseline model.
shap.summary_plot(shap_values, X_test, plot_type='bar')

In [None]:
# Explain the stroke class (index 1) rather than index 0, which is the "no stroke" class.
# NOTE(review): older SHAP releases return a list with one array per class, newer ones a single
# (samples, features, classes) array; both layouts are handled here — confirm against the installed version.
if isinstance(shap_values, list):
    shap_values_stroke = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_stroke = shap_values[..., 1]
else:
    shap_values_stroke = shap_values
shap.summary_plot(shap_values_stroke, X_test, plot_type="layered_violin", color='coolwarm')

In [None]:
# Explain the stroke class (index 1) rather than index 0, which is the "no stroke" class.
# NOTE(review): older SHAP releases return a list with one array per class, newer ones a single
# (samples, features, classes) array; both layouts are handled here — confirm against the installed version.
if isinstance(shap_values, list):
    shap_values_stroke = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_stroke = shap_values[..., 1]
else:
    shap_values_stroke = shap_values
shap.summary_plot(shap_values_stroke, X_test, plot_type="dot", color='coolwarm')

In [None]:
# SHAP values of the SMOTE-trained forest for every row of the original test set.
explainer_res = shap.TreeExplainer(model_res)
shap_values_res = explainer_res.shap_values(X_test)

In [None]:
# Bar summary of the SHAP values per feature for the resampled model.
shap.summary_plot(shap_values_res, X_test, plot_type='bar')

In [None]:
# Explain the stroke class (index 1) rather than index 0, which is the "no stroke" class.
# NOTE(review): older SHAP releases return a list with one array per class, newer ones a single
# (samples, features, classes) array; both layouts are handled here — confirm against the installed version.
if isinstance(shap_values_res, list):
    shap_values_res_stroke = shap_values_res[1]
elif np.ndim(shap_values_res) == 3:
    shap_values_res_stroke = shap_values_res[..., 1]
else:
    shap_values_res_stroke = shap_values_res
shap.summary_plot(shap_values_res_stroke, X_test, plot_type="layered_violin", color='coolwarm')

In [None]:
# Explain the stroke class (index 1) rather than index 0, which is the "no stroke" class.
# NOTE(review): older SHAP releases return a list with one array per class, newer ones a single
# (samples, features, classes) array; both layouts are handled here — confirm against the installed version.
if isinstance(shap_values_res, list):
    shap_values_res_stroke = shap_values_res[1]
elif np.ndim(shap_values_res) == 3:
    shap_values_res_stroke = shap_values_res[..., 1]
else:
    shap_values_res_stroke = shap_values_res
shap.summary_plot(shap_values_res_stroke, X_test, plot_type="dot", color='coolwarm')