In [None]:
# --- Core data / plotting stack ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# NOTE(review): this silences *every* warning for the whole notebook, including
# deprecation warnings from seaborn/pandas/scikit-learn that signal upcoming breakage.
import warnings
warnings.filterwarnings('ignore')

# --- Modelling stack (scikit-learn, boosting libraries, imbalanced-learn) ---
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
# NOTE(review): DecisionTreeClassifier is already imported above; this line is a duplicate.
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): plot_roc_curve and plot_precision_recall_curve were removed in
# scikit-learn 1.2 (replaced by RocCurveDisplay / PrecisionRecallDisplay); this
# import only succeeds on older releases -- confirm the pinned version.
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,plot_roc_curve,auc,precision_recall_curve,plot_precision_recall_curve,average_precision_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
# This is imblearn's make_pipeline (not sklearn's), so samplers such as SMOTE can be pipeline steps.
from imblearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): plot_confusion_matrix was likewise removed in scikit-learn 1.2
# (ConfusionMatrixDisplay, imported above, is the replacement).
from sklearn.metrics import plot_confusion_matrix

# --- Mapping / interactive plotting ---
import folium
from folium.plugins import HeatMap
import plotly.express as px

# --- Notebook-wide display configuration ---
plt.style.use('fivethirtyeight')
%matplotlib inline
# Show up to 32 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 32)

In [None]:
# Load the stroke-prediction CSV (path is relative to the notebook's working
# directory) into `stroke_df`, then preview the first rows.
stroke_df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
stroke_df.head()

In [None]:
# Summary statistics of the raw frame.
stroke_df.describe()

In [None]:
# Column dtypes and non-null counts of the raw frame.
stroke_df.info()

In [None]:
# List the distinct values of every column except the first one.
for column_name in stroke_df.columns[1:]:
    distinct_values = stroke_df[column_name].unique()
    print(f"Unique data in {column_name} is: {distinct_values}")

In [None]:
# Missing-value summary per column: absolute count and percentage of all rows.
null_counts = stroke_df.isnull().sum()
null_share_pct = ((null_counts / stroke_df.shape[0]) * 100).round(3)
null_df = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Missing (%)': null_share_pct,
})
null_df

In [None]:
# (rows, columns) of the raw frame.
stroke_df.shape

# EDA

## Distribution of stroke vs. non-stroke cases

In [None]:
# Class balance of the target: number of rows per value of `stroke`.
# The column is named via x=/data= instead of being passed positionally:
# newer seaborn releases take `data` as the first positional argument and make
# the plot variables keyword-only, so a positional Series is no longer read as x.
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(x='stroke', data=stroke_df, palette="Set2", ax=ax)
ax.set_title("Distribution of stroke")


## As we can see, the dataset is strongly imbalanced; we will deal with that later.

In [None]:
# Number of rows per gender category.
stroke_df['gender'].value_counts()

In [None]:
# Number of rows per gender.
# The column is named via x=/data= instead of being passed positionally:
# newer seaborn releases take `data` as the first positional argument and make
# the plot variables keyword-only, so a positional Series is no longer read as x.
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(x='gender', data=stroke_df, palette="Set2", ax=ax)
ax.set_title("Distribution of gender")


# There are more females than males in this dataset. Let's see which group has more strokes.

In [None]:
# Stroke outcome counts, split by gender.
# The original passed the `stroke` Series positionally *and* data=stroke_df; on
# seaborn releases where the first positional argument is `data`, that supplies
# `data` twice and fails. Naming the column with x= works on old and new versions.
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(x='stroke', hue='gender', data=stroke_df, palette="Set2", ax=ax)
ax.set_title("Distribution of gender vs stroke")


## It is unclear why there is only one record with gender "Other"; for simplicity, we will just drop that row.

In [None]:
# Age distribution per gender, one violin per category, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(data=stroke_df, x='gender', y='age', palette="Set2", ax=ax)
ax.set_title("Distribution of age vs gender")


# The age distribution is essentially the same across genders.

In [None]:
# Work-type counts within each gender, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=stroke_df, x='gender', hue='work_type', palette="Set2", ax=ax)
ax.set_title("Distribution of work type vs gender")

In [None]:
# Work-type counts within each stroke outcome, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=stroke_df, x='stroke', hue='work_type', palette="Set2", ax=ax)
ax.set_title("Distribution of work type vs stroke")

In [None]:
# BMI distribution per gender, one violin per category.
# The title previously read "age vs bmi", which did not match the plotted
# variables (y='bmi', x='gender').
fig, ax = plt.subplots(figsize=(15,10))
sns.violinplot(y='bmi', x='gender', data=stroke_df, palette="Set2", ax=ax)
ax.set_title("Distribution of bmi vs gender")


In [None]:
# Side-by-side horizontal box plots of age and BMI to eyeball outliers.
# - The vector is passed as x= (newer seaborn no longer reads a positional
#   Series as the x variable).
# - Each plot targets an explicit axes instead of the pyplot state machine.
# - tight_layout runs once after the loop rather than on every iteration.
fig, axes = plt.subplots(1, 2, figsize=(10,6))
for ax, col in zip(axes, ['age', 'bmi']):
    sns.boxplot(x=stroke_df[col], ax=ax)
    ax.set_title(f"Box plot of {col}")
fig.tight_layout()

In [None]:
# Split the rows by outcome; `stroke_0` / `stroke_1` are reused by later cells.
stroke_0 = stroke_df[stroke_df['stroke'] == 0]
stroke_1 = stroke_df[stroke_df['stroke'] == 1]

# Overlay the BMI distributions of both groups on one axes.
# Each series gets a label so that ax.legend() has entries to show -- the
# original drew no labelled artists, so the legend came out empty.
# kde is passed as a real boolean (the string "true" only worked by being truthy).
# NOTE(review): sns.distplot is deprecated in seaborn; kept here so the figure
# stays unchanged -- consider migrating to histplot(stat="density", kde=True).
fig, ax = plt.subplots(figsize=(15,10))
sns.distplot(stroke_0['bmi'], kde=True, label='no stroke', ax=ax)
sns.distplot(stroke_1['bmi'], kde=True, label='stroke', ax=ax)
ax.set_title("Bmi distribution")
ax.legend(loc='upper right')


## People who suffered a stroke have a slightly higher average BMI than those who did not.

In [None]:
labels=['no stroke','stroke']
stroke_0 = stroke_df[stroke_df['stroke'] == 0]
stroke_1 = stroke_df[stroke_df['stroke'] == 1]

# Overlay the average-glucose distributions of both groups on one axes.
# The labels are attached to the plotted series themselves. Calling
# ax.legend(labels=...) without handles pairs the labels with the first artists
# found on the axes, which are not guaranteed to be one per group, so the
# legend colours could both come from the first histogram.
# kde is passed as a real boolean (the string "true" only worked by being truthy).
# NOTE(review): sns.distplot is deprecated in seaborn; kept so the figure stays unchanged.
fig, ax = plt.subplots(figsize=(15,10))
sns.distplot(stroke_0['avg_glucose_level'], kde=True, color='blue', label=labels[0], ax=ax)
sns.distplot(stroke_1['avg_glucose_level'], kde=True, color='green', label=labels[1], ax=ax)
ax.set_title("Average glucose distribution")
ax.legend(loc='upper right')

## People who suffered a stroke more often have a very high average glucose level, especially above 170. However, people with a low glucose level still suffer from strokes.

In [None]:
# Stroke outcome counts per residence type.
labels = ['Not stroke', 'Stroke']
fig, ax = plt.subplots(figsize=(15,10))
sns.countplot(x='Residence_type', hue='stroke', data=stroke_df, palette="Set2", ax=ax)
# The title previously said "work type" (copy-paste from an earlier cell).
ax.set_title("Distribution of residence type vs stroke")
# Relabel the hue legend using the handles seaborn registered for each hue
# level. Passing only labels= lets matplotlib pick the first artists on the
# axes, which can both belong to the same hue level.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)

## Residence type carries little information: the number of people who suffered a stroke is roughly equal in both groups.

In [None]:
# Scatter BMI against average glucose level, coloured by stroke outcome.
bmi_avg_glucose_df = stroke_df.loc[:, ['bmi', 'avg_glucose_level', 'stroke']]
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=bmi_avg_glucose_df,
    x='bmi',
    y='avg_glucose_level',
    hue='stroke',
    palette="Set2",
    ax=ax,
)
ax.set_title("Distribution of bmi and avg_glucose vs stroke")

In [None]:
# One figure per comorbidity flag: stroke outcome counts for each flag value.
# Dropped the unused enumerate index, drew on the explicit axes, and gave each
# figure a title so it can be told apart when skimming the notebook.
for column in ['hypertension', 'heart_disease']:
    fig, ax = plt.subplots(figsize=(15,10))
    sns.countplot(x=column, hue='stroke', data=stroke_df, ax=ax)
    ax.set_title(f"Distribution of {column} vs stroke")


## From the BMI vs. average glucose scatter plot above, we can see that BMI does not contribute as much to stroke as average glucose level: most people who suffered a stroke have a relatively high average glucose level, whereas their BMI is relatively low.

In [None]:
# Age distribution per stroke outcome, one violin per class.
fig, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(data=stroke_df, x='stroke', y='age', palette="Set2", ax=ax)
ax.set_title("Distribution of age vs stroke")

## It is easy to see that most people who suffered a stroke are more than 60 years old — no surprise at all!

In [None]:
# Smoking-status counts among stroke patients only (`stroke_1` holds the stroke == 1 rows).
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=stroke_1, x='smoking_status', palette="Set2", ax=ax)
ax.set_title('Smoking status vs stroke')

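## This result looks odd at first: more than 70 stroke patients used to smoke, compared with 44 who still smoke and 90 who never smoked. Note that these are raw counts among stroke patients, not stroke rates, so they partly reflect how many people fall into each smoking category.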

## Heatmap

In [None]:
# Pearson Correlation Heatmap
# Correlate numeric columns only: pandas >= 2.0 no longer silently drops string
# columns (gender, work_type, ...) in DataFrame.corr() and raises instead.
# select_dtypes keeps the cell working on both old and new pandas.
numeric_df = stroke_df.select_dtypes(include='number')
plt.figure(figsize=(15,8))

sns.heatmap(numeric_df.corr(method = 'pearson'), vmax=1, center=0, annot = True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
# Spearman (rank) Correlation Heatmap
# (The header comment previously said "Pearson" although method='spearman' is used.)
# Correlate numeric columns only: pandas >= 2.0 no longer silently drops string
# columns in DataFrame.corr() and raises instead.
numeric_df = stroke_df.select_dtypes(include='number')
plt.figure(figsize=(15,8))

sns.heatmap(numeric_df.corr(method = 'spearman'), vmax=1, center=0, annot = True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

# Preprocessing

In [None]:
# Drop the identifier column. errors='ignore' keeps the cell re-runnable: the
# original in-place drop raised KeyError when executed a second time.
stroke_df = stroke_df.drop(columns='id', errors='ignore')

# Remove the row(s) whose gender is 'Other'; .copy() gives an independent frame
# so the column assignment below does not act on a view of the filtered data.
stroke_df = stroke_df[stroke_df['gender'] != 'Other'].copy()

# Mean-impute missing BMI by assigning the result back. The original chained
# `stroke_df.bmi.fillna(..., inplace=True)` mutates an intermediate Series and
# does not reliably update the frame (it is a no-op under pandas copy-on-write).
# NOTE(review): the mean is computed on the full data set before the train/test
# split, so test rows influence the imputed value -- consider imputing inside
# the modelling pipeline instead.
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())

In [None]:
# Integer-encode every categorical column with one shared LabelEncoder, refitted
# per column in the order listed (so `le` ends up fitted on 'smoking_status').
le = LabelEncoder()
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
for column_name in categorical_columns:
    stroke_df[column_name] = le.fit_transform(stroke_df[column_name])

In [None]:
# Preview the frame after preprocessing and label encoding.
stroke_df.head()

In [None]:
# Target vector: the `stroke` column.
y = stroke_df['stroke']

In [None]:
# Feature matrix: every column except the target.
X = stroke_df.drop(columns=['stroke'])
X

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same stroke / no-stroke ratio. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

In [None]:

# DISABLED EXPERIMENT (kept for reference, not executed): random-forest grid
# search using a MinMaxScaler + SMOTE pipeline, scored on recall with 5-fold
# stratified CV.
# pipeline = make_pipeline(MinMaxScaler(),SMOTE(random_state=11), 
#                               RandomForestClassifier(random_state=11))
# params ={'n_estimators': np.arange(10,500,10),
#  'max_depth': np.arange(4,50,2)}
# new_params = {'randomforestclassifier__' + key: params[key] for key in params}
# stratified_kfold = StratifiedKFold(n_splits=5, shuffle = True, random_state=11)
# grid_search = GridSearchCV(estimator=pipeline, param_grid=new_params, scoring='recall', cv= stratified_kfold, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# cv_score = grid_search.best_score_
# test_score = grid_search.score(X_test,y_test)
# print(cv_score,test_score)

In [None]:
# DISABLED (not executed): test-set recall for the random-forest search.
# y_test_predict = grid_search.predict(X_test)
# recall_score(y_test, y_test_predict)

In [None]:
# Tune an SVC on recall with 5-fold stratified cross-validation.
# Scaling and SMOTE are steps of the imblearn pipeline, so GridSearchCV refits
# them together with the classifier on each training fold.
pipeline = make_pipeline(MinMaxScaler(), SMOTE(random_state=11),
                         SVC(random_state=11))

c_values = [0.001,0.1, 0.01, 1, 2, 3, 4, 5, 10, 100]
gamma_values = [10, 5, 1,  0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001]

# Two sub-grids instead of one Cartesian product:
# - 'kernel' is not a valid SVC kernel name; the original grid spent a third of
#   its fits on candidates that could only fail, so it is removed. Add 'poly'
#   or 'sigmoid' to the second sub-grid to explore further kernels.
# - gamma has no effect on the linear kernel, so it is swept for 'rbf' only,
#   which avoids refitting identical linear models ten times.
params = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'gamma': gamma_values, 'kernel': ['rbf']},
]
# make_pipeline names the final step 'svc'; prefix each key so it is routed to that step.
new_params = [{'svc__' + key: grid[key] for key in grid} for grid in params]

stratified_kfold = StratifiedKFold(n_splits=5, shuffle = True, random_state=11)
grid_search = GridSearchCV(estimator=pipeline, param_grid=new_params, scoring='recall', cv= stratified_kfold, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best mean cross-validated recall vs. recall of the refitted best model on the hold-out set.
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test,y_test)
print(cv_score,test_score)

In [None]:
# Evaluate the best estimator found by the grid search on the hold-out set:
# winning hyper-parameters, per-class report, and recall as the cell's output.
y_test_predict = grid_search.predict(X_test)
print(grid_search.best_params_)
print(classification_report(y_test, y_test_predict))
recall_score(y_test, y_test_predict)