In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The data is highly imbalanced: only about 4% of patients have target (stroke) = 1, while about 95% have target (stroke) = 0.

A trivial predictor that always outputs stroke = 0 achieves an accuracy of 94.6%, which looks like good performance at first; however, this trivial predictor is completely useless because it has no discriminatory power at all.

I will be using the SMOTE algorithm (and some of its variants) in this notebook to balance the data.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTEENN

In [None]:
# Read the stroke dataset from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Print column names, non-null counts and dtypes for the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Remove the 'id' column from the frame.
# `columns=` replaces the positional axis argument (`df.drop('id', 1)`), which
# pandas 2.0 no longer accepts. Rebinding instead of `inplace=True`, together
# with `errors='ignore'`, keeps this cell safe to re-run after the column is gone.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Bar chart of the target classes: bar height is the class count, the label
# above each bar is that class's share of all rows in percent.
stroke_counts = df['stroke'].value_counts()
stroke_share = stroke_counts / len(df['stroke']) * 100
fig = px.bar(
    x=stroke_counts.index,
    y=stroke_counts,
    text=stroke_share,
    height=500,
    width=500,
)
fig.update_traces(
    textposition='outside',
    texttemplate='%{text:.4s}%',
    marker_color=['peachpuff', 'silver'],
)
fig.show()

In [None]:
# Density-scaled histogram of age with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with these arguments is the
# replacement recommended by seaborn for the same kind of figure.
sns.histplot(x=df['age'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Interactive histogram of age using one-unit-wide bins.
age_trace = go.Histogram(x=df['age'], xbins={'size': 1}, opacity=1)

age_layout = dict(
    title_text='Age Distribution',
    xaxis_title='Age',
    yaxis_title='Count',
    bargap=0.05,
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='ggplot2',
    height=600,
    width=1000,
)

fig = go.Figure()
fig.add_trace(age_trace)
fig.update_layout(**age_layout)
fig.show()


In [None]:
def _share_bar(column, colors):
    """Build a bar trace for one categorical column of ``df``.

    Bar heights are the category counts; the text labels carry each category's
    share of all rows in percent. ``colors`` is applied bar by bar.
    """
    counts = df[column].value_counts()  # computed once instead of three times
    return go.Bar(x=counts.index, y=counts,
                  text=counts / len(df[column]) * 100,
                  marker_color=colors)


trace1 = _share_bar('gender', ['lightpink', 'lightblue', 'grey'])
trace2 = _share_bar('hypertension', ['plum', 'papayawhip'])
trace3 = _share_bar('heart_disease', ['mediumturquoise', 'lightgreen'])
trace4 = _share_bar('ever_married', ['seagreen', "rgb(114, 78, 145)"])
# NOTE(review): work_type presumably has five categories in this dataset, so a
# fifth colour is supplied; an unused extra colour is harmless — confirm.
trace5 = _share_bar('work_type', ['rgb(56, 75, 126)', 'rgb(18, 36, 37)', 'rgb(34, 53, 101)',
                                  'rgb(33, 75, 99)', 'rgb(79, 129, 102)'])
trace6 = _share_bar('Residence_type', ['palegreen', 'olive'])

# 3x2 grid; titles follow the row-major order in which the traces are placed.
fig = make_subplots(rows=3, cols=2, specs=[[{'type':'bar'},{'type':'bar'}],
                                          [{'type':'bar'},{'type':'bar'}],
                                          [{'type':'bar'},{'type':'bar'}]],
                   subplot_titles = ['Gender Distribution','Hypertension Distribution','Heart Disease Distribution','Married VS Single',
                   'Work Type Distribution','Resident Type Distribution'])

# add_trace(row=, col=) replaces the deprecated append_trace.
all_traces = [trace1, trace2, trace3, trace4, trace5, trace6]
for position, trace in enumerate(all_traces):
    fig.add_trace(trace, row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(height=1500, width=1000, title='Stroke Prediction Discrete Feature Analysis')
fig.update_traces(textposition='outside', texttemplate='%{text: .3s}%')
fig.show()

In [None]:
# Interactive histogram of the average glucose level using one-unit-wide bins.
glucose_trace = go.Histogram(x=df['avg_glucose_level'], xbins={'size': 1}, opacity=1)

glucose_layout = dict(
    title_text='AVG_Glucose_Level',
    xaxis_title='AVG_Glucose_Level',
    yaxis_title='Count',
    bargap=0.05,
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='seaborn',
    height=600,
    width=1000,
)

fig = go.Figure()
fig.add_trace(glucose_trace)
fig.update_layout(**glucose_layout)
fig.show()


In [None]:

# Replace missing BMI values with the column mean.
# Assigning the result back is required: calling fillna(..., inplace=True) on
# the `df['bmi']` selection is deprecated and, under pandas copy-on-write,
# no longer modifies `df` at all.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Interactive histogram of BMI using one-unit-wide bins.
bmi_trace = go.Histogram(x=df['bmi'], xbins={'size': 1}, opacity=1)

bmi_layout = dict(
    title_text='Body Mass Index',
    xaxis_title='BMI',
    yaxis_title='Count',
    bargap=0.05,
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='plotly_white',
    height=600,
    width=1000,
)

fig = go.Figure()
fig.add_trace(bmi_trace)
fig.update_layout(**bmi_layout)
fig.show()


In [None]:
# Labels of every column whose dtype is not `object`.
# NOTE(review): this presumably still includes the 'stroke' target and the
# 0/1 flag columns, so the later "numerical VS stroke" loop would also plot
# stroke against itself — confirm.
numerical = df.select_dtypes(exclude=['object']).columns
numerical

In [None]:
# Collect the labels of all columns that are neither int64 nor float64;
# these are treated as the categorical features in the cells that follow.
categorical=df.select_dtypes(exclude=['int64','float64']).columns
categorical

In [None]:
# One count plot per categorical feature, with bars split by stroke outcome.
color=['lightblue','navy']
for i in categorical:
    sns.countplot(x=df[i], hue=df['stroke'], palette=color)
    # Title typo fixed: previously read 'Stoke'.
    plt.title('Categorical Features VS Stroke', fontsize=17)
    plt.xlabel(i, fontsize=12)
    plt.ylabel('Count')
    plt.show()

In [None]:
# One 20-bin histogram per non-object column, coloured by stroke outcome.
for column in numerical:
    fig = px.histogram(
        x=df[column],
        color=df['stroke'],
        nbins=20,
        title=f'{column} VS Stroke',
        height=500,
        width=800,
    )
    fig.show()

In [None]:
# Replace each categorical column with integer codes; pd.factorize numbers the
# distinct values in order of first appearance.
for column in categorical:
    codes, _uniques = pd.factorize(df[column])
    df[column] = codes

In [None]:
# Preview the first three rows after the categorical columns were integer-encoded.
df.head(3)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Split the frame into a feature matrix (every column except the target) and
# the target vector, then report both shapes.
feature_frame = df.drop(columns=['stroke'])
x = feature_frame.values
y = df['stroke'].values
for array in (x, y):
    print(array.shape)

# Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (shuffled, fixed seed).
# NOTE(review): the split is not stratified on y; with a rare positive class
# the test share of positives may drift — consider `stratify=y`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=20,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Baseline random forest trained on the original (imbalanced) training split.
# A fixed random_state makes the fitted forest, and every metric reported for
# it below, reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=20)

rf.fit(x_train, y_train)

In [None]:
# Test-set accuracy of the random forest, reported as a percentage.
rf_test_pred = rf.predict(x_test)
rf_accuracy_pct = accuracy_score(y_test, rf_test_pred) * 100
print('Accuracy score of Random Forest is: ', rf_accuracy_pct, '%')

In [None]:
# Confusion matrix of the random forest on the test set; the stroke class (1)
# is listed first.
rf_test_pred = rf.predict(x_test)
cf = metrics.confusion_matrix(y_test, rf_test_pred, labels=[1, 0])
sns.heatmap(cf, annot=True)

In [None]:
# Per-class precision/recall/F1 for the random forest, followed by overall
# accuracy and recall in percent.
rf_test_pred = rf.predict(x_test)
print(metrics.classification_report(y_test, rf_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test, rf_test_pred) * 100, '%')

# XGBClassifier

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default hyper-parameters.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases —
# confirm whether the installed version still needs or accepts it.
xgb=XGBClassifier(use_label_encoder=False)

In [None]:
# Train XGBoost on the original training split and report its test accuracy in percent.
xgb.fit(x_train, y_train)
xgb_test_pred = xgb.predict(x_test)
xgb_accuracy_pct = accuracy_score(y_test, xgb_test_pred) * 100
print('Accuracy of XGBClassifier is: ', xgb_accuracy_pct, '%')

# Confusion Matrix

In [None]:
# Confusion matrix of XGBoost on the test set; the stroke class (1) is listed first.
xgb_test_pred = xgb.predict(x_test)
cf = metrics.confusion_matrix(y_test, xgb_test_pred, labels=[1, 0])
sns.heatmap(cf, annot=True)

In [None]:
# Per-class precision/recall/F1 for XGBoost, followed by overall accuracy and
# recall in percent.
xgb_test_pred = xgb.predict(x_test)
print(metrics.classification_report(y_test, xgb_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test, xgb_test_pred) * 100, '%')

# LGBMCLASSIFIER

In [None]:
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, trained on the original
# (un-resampled) training split.
lgbm=LGBMClassifier()
lgbm.fit(x_train, y_train)

In [None]:
# Test-set accuracy of the baseline LightGBM model, reported as a percentage.
lgbm_test_pred = lgbm.predict(x_test)
lgbm_accuracy_pct = accuracy_score(y_test, lgbm_test_pred) * 100
print('Accuracy of LGBMClassifier is: ', lgbm_accuracy_pct, '%')

In [None]:
# Confusion matrix of the LightGBM model on the test set; the stroke class (1)
# is listed first. The predictions must come from `lgbm` — this cell used
# `rf.predict`, which re-plotted the random forest's matrix.
cf=metrics.confusion_matrix(y_test,lgbm.predict(x_test), labels=[1,0])
sns.heatmap(cf,annot=True)

In [None]:
# Per-class precision/recall/F1 for the baseline LightGBM model, followed by
# overall accuracy and recall in percent.
lgbm_test_pred = lgbm.predict(x_test)
print(metrics.classification_report(y_test, lgbm_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test, lgbm_test_pred) * 100, '%')

# SMOTE

SMOTE is an oversampling technique that generates synthetic samples from the minority class.

In [None]:
# Class counts in the full target vector before applying SMOTE.
# Both counters are derived from y directly; the previous loop started
# zero_count at 1 and therefore over-reported the majority class by one.
one_count = sum(1 for label in y if label == 1)
zero_count = len(y) - one_count  # every label that is not 1
print('The number of one are: ',one_count)
print('The number of zero are: ', zero_count)

In [None]:
# Fresh 80/20 shuffled split (seed 22) for the SMOTE experiment.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=22,
)

In [None]:
# Oversample the training split with SMOTE (fixed seed 101); the test split is
# not passed in and stays untouched.
oversample = SMOTE(random_state=101)
# Rebinds x_train1/y_train1 to the resampled arrays, so re-running this cell
# resamples data that has already been resampled.
x_train1, y_train1 = oversample.fit_resample(x_train1,y_train1)

In [None]:
# Class counts in the training labels after applying SMOTE.
one_count = sum(1 for label in y_train1 if label == 1)
zero_count = len(y_train1) - one_count  # everything that is not 1
print('The number of one are: ', one_count)
print('The number of zero are: ', zero_count)

After applying SMOTE, we can see that the numbers of ones and zeros in the training data are now balanced.

In [None]:
# New LightGBM classifier (default hyper-parameters) trained on the
# SMOTE-resampled training data; rebinds the global `lgbm`.
lgbm = LGBMClassifier()
lgbm.fit(x_train1, y_train1)

In [None]:
# Test-set accuracy of the SMOTE-trained LightGBM model, reported as a percentage.
smote_test_pred = lgbm.predict(x_test1)
smote_accuracy_pct = accuracy_score(y_test1, smote_test_pred) * 100
print('Accuracy of LGBMClassifier is: ', smote_accuracy_pct, '%')

# **Confusion Matrix**

In [None]:
# Confusion matrix of the SMOTE-trained LightGBM model; the stroke class (1) is listed first.
smote_test_pred = lgbm.predict(x_test1)
cf = metrics.confusion_matrix(y_test1, smote_test_pred, labels=[1, 0])
sns.heatmap(cf, annot=True)

In [None]:

# Per-class precision/recall/F1 for the SMOTE-trained LightGBM model, followed
# by overall accuracy and recall in percent.
smote_test_pred = lgbm.predict(x_test1)
print(metrics.classification_report(y_test1, smote_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test1, smote_test_pred) * 100, '%')

# SMOTEENN

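SMOTEENN combines over- and under-sampling: it first applies SMOTE to oversample the minority class and then uses Edited Nearest Neighbours (ENN) to remove noisy samples, which downsamples the majority class.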

In [None]:
# Fresh 80/20 shuffled split (seed 22) for the SMOTEENN experiment.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=22,
)

In [None]:
# Fresh LightGBM classifier for the SMOTEENN experiment.
# It is deliberately not fitted here: the model is trained further down, after
# the training split has been resampled. The earlier fit on the un-resampled
# data was never evaluated and was simply replaced by that later fit.
lgbm = LGBMClassifier()

In [None]:
# Resample the training split with SMOTEENN; the test split is left untouched.
# The fixed seed keeps the resampled data, and every score derived from it,
# reproducible across runs.
sme = SMOTEENN(random_state=101)
x_train2, y_train2 = sme.fit_resample(x_train2,y_train2)

In [None]:
# Train the LightGBM model on the SMOTEENN-resampled training data.
lgbm.fit(x_train2,y_train2)

In [None]:
# Test-set accuracy of the SMOTEENN-trained LightGBM model, reported as a percentage.
smoteenn_test_pred = lgbm.predict(x_test2)
smoteenn_accuracy_pct = accuracy_score(y_test2, smoteenn_test_pred) * 100
print('Accuracy of LGBMClassifier is: ', smoteenn_accuracy_pct, '%')

In [None]:
# Confusion matrix of the SMOTEENN-trained LightGBM model; the stroke class (1) is listed first.
smoteenn_test_pred = lgbm.predict(x_test2)
cf = metrics.confusion_matrix(y_test2, smoteenn_test_pred, labels=[1, 0])
sns.heatmap(cf, annot=True)

In [None]:
# Per-class precision/recall/F1 for the SMOTEENN-trained LightGBM model,
# followed by overall accuracy and recall in percent.
smoteenn_test_pred = lgbm.predict(x_test2)
print(metrics.classification_report(y_test2, smoteenn_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test2, smoteenn_test_pred) * 100, '%')

# Borderline SMOTE and RandomUnderSampling

Borderline SMOTE oversamples only the minority class, but it generates the synthetic samples from minority examples that lie close to the decision boundary, i.e. the region where neighbouring majority examples cause misclassification.

RandomUnderSampling randomly selects examples from the majority class and deletes them from the training dataset.

In [None]:
# Fresh 80/20 shuffled split (seed 22) for the Borderline SMOTE + undersampling experiment.
x_train3, x_test3, y_train3, y_test3 = train_test_split(
    x,
    y,
    random_state=22,
    shuffle=True,
    test_size=0.2,
)

In [None]:
# Class counts in the training labels before resampling.
one_count = sum(1 for label in y_train3 if label == 1)
zero_count = len(y_train3) - one_count  # everything that is not 1
print('The number of 0 are:', zero_count)
print('The number of 1 are:', one_count)

In [None]:
# Two-stage resampling: Borderline SMOTE first oversamples the minority class
# (sampling_strategy=0.3), then random undersampling trims the majority class
# (sampling_strategy=0.6). Both samplers are seeded so that the resampled
# training set, and every score derived from it, is reproducible.
over = BorderlineSMOTE(sampling_strategy=0.3, random_state=101)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=101)
steps = [('o', over), ('u', under)]

In [None]:
# Chain the oversampling and undersampling steps and apply them to the
# training split only; the test split is not passed in.
pipeline = Pipeline(steps=steps)
# Rebinds x_train3/y_train3 to the resampled arrays, so re-running this cell
# resamples data that has already been resampled.
x_train3, y_train3 = pipeline.fit_resample(x_train3, y_train3)

In [None]:
# Class counts in the training labels after the resampling pipeline.
one_count = sum(1 for label in y_train3 if label == 1)
zero_count = len(y_train3) - one_count  # everything that is not 1
print('The number of 0 are:', zero_count)
print('The number of 1 are:', one_count)

In [None]:
# New LightGBM classifier (default hyper-parameters) trained on the training
# data produced by the resampling pipeline; rebinds the global `lgbm`.
lgbm = LGBMClassifier()
lgbm.fit(x_train3, y_train3)

In [None]:
# Test-set accuracy of the pipeline-trained LightGBM model, reported as a percentage.
pipeline_test_pred = lgbm.predict(x_test3)
pipeline_accuracy_pct = accuracy_score(y_test3, pipeline_test_pred) * 100
print('Accuracy of LGBMClassifier is: ', pipeline_accuracy_pct, '%')

In [None]:
# Confusion matrix of the pipeline-trained LightGBM model; the stroke class (1) is listed first.
pipeline_test_pred = lgbm.predict(x_test3)
cf = metrics.confusion_matrix(y_test3, pipeline_test_pred, labels=[1, 0])
sns.heatmap(cf, annot=True)

In [None]:
# Per-class precision/recall/F1 for the pipeline-trained LightGBM model,
# followed by overall accuracy and recall in percent.
pipeline_test_pred = lgbm.predict(x_test3)
print(metrics.classification_report(y_test3, pipeline_test_pred, labels=[1, 0]))
for label, scorer in (('Accuracy_Score:', accuracy_score), ('Recall:', metrics.recall_score)):
    print(label, scorer(y_test3, pipeline_test_pred) * 100, '%')