Within this notebook, there is some basic exploratory data analysis and modelling with the use of multiple oversampling techniques.
Feedback, criticism and comments are much appreciated.

Attribute Info:
1. id: unique identifier
2. gender: "Male", "Female" or "Other"
3. age: age of the patient
4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6. ever_married: "No" or "Yes"
7. work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
8. Residence_type: "Rural" or "Urban"
9. avg_glucose_level: average glucose level in blood
10. bmi: body mass index
11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12. stroke: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# data prep
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# under/over sampling
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN

# modelling
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_predict

# classification metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the stroke-prediction CSV inside the Kaggle input directory.
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# The id column is only a row identifier, so it is removed before analysis.
df = df.drop(columns=['id'])

In [None]:
# How many rows carry the placeholder smoking status 'Unknown'?
(df['smoking_status'] == 'Unknown').sum()

Since 'Unknown' smoking status is a large portion of the data, we will not drop it.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

The minimum age is 0.08. Let's investigate this.

In [None]:
# Show the first five patients whose recorded age is below one year.
df.loc[df['age'].lt(1)].head()

Are these patients truly younger than 1 year when this data was recorded? I don't know too much about how the data was collected and if these are misinputs, but these values could be true because their work type is 'children'

# Exploratory Data Analysis

# Target Variable

In [None]:
# Two views of the target balance: share of each class (pie) and raw counts (bars).
fig, ax = plt.subplots(1, 2, figsize=(12,8))
pie_ax, count_ax = ax

stroke_counts = df['stroke'].value_counts()
stroke_counts.plot.pie(explode=[0,0.1], autopct='%1.1f%%', ax=pie_ax)
sns.countplot(x='stroke', data=df, ax=count_ax)

# The pie chart would otherwise show the series name as its y-label.
pie_ax.set_ylabel('')
for panel in (pie_ax, count_ax):
    panel.set_title('Stroke')

plt.show()

Note that we are dealing with an imbalanced data set: the ratio of stroke to no-stroke patients is about 5:95, i.e. 1:19.

# Correlation

In [None]:
# Label-encode the string-valued columns on a copy so they can enter the correlation matrix.
dfcorr = df.copy()
label_encoded = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column in label_encoded:
    dfcorr[column] = dfcorr[column].astype('category').cat.codes

corr = dfcorr.corr()

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10,8))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink":.5},
)
plt.title('Heatmap of all variables')
plt.show()

The lifestyle variables, residence_type, work_type and smoking_status don't correlate well with stroke. However, smoking_status has a lot of unknown values. Smoking_status will be investigated further in the notebook.

In [None]:
# Correlation of the numeric (health-related) columns only.
# Selecting numeric dtypes explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer silently drops string columns and raises instead.
corr2 = df.select_dtypes(include='number').corr()

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr2, dtype=bool))

fig, ax = plt.subplots(figsize=(10,8))

cmap= sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(corr2, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink":.5}, ax=ax)

plt.title('Heatmap with only health-related variables')

plt.show()

It seems that age is correlated with every other variable in this heatmap.

# Gender

In [None]:
# Number of records per gender (counted on the stroke column).
df.groupby('gender')[['stroke']].count()

Let's remove 'Other', since there is only 1 entry.

In [None]:
# Drop the single 'Other' record. The explicit copy makes df an independent frame,
# so later in-place assignments (e.g. relabelling smoking_status) are unambiguous
# and do not trigger SettingWithCopyWarning.
df = df[df['gender'] != 'Other'].copy()

In [None]:
# Percentage of stroke / no-stroke patients within each gender.
# rename_axis + reset_index(name=...) pins the column names to 'stroke' and 'percent'
# on every pandas version: value_counts().reset_index() names the first column
# 'index' on pandas < 2.0 but 'stroke' on pandas >= 2.0, which broke x='index'.

# male
dfmale = (
    df.loc[df['gender'] == 'Male', 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

# female
dffemale = (
    df.loc[df['gender'] == 'Female', 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

fig, ax = plt.subplots(2, 2, figsize=(14,10))

df[['gender','stroke']].groupby(['gender']).count().plot.bar(ax=ax[0,0])
sns.countplot(x='gender', hue='stroke', data=df, ax=ax[0,1])
sns.barplot(x='stroke', y='percent', data=dfmale, ax=ax[1,0])
sns.barplot(x='stroke', y='percent', data=dffemale, ax=ax[1,1])

ax[0,0].set_title('Gender Count')
ax[0,1].set_title('Gender and Stroke')
ax[1,0].set_title('Male + Stroke')
ax[1,1].set_title('Female + Stroke')

ax[1,0].set_xlabel('Stroke')
ax[1,1].set_xlabel('Stroke')

ax[0,0].set_xticklabels(ax[0,0].get_xticklabels(), rotation=0) # labels were vertical
ax[0,0].invert_xaxis() # male and female labels in wrong order
ax[0,0].get_legend().remove()

# Annotate the bars with percentages computed from the data instead of
# hard-coded strings, so the labels cannot drift from the plotted values.
# The y positions are the original hand-tuned label heights.
for panel, percents, stroke_label_y in ((ax[1,0], dfmale, 4.7), (ax[1,1], dffemale, 4.3)):
    by_class = percents.set_index('stroke')['percent']
    panel.text(0, 50, f"{by_class.get(0, 0.0):.2f}%", va='center', ha='center', fontsize=20)
    panel.text(1, stroke_label_y, f"{by_class.get(1, 0.0):.2f}%", ha='center', fontsize=20)

fig.tight_layout()
plt.show()

Even though there are more females than males recorded in the data, the relative % of males and females having experienced a stroke is roughly equal.

# Age

In [None]:
fig = plt.figure(figsize=(20,10))

# Layout on a 3x4 grid: one double-width overview panel on top,
# then four single-width panels; each entry is ((row, col), colspan).
panel_slots = [((0,0), 2), ((1,0), 1), ((1,1), 1), ((2,0), 1), ((2,1), 1)]
ax = [plt.subplot2grid((3,4), anchor, colspan=width) for anchor, width in panel_slots]

# Subsets shown in the individual panels.
no_stroke = df[df['stroke'] == 0]
had_stroke = df[df['stroke'] == 1]
men = df[df['gender'] == 'Male']
women = df[df['gender'] == 'Female']

sns.histplot(x='age', hue='stroke', multiple='stack', binwidth=5, data=df, ax=ax[0])
sns.histplot(x='age', binwidth=5, data=no_stroke, ax=ax[1])
sns.histplot(x='age', color= '#FF8C00', binwidth=5, data=had_stroke, ax=ax[2])
sns.histplot(x='age', hue='stroke', multiple='stack', binwidth=5, data=men, ax=ax[3])
sns.histplot(x='age', hue='stroke', multiple='stack', binwidth=5, data=women, ax=ax[4])

panel_titles = [
    'Age and Stroke/No Stroke',
    'Age and No Stroke',
    'Age and Stroke',
    'Age, Male, Stroke',
    'Age, Female, Stroke',
]
for panel, title in zip(ax, panel_titles):
    panel.set_title(title)

fig.tight_layout()
plt.show()

This data suggests that the older the patient is, the more likely that they will have a stroke. There are a few cases where a patient had a stroke before the age of 20.

In [None]:
# Patients younger than 20 who nevertheless had a stroke.
df.query("age < 20 and stroke == 1")

Now a child at the age of about 1 and 14 having a stroke is surprising. Not sure whether this is a misinput or not. Maybe the stroke was due to a rare disease/condition? This makes me question what was considered a stroke when this data was collected, but this is out of our scope for now.

# Hypertension

In [None]:
# Percentage of stroke / no-stroke patients without and with hypertension.
# rename_axis + reset_index(name=...) pins the column names to 'stroke' and 'percent'
# on every pandas version (value_counts().reset_index() produces 'index' on
# pandas < 2.0 but 'stroke' on pandas >= 2.0, which broke x='index').

# no hypertension
dfnohyper = (
    df.loc[df['hypertension'] != 1, 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

# with hypertension
dfhyper = (
    df.loc[df['hypertension'] != 0, 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

fig, ax = plt.subplots(1,3, figsize=(16,6))

sns.countplot(x='hypertension', hue='stroke', data=df, ax=ax[0])
sns.barplot(x='stroke', y='percent', data=dfnohyper, ax=ax[1])
sns.barplot(x='stroke', y='percent', data=dfhyper, ax=ax[2])

ax[2].set(ylim=(0, 100))

ax[0].set_title('Count of Stroke and Hypertension')
ax[1].set_title('No Hypertension with Stroke')
ax[2].set_title('Hypertension with Stroke')

ax[1].set_xlabel('Stroke')
ax[2].set_xlabel('Stroke')

# Labels are computed from the data rather than typed in by hand;
# the coordinates are the original hand-tuned label positions.
nohyper_pct = dfnohyper.set_index('stroke')['percent']
hyper_pct = dfhyper.set_index('stroke')['percent']

ax[1].text(0, 48, f"{nohyper_pct.get(0, 0.0):.2f}%", va='center', ha='center', fontsize=20)
ax[1].text(1, 3.5, f"{nohyper_pct.get(1, 0.0):.2f}%", ha='center', fontsize=20)
ax[2].text(0, 43, f"{hyper_pct.get(0, 0.0):.2f}%", ha='center', fontsize=20)
ax[2].text(1, 4.7, f"{hyper_pct.get(1, 0.0):.2f}%", ha='center', fontsize=20)

plt.show()

Patients with hypertension have a higher occurrence of stroke.

In [None]:
# Age distribution by hypertension status, split by stroke outcome:
# a split violin on the left and the matching box plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(16,8))
violin_ax, box_ax = ax

sns.violinplot(x='hypertension', y='age', hue='stroke', data=df, split=True, ax=violin_ax)
sns.boxplot(x='hypertension', y='age', hue='stroke', data=df, ax=box_ax)

for panel in (violin_ax, box_ax):
    panel.set_title('Hypertension, age, stroke')

plt.show()

the larger distribution of having hypertension is located at older ages

# Heart Disease

In [None]:
# Percentage of stroke / no-stroke patients without and with heart disease.
# rename_axis + reset_index(name=...) pins the column names to 'stroke' and 'percent'
# on every pandas version (value_counts().reset_index() produces 'index' on
# pandas < 2.0 but 'stroke' on pandas >= 2.0, which broke x='index').

# no heart disease
dfnohd = (
    df.loc[df['heart_disease'] != 1, 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

# with heart disease
dfhd = (
    df.loc[df['heart_disease'] != 0, 'stroke']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('stroke')
    .reset_index(name='percent')
)

fig, ax = plt.subplots(1,3, figsize=(16,6))

sns.countplot(x='heart_disease', hue='stroke', data=df, ax=ax[0])
sns.barplot(x='stroke', y='percent', data=dfnohd, ax=ax[1])
sns.barplot(x='stroke', y='percent', data=dfhd, ax=ax[2])

ax[2].set(ylim=(0, 100))

ax[0].set_title('Count of Stroke and Heart Disease')
ax[1].set_title('No Heart Disease with Stroke')
ax[2].set_title('Heart Disease with Stroke')

ax[1].set_xlabel('Stroke')
ax[2].set_xlabel('Stroke')

# Labels are computed from the data rather than typed in by hand;
# the coordinates are the original hand-tuned label positions.
nohd_pct = dfnohd.set_index('stroke')['percent']
hd_pct = dfhd.set_index('stroke')['percent']

ax[1].text(0, 48, f"{nohd_pct.get(0, 0.0):.2f}%", va='center', ha='center', fontsize=20)
ax[1].text(1, 3.9, f"{nohd_pct.get(1, 0.0):.2f}%", ha='center', fontsize=20)
ax[2].text(0, 43, f"{hd_pct.get(0, 0.0):.2f}%", ha='center', fontsize=20)
ax[2].text(1, 8, f"{hd_pct.get(1, 0.0):.2f}%", ha='center', fontsize=20)

plt.show()

Although the count of patients with heart disease is very low, the occurrence of stroke is 12.84 percentage points higher among patients with heart disease (16.46%) than among patients without it (3.62%).

# Average Glucose Level

In [None]:
# Glucose distribution for everyone (stacked by outcome) next to stroke patients only.
fig, ax = plt.subplots(1,2, figsize=(16, 6))
overall_ax, stroke_ax = ax

stroke_patients = df[df['stroke'] == 1]

sns.histplot(x='avg_glucose_level', hue='stroke', multiple='stack', data=df, binwidth=10, ax=overall_ax)
sns.histplot(x='avg_glucose_level', color= '#FF8C00', binwidth=10, data=stroke_patients, ax=stroke_ax)

overall_ax.set_title('Average Glucose Level')
stroke_ax.set_title('Average Glucose Level with Stroke')
plt.show()

The second peak (avg_glucose_level above 150) for patients with stroke looks relatively higher than the second peak of the left plot, suggesting that there is a higher occurrence of stroke in patients with a higher average glucose level.

In [None]:
# Glucose level by stroke outcome, shown as a violin plot and as a box plot.
fig, ax = plt.subplots(1,2, figsize=(16,6))
violin_ax, box_ax = ax

sns.violinplot(x='stroke', y='avg_glucose_level', data=df, ax=violin_ax)
sns.boxplot(x='stroke', y='avg_glucose_level', data=df, ax=box_ax)

for panel in (violin_ax, box_ax):
    panel.set_title('stroke and avg glucose level')

plt.show()

This plot shows more clearly that a higher avg_glucose_level comes with a higher risk of having a stroke. Compared with the histogram, it is easier to see that a proportionally larger share of the patients who experienced a stroke have a high average glucose level.

# BMI

In [None]:
# BMI distribution for everyone (stacked by outcome) next to stroke patients only.
fig, ax = plt.subplots(1,2, figsize=(16, 6))
overall_ax, stroke_ax = ax

stroke_patients = df[df['stroke'] == 1]

sns.histplot(x='bmi', hue='stroke', multiple='stack', data=df, binwidth=5, ax=overall_ax)
sns.histplot(x='bmi', color= '#FF8C00', binwidth=5, data=stroke_patients, ax=stroke_ax)

overall_ax.set_title('bmi')
stroke_ax.set_title('bmi with stroke')
plt.show()

In [None]:
# Create an empty figure only. plt.subplots() would also add a full-figure Axes
# that is never drawn on and sits behind the subplot2grid panels below.
fig = plt.figure(figsize=(20,12))

ax = [None for _ in range(3)] # create # of axes

# One double-width overview panel, then one panel per stroke outcome.
ax[0] = plt.subplot2grid((4,4), (0,0), colspan=2)
ax[1] = plt.subplot2grid((4,4), (1,0), colspan=1)
ax[2] = plt.subplot2grid((4,4), (1,1), colspan=1)

sns.regplot(x='avg_glucose_level', y='bmi', data=df, ax=ax[0], scatter_kws={'s':2}, line_kws={"color": "black", 'linewidth':1})
sns.regplot(x='avg_glucose_level', y='bmi', data=df[df['stroke'] == 0], ax=ax[1], scatter_kws={'s':2}, line_kws={"color": "black", 'linewidth':1})
sns.regplot(x='avg_glucose_level', y='bmi', color= '#FF8C00', data=df[df['stroke'] == 1], ax=ax[2], scatter_kws={'s':2}, line_kws={"color": "black", 'linewidth':1})

ax[0].set_title('avg glucose level and BMI')
ax[1].set_title('No Stroke')
ax[2].set_title('Stroke')

# Use a fixed 0-100 BMI range on the stroke panel.
ax[2].set_ylim(0, 100)

plt.tight_layout()
plt.show()

Slight positive correlation from all graphs

# Smoking Status

In [None]:
# Size of each smoking_status group (count / unique / top / freq per category).
df.groupby('smoking_status')['smoking_status'].describe()

In [None]:
# Age distribution within each smoking_status category (violin and box view).
fig, ax = plt.subplots(1,2, figsize=(16,8))
violin_ax, box_ax = ax

sns.violinplot(x='smoking_status', y='age', data=df, ax=violin_ax)
sns.boxplot(x='smoking_status', y='age', data=df, ax=box_ax)

for panel in (violin_ax, box_ax):
    panel.set_title('Age and smoking status')

plt.show()

It seems that there are a lot of unknown values for ages below 20. Let's try dividing up this group.

In [None]:
# Split the 'Unknown' smoking status into two groups by age.
# Both masks are computed up front and only the smoking_status column is assigned,
# instead of rewriting whole rows through a boolean mask and replacing 'Unknown'
# in every column.
# Note: the boundary is age > 20, so patients aged exactly 20 fall into
# the 'Unknown under 20' group (unchanged from the original logic).
unknown_status = df['smoking_status'] == 'Unknown'
over_20 = df['age'] > 20
df.loc[unknown_status & over_20, 'smoking_status'] = 'Unknown over 20'
df.loc[unknown_status & ~over_20, 'smoking_status'] = 'Unknown under 20'

In [None]:
# Contingency table: smoking status (rows) against stroke outcome (columns).
pd.crosstab(
    df['smoking_status'],
    df['stroke'],
    rownames=['smoking_status'],
    colnames=['stroke'],
)

In [None]:
fig, ax = plt.subplots(figsize=(16,6))

# Fix the bar order explicitly (order of first appearance, seaborn's default for
# string columns) so each annotation is guaranteed to sit on its own group.
status_order = list(df['smoking_status'].unique())
sns.countplot(x='smoking_status', hue='stroke', data=df, order=status_order, ax=ax)

# Share of no-stroke (0) / stroke (1) patients within each smoking group.
# Computed from the data: the previous hard-coded labels were inconsistent
# (e.g. "99.97%" next to "0.30%", which does not add up to 100%).
stroke_pct = pd.crosstab(df['smoking_status'], df['stroke'], normalize='index').mul(100)

# Hand-tuned label heights, by bar-group position (taken from the original figure).
no_stroke_label_y = 400
stroke_label_y = [100, 120, 80, 80, 40]

for position, status in enumerate(status_order):
    group_pct = stroke_pct.loc[status]
    # Fall back to a generic height if there are more groups than tuned positions.
    y_stroke = stroke_label_y[position] if position < len(stroke_label_y) else 80
    ax.text(position - 0.2, no_stroke_label_y, f"{group_pct.get(0, 0.0):.2f}%", va='center', ha='center', fontsize=15)
    ax.text(position + 0.2, y_stroke, f"{group_pct.get(1, 0.0):.2f}%", va='center', ha='center', fontsize=15)

ax.text(3.7, 1800, "Note: local % for each group", va='center', ha='center', fontsize=15)

plt.title('Smoking Status and Stroke')

plt.tight_layout()
plt.show()

Patients who formerly smoked or currently smoke had a slightly higher occurrence of stroke than patients who never smoked.

# Preparing the Data

In [None]:
# Missing values per column; the bmi gaps are mean-imputed later on.
df.isna().sum()

In [None]:
# Residence_type is removed: it shows low correlation with all features (see heatmap).
df = df.drop(columns=['Residence_type'])

In [None]:
# Split the data 80/20.
# Stratify on the target: with only ~5% positive cases, an unstratified split
# can leave the train and test sets with noticeably different stroke rates.
train_set, test_set = train_test_split(
    df, test_size=0.2, random_state=1, stratify=df['stroke']
)

# separate features and target
X = train_set.drop(['stroke'], axis=1)
y = train_set['stroke']

# separate numerical and categorical columns
X_num = X[['age', 'avg_glucose_level', 'bmi']]
X_cat = X[['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'smoking_status']]

In [None]:
# Preprocessing pipeline.

# Numeric columns: fill missing values (bmi) with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode, dropping the first level of each feature.
# The encoder's `sparse=False` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so it is not passed here; a dense result
# is requested on the ColumnTransformer instead, which works on every version.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(drop='first'))
])

num_colnames = list(X_num)
cat_colnames = list(X_cat)

# sparse_threshold=0 makes the combined output always a dense array.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_colnames),
    ('cat', cat_pipeline, cat_colnames)
], sparse_threshold=0)

X_prepared = full_pipeline.fit_transform(X)

### Over/Undersampling
# Each strategy first oversamples the minority class to a 0.1 minority/majority
# ratio and then randomly undersamples the majority class to a 0.6 ratio.
# NOTE(review): resampling happens once, before cross-validation, so synthetic
# samples can end up in both training and validation folds; CV scores on these
# sets are likely optimistic. Consider putting the sampler inside the CV pipeline.
undersample = RandomUnderSampler(sampling_strategy=0.6, random_state=1)

# SMOTE
smote = SMOTE(sampling_strategy=0.1, random_state=1)
pipelinesmote = Pipeline([
    ('smote', smote),
    ('under', undersample)
])

X_smote, y_smote = pipelinesmote.fit_resample(X_prepared, y)

# BorderlineSMOTE
pipelinebsmote = Pipeline([
    ('border', BorderlineSMOTE(sampling_strategy=0.1, random_state=1)),
    ('under', undersample)
])

X_bsmote, y_bsmote = pipelinebsmote.fit_resample(X_prepared, y)

# Borderline-SMOTE SVM
pipelinesvmsmote = Pipeline([
    ('svmsmote', SVMSMOTE(sampling_strategy=0.1, random_state=1)),
    ('under', undersample)
])

X_svmsmote, y_svmsmote = pipelinesvmsmote.fit_resample(X_prepared, y)

# Adaptive Synthetic Sampling
pipelineadasyn = Pipeline([
    ('adasyn', ADASYN(sampling_strategy=0.1, random_state=1)),
    ('under', undersample)
])

# Fixed: this previously called pipelinesvmsmote, so the "ADASYN" data set was
# just a second copy of the SVMSMOTE resample.
X_adasyn, y_adasyn = pipelineadasyn.fit_resample(X_prepared, y)

# Modelling

In [None]:
scoredict = [] # will be filled with tuples of model/sample and roc/auc score
               # and eventually put into a dataframe to be plotted

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

def eval_model(model, X, y, label=None):
    """Cross-validate `model` on (X, y) and record its mean ROC AUC in `scoredict`.

    Parameters
    ----------
    model : estimator passed to cross_val_score.
    X, y : features and labels to cross-validate on (using the module-level `cv`).
    label : optional name of the data set; appended to the model's class name
        to form the row label. Previously this was read from the loop global `c`,
        so the function only worked when called from inside that loop.

    Appends a (name, mean ROC AUC rounded to 3 decimals) tuple to `scoredict`.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
    meanscore = scores.mean().round(3)
    scorename = type(model).__name__
    if label is not None:
        scorename = scorename + ' ' + label
    scoredict.append((scorename, meanscore))

# models

logclf = LogisticRegression(random_state=1)
knnclf = KNeighborsClassifier()
svcclf = SVC(random_state=1)

listofmodels = [logclf, knnclf, svcclf]

Xfeatures = [X_prepared, X_smote, X_bsmote, X_svmsmote, X_adasyn]
ylabels = [y, y_smote, y_bsmote, y_svmsmote, y_adasyn]
datasetlabel = ['NO RESAMPLING', 'SMOTE', 'BSMOTE', 'SVMSMOTE', 'ADASYN']

# Evaluate every model on every (re)sampled version of the training data.
for model in listofmodels:
    for features, labels, sample_name in zip(Xfeatures, ylabels, datasetlabel):
        eval_model(model, features, labels, sample_name)

models = pd.DataFrame(data=scoredict,
                     columns=['Models and sample', 'Mean ROC AUC'])
models = models.sort_values(by=['Mean ROC AUC'], ascending=False)

fig, ax = plt.subplots(1,1, figsize=(14,10))

sns.barplot(x='Mean ROC AUC', y='Models and sample', data=models, orient='h', ax=ax)
plt.show()

For each individual model, the mean ROC AUC is highest using BSMOTE. Let's optimize these models with the BSMOTE samples.

# Grid Search

In [None]:
# Hyperparameter search for each model on the BorderlineSMOTE-resampled training
# data, scored by ROC AUC with the repeated stratified CV defined earlier.

# KNN

param_gridknn = [
    {'n_neighbors':[1, 5, 10, 15], 'weights':['uniform', 'distance'], 'leaf_size':[10, 30, 50]}
]

grid_searchknn = GridSearchCV(knnclf, param_gridknn, cv=cv,
                          scoring='roc_auc')
grid_searchknn.fit(X_bsmote, y_bsmote)

# SVC

param_gridsvc = [
    {'C':[.01, .1, 1, 5, 10], 'kernel':['linear', 'poly', 'rbf'],
     }
]

grid_searchsvc = GridSearchCV(svcclf, param_gridsvc, cv=cv,
                          scoring='roc_auc')
grid_searchsvc.fit(X_bsmote, y_bsmote)


# Log

# class_weight accepts a {class_label: weight} mapping, 'balanced' or None.
# The grid previously contained the builtin type `dict` itself, which is not a
# valid value: every fit using it failed, and the failures were hidden by the
# global warnings filter. It has been removed from the grid.
param_gridlog = [
    {'C':[0.01,.1,1,5,10], 'class_weight':['balanced', None]
    }
]

grid_searchlog = GridSearchCV(logclf, param_gridlog, cv=cv,
                          scoring='roc_auc')
grid_searchlog.fit(X_bsmote, y_bsmote)

print('KNN hyperparameters:', grid_searchknn.best_params_)
print('SVC hyperparameters:', grid_searchsvc.best_params_)
print('LOG hyperparameters:', grid_searchlog.best_params_)

In [None]:
def evalgrid(modelname, model):
    """Print the cross-validated ROC AUC (mean and std) of `model` on the BSMOTE data.

    modelname -- label printed above the scores
    model     -- estimator to cross-validate on (X_bsmote, y_bsmote) with `cv`
    """
    auc_scores = cross_val_score(model, X_bsmote, y_bsmote, cv=cv, scoring='roc_auc')
    mean_auc = round(auc_scores.mean(), 3)
    auc_std = round(auc_scores.std(), 3)
    print(modelname)
    print('Mean ROC AUC:', mean_auc)
    print('Std dev.:', auc_std)
    print('')

# Report the tuned estimator from each grid search.
for search_label, search in (('KNN', grid_searchknn), ('SVC', grid_searchsvc), ('LOG', grid_searchlog)):
    evalgrid(search_label, search.best_estimator_)

# Evaluate model on Test Set

In [None]:
X_test = test_set.drop('stroke', axis=1)
y_test = test_set['stroke'].copy()

# Apply the preprocessing fitted on the training set (transform only, no refit).
X_test_prepared = full_pipeline.transform(X_test)

def _positive_class_scores(estimator, X):
    """Return continuous scores for the positive class, for ranking metrics.

    Uses decision_function when the estimator provides it, otherwise the
    predicted probability of the second class.
    """
    if hasattr(estimator, 'decision_function'):
        return estimator.decision_function(X)
    return estimator.predict_proba(X)[:, 1]

def evaltest(model, modelpred, modelscore=None):
    """Print ROC AUC, precision, recall and F1 for one model on the test set.

    model      -- label printed above the metrics
    modelpred  -- hard class predictions for X_test_prepared
    modelscore -- optional continuous scores for the positive class. ROC AUC is a
                  ranking metric, so it is computed from these when given;
                  computing it from 0/1 predictions collapses the ROC curve to a
                  single operating point. Falls back to `modelpred` when omitted.
    """
    ranking_input = modelpred if modelscore is None else modelscore
    modelroc = roc_auc_score(y_test, ranking_input)
    modelprec = precision_score(y_test, modelpred)
    modelrecall = recall_score(y_test, modelpred)
    modelf1 = f1_score(y_test, modelpred)
    print(model)
    print('ROC AUC:', round(modelroc, 3))
    print('Precision:', round(modelprec, 3))
    print('Recall:', round(modelrecall, 3))
    print('F1 score:', round(modelf1, 3))
    print('')

# KNN
knnpredictions = grid_searchknn.best_estimator_.predict(X_test_prepared)
evaltest('KNN', knnpredictions,
         _positive_class_scores(grid_searchknn.best_estimator_, X_test_prepared))

# SVC
svcpredictions = grid_searchsvc.best_estimator_.predict(X_test_prepared)
evaltest('SVC', svcpredictions,
         _positive_class_scores(grid_searchsvc.best_estimator_, X_test_prepared))

# Log
logpredictions = grid_searchlog.best_estimator_.predict(X_test_prepared)
evaltest('LOG', logpredictions,
         _positive_class_scores(grid_searchlog.best_estimator_, X_test_prepared))

LogisticRegression looks like the best model since it has the highest recall score and precision score.