# Import required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import relevant datasets

In [None]:
# Explicit dtypes: every listed text column becomes a pandas categorical,
# and `age` is stored as an integer.
# NOTE(review): if `age` holds fractional values, the int64 cast truncates
# them — confirm this is intended.
types = dict.fromkeys(
    ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    'category',
)
types['age'] = 'int64'

# Read the CSV, apply the dtypes and discard the `id` column in one chain,
# then print the resulting schema.
df = (
    pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
    .astype(types)
    .drop(columns='id')
)
df.info()

In [None]:
# Impute missing BMI values with the column mean, then show summary statistics.
# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)
df.describe()

# Exploratory Data Analysis

In [None]:
# Continuous variables: pair-plot the frame without the `hypertension` and
# `heart_disease` columns, colouring by stroke outcome with KDE diagonals.
target = df['stroke']
cont = df.drop(['hypertension', 'heart_disease'], axis='columns')

sns.pairplot(data=cont, hue='stroke', diag_kind='kde')

In [None]:
# Categorical variables: age distribution per category, split by stroke outcome.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
cat = df[categorical_columns + ['stroke']]

# 3x2 grid: the first five panels hold the violins, the last one the KDE.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 12))
*violin_axes, kde_ax = axes.flat  # row-major order, same panel layout as before

# One split violin per categorical column (replaces five copy-pasted calls).
for column, ax in zip(categorical_columns, violin_axes):
    sns.violinplot(data=df, x=column, y='age', hue='stroke', split=True, inner='box', ax=ax)
    ax.set_title('Age by {} and stroke outcome'.format(column))

# Joint density of BMI and age, coloured by stroke outcome.
sns.kdeplot(data=df, x='bmi', y='age', hue='stroke', ax=kde_ax)
kde_ax.set_title('BMI vs age by stroke outcome')

# Keep titles and axis labels from overlapping neighbouring panels.
fig.tight_layout()

# EDA conclusions

From the results, we can see that as age increases the probability of stroke increases.  
However, even though age is a major contributing factor, current smokers also appear to have a higher stroke risk than other groups at a comparable age.  
We also see that a BMI > 25 (overweight) is an indicator of stroke.

In [None]:
# Share of positive stroke cases, used to quantify the class imbalance.
# The counts are computed once and looked up by label; a class that is absent
# from the data counts as 0 instead of raising a KeyError.
stroke_counts = df['stroke'].value_counts()
stroke = stroke_counts.get(1, 0)
no_stroke = stroke_counts.get(0, 0)
pct_stroke = stroke / (stroke + no_stroke) * 100

# The value is a percentage, so the message says so (and spells "stroke" correctly).
print('The percentage of positive stroke cases in dataset is {:0.2f}%'.format(pct_stroke))

# Data Preparation

Prepare for 2 cases:  
Case 1 -> SMOTE Oversampling to oversample positive stroke cases to compensate for imbalanced data.  
Case 2 -> Random Under Sampler to undersample negative stroke cases to compensate for imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.svm import SVC
from imblearn.pipeline import make_pipeline


# Separate the predictors from the stroke label.
variable = df.drop(columns='stroke')
target = df['stroke']

# Hold out 30% for testing. Stratify on the target so the minority (positive)
# class keeps the same proportion in the train and test partitions; an
# unstratified split of an imbalanced target can leave the test set with too
# few positive cases to evaluate reliably.
X_train, X_test, y_train, y_test = train_test_split(
    variable, target, test_size=0.3, random_state=10, stratify=target)


# One-hot encode the categorical columns, scale the continuous ones by their
# maximum absolute value, and pass every remaining column through unchanged.
ct = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']),
                       (MaxAbsScaler(), ['avg_glucose_level', 'bmi', 'age']),
                       remainder='passthrough')

# Resamplers for the two cases, plus the classifier; seeds fixed for reproducibility.
smote = SMOTE(random_state=10)
rus = RandomUnderSampler(random_state=10)
svc = SVC()

# Machine Learning

As the dataset is imbalanced (positive stroke cases are a small fraction of the total), we can compensate using the Synthetic Minority Oversampling Technique (SMOTE) to balance the target dataset.


The first case tests the effect of SMOTE oversampling and the second case tests the effect of random undersampling; both cases use the same Max Absolute Scaler preprocessing and SVC classifier.

In [None]:
# Case 1 SMOTE
from sklearn.base import clone

# Pipelines fit their steps in place, so a column transformer or classifier
# object shared with another pipeline would be refitted (and this pipeline's
# fitted state overwritten) when that other pipeline is fitted. Unfitted
# clones give this pipeline its own independent steps.
case1 = make_pipeline(clone(ct), smote, clone(svc))
case1.fit(X_train, y_train)
y_pred1 = case1.predict(X_test)

In [None]:
# Case 2 Random Under Sampler
from sklearn.base import clone

# Pipelines fit their steps in place, so reusing the shared `ct` / `svc`
# objects here would refit them and silently change any other pipeline built
# from the same objects. Unfitted clones keep this pipeline independent.
case2 = make_pipeline(clone(ct), rus, clone(svc))
case2.fit(X_train, y_train)
y_pred2 = case2.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Plain accuracy of each resampling strategy on the test split.
ac1, ac2 = [accuracy_score(y_test, predicted) for predicted in (y_pred1, y_pred2)]

print(
    'The accuracy score of case 1 (SMOTE) is: {:.2f}% and case 2 (Random Under Sampler) is: {:.2f}%'.format(
        ac1 * 100,
        ac2 * 100,
    )
)

In [None]:
# Imbalance-aware classification report for case 1 (SMOTE), then case 2
# (Random Under Sampler), printed one after the other.
reports = [
    classification_report_imbalanced(y_test, predicted)
    for predicted in (y_pred1, y_pred2)
]
print(*reports, sep='\n')

This is my first time using SMOTE and Random Under Sampler to compensate for imbalanced data

Many thanks to Aditi Mulye and lakshman raj, as I referred quite a bit to their submissions.

I would appreciate any comments on how to improve the scores, as well as suggestions for scoring techniques suited to imbalanced datasets. Usually I would use an ROC curve to compare between two models but that is not possible with imblearn.