In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings  
warnings.filterwarnings('ignore')

# Understanding the dataset

In [None]:
# Read the stroke CSV from the `../input` directory and preview the first rows.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

Our dataset has 5110 rows, one per person. Let's check it for missing values and data types with `info()`.

In [None]:
# Per-column dtype and non-null count; used here to spot missing values
data.info()

As we can see, BMI has some missing values. Our variables fall into the following types (be aware that some columns stored as int64 are actually categorical variables):
* Numerical variables
  * Continuous: age, avg_glucose_level, BMI
  * Discrete: None
* Categorical: gender, stroke, smoking_status, heart_disease, ever_married, hypertension, work_type, Residence_type

It's a good idea to store the column names of each variable type for later use.

In [None]:
# Column names grouped by variable type; later cells index `data` with these lists.
num_cols = [
    "age",
    "avg_glucose_level",
    "bmi",
]
cat_cols = [
    "gender",
    "stroke",
    "smoking_status",
    "heart_disease",
    "ever_married",
    "hypertension",
    "work_type",
    "Residence_type",
]

In [None]:
# Preview only the first rows of the categorical columns instead of
# rendering the whole frame in the notebook output.
data[cat_cols].head()

Let's walk through each feature and see its correlation to stroke.

## Gender
Who has a higher chance of getting a stroke: females or males? Do males suffer more strokes because they smoke? Let's find out.

In [None]:
# Number of rows for every (gender, stroke) combination.
grouped_by_gender = data.groupby(["gender", "stroke"])
gender_stroke = grouped_by_gender[["stroke"]].count()
gender_stroke

We see a similar stroke rate for both genders. There is also an "Other" gender category with only 1 sample, so we will remove it.

In [None]:
# Remove the rows whose gender is "Other". `.copy()` makes `data` an
# independent frame, so the column assignments in later cells do not act on
# a slice of the originally loaded frame.
data = data.loc[data["gender"] != "Other"].copy()

In [None]:
# Bar chart of stroke / no-stroke counts per gender.
gender_counts = pd.crosstab(index=data['gender'], columns=data['stroke'])
gender_counts.plot.bar()

## Smoking status

In [None]:
# Number of rows for every (smoking_status, stroke) combination.
# Stored under its own name so it no longer overwrites the gender table.
smoking_stroke = data.groupby(["smoking_status", "stroke"])[["stroke"]].count()
smoking_stroke

In [None]:
# Bar chart of stroke / no-stroke counts per smoking status.
smoking_counts = pd.crosstab(index=data['smoking_status'], columns=data['stroke'])
smoking_counts.plot.bar()

## heart_disease

In [None]:
# Share of stroke / no-stroke within each heart_disease group (rows sum to 1).
pd.crosstab(
    index=data['heart_disease'],
    columns=data['stroke'],
    normalize='index',
)

As we can see, if you have heart disease you have over 17% chance of getting a stroke compared to 4% who don't have heart disease

In [None]:
# Bar chart of stroke / no-stroke counts per heart_disease value.
heart_counts = pd.crosstab(index=data['heart_disease'], columns=data['stroke'])
heart_counts.plot.bar()

## ever_married

In [None]:
# Share of stroke / no-stroke within each ever_married group (rows sum to 1).
pd.crosstab(
    index=data['ever_married'],
    columns=data['stroke'],
    normalize='index',
)

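Strange but expected! People who have ever been married are about 6 times more likely to have had a stroke: 6% compared to 1%. Keep in mind that married people are probably older on average, so age may explain much of this difference.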

In [None]:
# Bar chart of stroke / no-stroke counts per ever_married value.
married_counts = pd.crosstab(index=data['ever_married'], columns=data['stroke'])
married_counts.plot.bar()

## Hypertension, work_type, Residence_type
These features are analysed the same way as above and show nothing notably different, so I will skip their visualizations.
# Continuous variables

In [None]:
# First rows of the numeric columns only.
data.loc[:, num_cols].head()

Wait! Remember that we are missing some BMI values? We need to fill them in. Here I fill each missing BMI with the mean BMI of people of the same gender and age group.

First, let's bin age into 5 equally sized groups (quintiles).

In [None]:
# Cut age into 5 quantile-based bins labelled 0..4, then count rows per bin.
age_quintiles = pd.qcut(data['age'], q=5, labels=list(range(5)))
data["age_bin"] = age_quintiles
data["age_bin"].value_counts()

Great! Our age divided into 5 bins, let's calculate the mean of each bin and gender

In [None]:
# Mean BMI for every (gender, age_bin) group.
bmi_by_group = data.groupby(["gender", "age_bin"])["bmi"]
age_to_fill = bmi_by_group.mean()
age_to_fill

We have the mean BMI of each gender and age range, let's fill the missing value

In [None]:
# Fill each missing BMI with the mean BMI of its (gender, age_bin) group.
# `transform("mean")` returns the group mean aligned row-by-row with `data`,
# which replaces the nested loop over hard-coded bins and genders.
group_mean_bmi = data.groupby(["gender", "age_bin"], observed=True)["bmi"].transform("mean")
data["bmi"] = data["bmi"].fillna(group_mean_bmi)

# The helper column is no longer needed once BMI has been filled.
data = data.drop(columns="age_bin")

In [None]:
# Three side-by-side BMI histograms sharing both axes:
# all rows, stroke rows only, and non-stroke rows only.
fig = plt.figure(figsize=(18, 5))

ax1 = fig.add_subplot(131)
data["bmi"].hist(bins=40, color="skyblue", ax=ax1)

ax2 = fig.add_subplot(132, sharey=ax1, sharex=ax1)
stroke_bmi = data.loc[data["stroke"] == 1, "bmi"]
stroke_bmi.hist(bins=40, color="red", ax=ax2)

ax3 = fig.add_subplot(133, sharey=ax1, sharex=ax1)
no_stroke_bmi = data.loc[data["stroke"] == 0, "bmi"]
no_stroke_bmi.hist(bins=40, color="springgreen", ax=ax3)

for axis, panel_title in ((ax1, 'All BMI'), (ax2, 'Stroke=1'), (ax3, 'Stroke=0')):
    axis.title.set_text(panel_title)
plt.show()

By eye, it's hard to see the difference. Let's calculate the mean and median.

In [None]:
# Mean and median BMI for stroke vs. non-stroke rows.
bmi_by_stroke = data.groupby(["stroke"])["bmi"]
bmi_by_stroke.agg(['mean', 'median'])

We can see that stroke patients have a higher BMI on average. Let's do the same for glucose level:

In [None]:
# Mean and median average glucose level for stroke vs. non-stroke rows.
glucose_by_stroke = data.groupby(["stroke"])["avg_glucose_level"]
glucose_by_stroke.agg(['mean', 'median'])

Stroke patients also have a higher average glucose level.

# Finishing up

In [None]:
# Drop the `id` column. `errors='ignore'` keeps the cell re-runnable:
# a second run finds no `id` column and leaves the frame as is instead of
# raising KeyError.
data = data.drop(columns='id', errors='ignore')
data.head()

In [None]:
# Turn the 0/1 flags into 'No'/'Yes' labels so get_dummies later treats them
# as categorical. Assigning the result back (rather than calling
# `replace(..., inplace=True)` on `data.<column>`) guarantees that `data`
# itself is updated; the chained in-place form does not write through under
# pandas Copy-on-Write.
yes_no = {0: 'No', 1: 'Yes'}
for flag_col in ("hypertension", "heart_disease"):
    data[flag_col] = data[flag_col].replace(yes_no)
data.head()

## One hot and Scale

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Columns to standardize.
num_cols=['age', 'avg_glucose_level', 'bmi']
# Categorical columns carried over unscaled, with the `stroke` target last.
# `gender` is included so that frames built from this list keep that feature.
cat_cols=['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']

In [None]:
# Standardize the numeric columns and wrap the result in a DataFrame
# that reuses the original column names.
scaler_step = ("standard", StandardScaler(), num_cols)
ct = ColumnTransformer([scaler_step], remainder='passthrough')
scaled_values = ct.fit_transform(data[num_cols])
df = pd.DataFrame(scaled_values, columns=data[num_cols].columns)
df.head()

In [None]:
# Give both frames a fresh 0..n-1 index so the column-wise concat lines rows up
# by position, then join the categorical columns with the scaled numeric ones.
data = data.reset_index(drop=True)
df = df.reset_index(drop=True)
data_2 = pd.concat([data[cat_cols], df], axis=1)
data_2.head()

In [None]:
# Swap the raw numeric columns for their standardized versions before
# encoding. Without this step the scaled values in `df` are never used and
# every model is trained on unscaled features.
# `.to_numpy()` assigns by position, so the result does not depend on the
# two frames sharing an index (they have the same number of rows).
data[num_cols] = df[num_cols].to_numpy()

# One-hot encode the remaining string-valued columns.
data = pd.get_dummies(data)
data.head()

Great! Our data is complete and ready to go!

# Prediction
Having taken a look at our dataset, let's now make some predictions! We start by splitting our dataset into train and test sets.

## Split
Split our data

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the `stroke` target.
X = data.drop(columns='stroke')
y = data['stroke']

# `stratify=y` keeps the stroke / no-stroke proportion the same in both
# splits, which matters because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Class counts in the training and test targets, shown as a pair.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
(train_counts, test_counts)

In [None]:
# Pie chart of the class balance over the whole dataset.
class_counts = y.value_counts()
class_counts.plot.pie()

Our dataset is imbalanced. There are only about 4% stroke cases in our training set. If we predicted that no one has a stroke, we would get an accuracy of 96%, which is not a useful model. Besides that, detecting stroke patients is more important than detecting healthy ones.

We can fix this by oversampling the minority class, undersampling the majority class, or both. Since our dataset is small, we first oversample the stroke cases and then lightly undersample the rest.

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Instantiate the over- and under-sampler with fixed seeds so the resampled
# class counts (and everything derived from them) are reproducible.
over = RandomOverSampler(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

# First oversample the minority class on the full dataset.
X_over, y_over = over.fit_resample(X, y)
print(f"Oversampled: {Counter(y_over)}")

# Then combine it with undersampling of the majority class.
X_combined_sampling, y_combined_sampling = under.fit_resample(X_over, y_over)
print(f"Combined Random Sampling: {Counter(y_combined_sampling)}")

In [None]:
# Pie chart of the class balance after the combined resampling.
resampled_counts = y_combined_sampling.value_counts()
resampled_counts.plot.pie()

As we can see, our data is now more balanced. Now let's apply the same resampling to the data we split earlier.

In [None]:
# Fixed seeds so the resampled training set is the same on every run.
over = RandomOverSampler(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

## Training
# First oversample the minority class of the training split.
X_over, y_over = over.fit_resample(X_train, y_train)

# Then combine it with undersampling of the majority class.
X_combined_sampling, y_combined_sampling = under.fit_resample(X_over, y_over)
X_train = X_combined_sampling
y_train = y_combined_sampling
print(f"Combined Random Sampling on X_train: {Counter(y_train)}")

## Testing
# The test split is deliberately NOT resampled: it has to keep the original
# class balance so the reported metrics reflect how the models behave on
# realistic data. Resampling is a training-time technique only.
print(f"Untouched X_test: {Counter(y_test)}")

Looks like our dataset is more balanced now. Time to start training.

# Learning

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

### Logistic Regression

In [None]:
# Fit logistic regression on the training split and predict the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

### RandomForest

In [None]:
# Fit a random forest on the training split and predict the test split.
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

### KNN

In [None]:
# Fit k-nearest neighbours on the training split and predict the test split.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
# Fit a depth-4 decision tree on the training split and predict the test split.
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

### SVC

In [None]:
# Fit an SVC with default settings on the training split and predict the test split.
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

### Linear SVC

In [None]:
# Fit a linear-kernel SVC on the training split and predict the test split.
model = SVC(kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# classification_report takes (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall.
print(classification_report(y_test, y_pred))

# Cross validation
Cross-validation usually gives us a more reliable estimate of model performance than a single train/test split.

In [None]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import StratifiedKFold #class-balanced K-fold
from sklearn.model_selection import cross_val_score,cross_validate  #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

# k=8: split the data into 8 folds. The folds are stratified and shuffled so
# each one contains both classes in similar proportion; a plain unshuffled
# KFold slices rows in their stored order, which is unsafe if the resampled
# data happens to be grouped by class.
kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)

classifiers=["Logistic Reg","SVC","KNN","Decision tree","Random Forest","Linear SVC"]
models=[LogisticRegression(),SVC(),KNeighborsClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
       SVC(kernel='linear')]

accuracy=[]  # per-fold accuracy scores, one array per model
f1=[]        # per-fold macro-F1 scores, one array per model
for model in models:
    # Cross-validate on the (resampled) training data explicitly, rather than
    # on whatever `X_combined_sampling` was last assigned to.
    # NOTE(review): the training data contains oversampled duplicates, so the
    # same row can land in both a training and a validation fold; the scores
    # are therefore optimistic.
    cv_result = cross_validate(model, X_train, y_train, cv=kfold,
                               scoring=["f1_macro","accuracy"])
    accuracy.append(cv_result['test_accuracy'])
    f1.append(cv_result['test_f1_macro'])

# One row per classifier with the mean of each metric over the folds.
new_models_dataframe2=pd.DataFrame(
    {'CV Acc Mean':[fold_scores.mean() for fold_scores in accuracy],
     'CV f1 mean':[fold_scores.mean() for fold_scores in f1]},
    index=classifiers)
new_models_dataframe2

In [None]:
# Box plot of the per-fold macro-F1 scores, one box per classifier.
fig, ax = plt.subplots(figsize=(12, 6))
box = pd.DataFrame(f1, index=classifiers)
per_fold_scores = box.T
per_fold_scores.boxplot(ax=ax)

Overall, the decision tree and random forest did the best job of classifying stroke.

# Voting ensemble
A voting classifier combines the predictions of several models, which is a good way to boost accuracy.

In [None]:
# Hard-voting ensemble: each base model casts one vote per sample and the
# majority label wins. Hard voting only uses the predicted labels, so the
# SVCs are built without `probability=True`, which would only add extra
# fitting work that the ensemble never uses.
ensemble_lin_rbf=VotingClassifier(estimators=[('KNN',KNeighborsClassifier()),
                                              ('RBF',SVC(kernel='rbf')),
                                              ('RFor',RandomForestClassifier(n_estimators=500,max_depth=5,random_state=1)),
                                              ('LR',LogisticRegression()),
                                              ('DT',DecisionTreeClassifier(max_depth=4,criterion='gini', random_state=0)),
                                              ('LSVC',SVC(kernel='linear'))
                                             ],
                       voting='hard')
# Cross-validate on the (resampled) training data explicitly, rather than on
# whatever `X_combined_sampling` was last assigned to.
cv_result=cross_validate(ensemble_lin_rbf,X_train,y_train, cv = kfold,scoring = ["f1_macro","accuracy"])
cv_result

In [None]:
# Report the fold-averaged accuracy and macro-F1 of the ensemble.
mean_accuracy = cv_result['test_accuracy'].mean()
mean_f1 = cv_result['test_f1_macro'].mean()
print(f"Our ensemble model accuracy:{mean_accuracy} ")
print(f"Our ensemble model f1-score:{mean_f1} ")