In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


# Stroke Prediction

You can see the data and other details [here](https://www.kaggle.com/fedesoriano/stroke-prediction-dataset)

I'm trying to predict stroke with some common classification algorithms, after doing some preprocessing. Please give me suggestions to improve my notebook :)

# **EDA**

In [None]:
# Load the stroke dataset from the Kaggle input directory.
df = pd.read_csv(
    "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Per column: does it contain at least one missing value?
df.isnull().any(axis=0)

Fill the missing `bmi` values with the column mean

In [None]:
# Impute missing BMI values with the mean of the observed BMI values.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

In [None]:
# Continuous features, kept in a separate frame for the distribution plots.
numerical = df.loc[:, ["age", "avg_glucose_level", "bmi"]]

In [None]:
# Discrete features plus the target, kept in a separate frame for the count plots.
categorical = df.loc[
    :,
    [
        "gender",
        "hypertension",
        "heart_disease",
        "ever_married",
        "work_type",
        "Residence_type",
        "smoking_status",
        "stroke",
    ],
]

Since our goal is to predict people with stroke, let's check the distribution of stroke

In [None]:
# Class balance of the target. The column is passed by keyword because recent
# seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x="stroke", data=df)

The classes are very imbalanced, so we can't feed this dataset directly into the models; we have to do some preprocessing first.

First, let's check how the other categorical features are distributed with respect to the stroke column

In [None]:

# Count plots of every categorical column, split by stroke outcome.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
# Walk the grid column by column (top to bottom, then left to right).
panels = list(axes.T.flat)
for ax, col in zip(panels, categorical.columns):
    sns.countplot(ax=ax, x=col, hue="stroke", data=categorical)
    # Tilt the tick labels slightly so long category names do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=10)
# The grid has more panels than there are columns to plot; hide the spares.
for ax in panels[len(categorical.columns):]:
    ax.set_visible(False)
# Lay the figure out once, after every panel has been drawn.
fig.tight_layout()
    
    


There's an anomaly in the gender data ("Other"); I'm just going to drop it since it is only a single row

After that, let's compare gender with the other features; we might learn something

In [None]:
# Remove the rows whose gender is "Other". Take an explicit copy so that the
# column assignments made on `df` later in the notebook operate on an
# independent frame rather than on a slice of the original
# (avoids pandas' SettingWithCopyWarning).
df = df[df.gender != "Other"].copy()

In [None]:
# Count plots of every categorical column, split by gender.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
# Filter out the "Other" gender once instead of on every loop iteration.
gender_plot_data = categorical[categorical.gender != "Other"]
# Walk the grid column by column (top to bottom, then left to right).
panels = list(axes.T.flat)
for ax, col in zip(panels, categorical.columns):
    sns.countplot(ax=ax, x=col, hue="gender", data=gender_plot_data)
    # Tilt the tick labels slightly so long category names do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=10)
# The grid has more panels than there are columns to plot; hide the spares.
for ax in panels[len(categorical.columns):]:
    ax.set_visible(False)
# Lay the figure out once, after every panel has been drawn.
fig.tight_layout()
    
    


From the visualization we understand that:
* There are more women than men in this data
* Most of the women do not smoke
* Nothing else can be concluded from the remaining plots

How about numerical values?

In [None]:
# Distribution of each continuous feature, one panel per column.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, col in zip(axes, numerical.columns):
    # `distplot` is deprecated; a density-normalised histogram with a KDE
    # overlay is its documented replacement.
    sns.histplot(df[col], kde=True, stat="density", ax=ax)
    

In [None]:
# Age against average glucose level, coloured by stroke outcome.
sns.scatterplot(
    data=df,
    x="age",
    y="avg_glucose_level",
    hue="stroke",
)

It's pretty hard to tell, so let's jump to the correlations between the columns

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Label-encode only the non-numeric columns. Numeric columns keep their real
# values so the correlations are computed on the actual measurements rather
# than on ordinal codes of them.
df_prep = df.copy()
for col in df_prep.select_dtypes(exclude="number").columns:
    df_prep[col] = le.fit_transform(df_prep[col])
corrs = df_prep.corr()
plt.figure(figsize=(12,8))
sns.heatmap(corrs, annot=True, cmap="Blues")

In [None]:
# Correlation of every column with the stroke target.
corrs.loc[:, "stroke"]

As expected, stroke mostly affects older people, who also tend to have higher blood pressure, which in turn is associated with heart disease.

# Preprocessing

Since the stroke classes are imbalanced, we have to balance them. One way to do that is SMOTE.

SMOTE is an oversampling method that augments the minority class with synthetic samples; you can read a more detailed explanation [here](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

Since the augmentation involves all features, we have to make sure that every feature is numerical, so let's encode all of the categorical features first

In [None]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

encode_cols = [
    "gender",
    "hypertension",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]

# Replace each listed column with its integer label codes; the shared encoder
# is refit on every column in turn.
for col in encode_cols:
    df[col] = LE.fit_transform(df[col])

In [None]:
# Number of rows per target class before resampling.
df["stroke"].value_counts()

Apply SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sampler = SMOTE(random_state=42)
# Features: everything except the target. The row identifier (`id` in the
# Kaggle CSV) is dropped as well when present, because it carries no signal and
# would otherwise be interpolated by SMOTE and passed on to the models.
X = df.drop(columns=["stroke"]).drop(columns=["id"], errors="ignore")
# NOTE(review): the data is resampled here, before the train/test split done
# later in the notebook, so synthetic rows can end up in the test set and the
# reported scores are likely optimistic. Consider splitting first and
# resampling only the training part.
X, y = sampler.fit_resample(X, df["stroke"].values)
y = pd.DataFrame({"stroke": y})
# Confirm that both classes now have the same number of rows.
sns.countplot(data=y, x="stroke")
plt.show()

Now we have balanced data!

In [None]:
# Summary of `df`. Note that `df` itself was not resampled: the balanced data
# lives in `X` and `y`.
df.info()

Now we are ready for modelling

# Modelling

Splitting data into train and test

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
# Hold out 20% of the (resampled) rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

Build pipelines that automatically preprocess the dataset whenever it is passed to a model

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline

# Shared preprocessing step for every model pipeline:
#   * one-hot encode the discrete columns,
#   * standardise the continuous columns,
#   * pass any remaining column through unchanged.
col_trans = make_column_transformer(
    (
        # Ignore categories that were not present when fitting, so scoring on
        # held-out data cannot fail on an unseen value.
        OneHotEncoder(handle_unknown="ignore"),
        ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'],
    ),
    (StandardScaler(), ['age', 'avg_glucose_level', 'bmi']),
    remainder='passthrough',
)


In [None]:
# Inspect the gender column.
df["gender"]

**Logistic Regression**

In [None]:
# Logistic regression on the preprocessed features. The iteration cap is raised
# above the default so the solver has room to converge.
logR = LogisticRegression(max_iter=1000)

pipe = make_pipeline(col_trans, logR)
# Flatten the one-column target to 1-D, which is the shape scikit-learn
# expects (avoids a DataConversionWarning).
pipe.fit(X_train, np.ravel(y_train))
# Accuracy on the held-out split.
pipe.score(X_test, np.ravel(y_test))

**Random Forest**

In [None]:
# Shallow random forest on the preprocessed features.
RF = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=2)

pipe = make_pipeline(col_trans, RF)
# Flatten the one-column target to 1-D, which is the shape scikit-learn
# expects (avoids a DataConversionWarning).
pipe.fit(X_train, np.ravel(y_train))
# Accuracy on the held-out split.
pipe.score(X_test, np.ravel(y_test))

**Gradient Boosting**

In [None]:
# Gradient-boosted trees on the preprocessed features.
GB = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=2)

pipe = make_pipeline(col_trans, GB)
# Flatten the one-column target to 1-D, which is the shape scikit-learn
# expects (avoids a DataConversionWarning).
pipe.fit(X_train, np.ravel(y_train))
# Accuracy on the held-out split.
pipe.score(X_test, np.ravel(y_test))

**Naive Bayes**

In [None]:
# Gaussian naive Bayes on the preprocessed features.
NB = GaussianNB()

pipe = make_pipeline(col_trans, NB)
# Flatten the one-column target to 1-D, which is the shape scikit-learn
# expects (avoids a DataConversionWarning).
pipe.fit(X_train, np.ravel(y_train))
# Accuracy on the held-out split.
pipe.score(X_test, np.ravel(y_test))

That's it: Gradient Boosting gives the highest accuracy (86%)

I will look for ways to improve these models. Please give me feedback or suggestions, since I'm still a beginner :)