# Stroke prediction

My goal was to improve the very low F1 scores and to compensate for the class imbalance in the dataset. I'm a beginner, so any feedback is much appreciated!

### Import dataset

In [None]:
import numpy as np 
import pandas as pd

import os
# List every file available under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


# Load the stroke dataset into a DataFrame.
df = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

### Data analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

In [None]:
# Preview the first ten rows of the raw data.
df.head(10)

In [None]:
# Number of rows per value of the target column `stroke`.
df.stroke.value_counts() # Dataset is imbalanced

### Data cleansing

In [None]:
# Remove the `id` column so it is not used as a feature later on.
df = df.drop(columns="id")
# Shuffle the rows. The fixed seed makes the shuffle repeatable; without it
# the row order changes on every run, so the seeded train/test split further
# down would still select different rows each time.
df = df.sample(frac=1, random_state=42)

#### Treatment of null values


In [None]:

# Treat the 'Unknown' smoking category as missing so it is imputed below.
# Results are assigned back to the column: an in-place call on `df[col]`
# operates on a column selection (chained assignment), which pandas
# deprecates and which leaves `df` unchanged under Copy-on-Write.
df['smoking_status'] = df['smoking_status'].replace('Unknown', np.nan)
# Numeric gaps are filled with the column mean.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
# Categorical gaps are filled with the most frequent category.
df['smoking_status'] = df['smoking_status'].fillna(df['smoking_status'].mode()[0])
# NOTE(review): the mean and mode are computed on the full dataset, i.e.
# before the train/test split, so test rows influence the imputed values.

#df.dropna(inplace=True)


## Encoding

### Label encoding for categorical features with 2 values

In [None]:

from sklearn.preprocessing import LabelEncoder

# Two-valued categorical columns are replaced by integer codes.
label_features = ['ever_married','Residence_type']

le = LabelEncoder()
for column in label_features:
    # The shared encoder is refitted for each column in turn.
    df[column] = le.fit_transform(df[column])


### One hot encoding for categorical features with >2 values

In [None]:
ohe_features = ['gender','work_type','smoking_status']

# Build one indicator frame per multi-valued categorical column. The first
# category of each column is dropped, so k categories yield k-1 columns.
dummy_frames = [
    pd.get_dummies(
        df[column].astype('category'),
        prefix = column + '_encoded',
        drop_first=True,
    )
    for column in ohe_features
]

# Swap the original categorical columns for their indicator columns,
# appended in the same order as `ohe_features`.
df = pd.concat([df.drop(columns=ohe_features), *dummy_frames], axis=1)


## Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column before splitting.
features = df.drop('stroke', axis=1)
target = df['stroke']

# Hold out 33% of the rows for testing. `stratify` keeps the proportion of
# stroke / no-stroke rows the same in both partitions, which matters because
# the target is imbalanced and a purely random split could leave the test
# set with very few positive cases.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
    stratify=target,
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(train_x)
train_x, test_x = scaler.transform(train_x), scaler.transform(test_x)

### Oversampling


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# Oversample the training partition only; the test partition is left
# untouched. The fixed seed makes the resampling repeatable.
sm = SMOTE(random_state=2)
train_x, train_y = sm.fit_resample(train_x, train_y)
# Alternative kept for reference: plain random oversampling.
# NOTE(review): if re-enabled, rename the variable -- `os` would rebind the
# imported `os` module.
#os = RandomOverSampler(sampling_strategy = 1)
#train_x, train_y = os.fit_resample(train_x, train_y)



## Test models

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, accuracy_score

# Support-vector classifier with scikit-learn's default settings.
svc = svm.SVC()
svc.fit(train_x, train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = svc.predict(test_x)
# ROC AUC measures how well the model ranks samples, so it needs a continuous
# score. Passing the 0/1 labels would reduce the curve to a single threshold.
y_score = svc.decision_function(test_x)

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('SVM\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights inversely proportional to class
# frequency.
log = LogisticRegression(class_weight='balanced')
log.fit(train_x, train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = log.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = log.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('Logistic Regression\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
gnb.fit(train_x,train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = gnb.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = gnb.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('Gaussian Naive Bayes\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: split criterion x maximum tree depth.
tree_para = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
# 5-fold cross-validated search; with no `scoring` argument the candidates
# are ranked by the estimator's default score (accuracy for classifiers).
# NOTE(review): `train_x` has already been oversampled at this point, so the
# validation folds contain synthetic samples; consider resampling inside the
# cross-validation (e.g. an imblearn Pipeline) for an unbiased selection.
tree = GridSearchCV(DecisionTreeClassifier(random_state=0,class_weight='balanced'), tree_para, cv=5)
tree.fit(train_x, train_y)

# Predictions come from the best estimator, refitted on the full training set.
y_pred = tree.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = tree.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('Decision Tree with Grid Search\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest whose class weights are recomputed on each tree's bootstrap
# sample.
rfc = RandomForestClassifier(random_state=0,class_weight='balanced_subsample')
rfc.fit(train_x,train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = rfc.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = rfc.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('Random Forest\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees; `eval_metric='error'` selects the classification
# error rate as XGBoost's evaluation metric.
xgb = XGBClassifier(eval_metric='error',use_label_encoder=False)
xgb.fit(train_x,train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = xgb.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = xgb.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('XGB\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

# imbalanced-learn's balanced random forest with a fixed seed.
brf = BalancedRandomForestClassifier(random_state=42)
brf.fit(train_x,train_y)

# Hard class labels feed the confusion matrix, accuracy and per-class F1.
y_pred = brf.predict(test_x)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (column 1) rather than the thresholded 0/1 labels.
y_score = brf.predict_proba(test_x)[:, 1]

f1 = f1_score(test_y, y_pred,average=None)  # one F1 value per class
roc = roc_auc_score(test_y, y_score)
cm = confusion_matrix(test_y, y_pred)

print('Balanced Random Forest\n')
print('Confusion matrix: \n',cm,'\n')
print('Accuracy Score: {:.5f} \n'.format(accuracy_score(test_y, y_pred)))
print('ROC AUC Score: {:.5f} \n'.format(roc))
print('F1: {:.5f} {:.5f} \n'.format(f1[0], f1[1]))


# Feel free to give your feedback in the comments! :)
