In [58]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv


In [59]:

from imblearn.over_sampling import SMOTE


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder


from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
df = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [62]:
DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=42))
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi

In [63]:
print('Missing values: ',sum(df.isnull().sum()))

Missing values:  0


In [64]:
variables = [variable for variable in df.columns if variable not in ['id','stroke']]

In [65]:
# Drop single Other gender

no_str_only = no_str_only[(no_str_only['gender'] != 'Other')]

no_str_only.gender.value_counts()

Female    2853
Male      2007
Name: gender, dtype: int64

In [66]:
# Encoding categorical values

df['gender'] = df['gender'].replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)
df['Residence_type'] = df['Residence_type'].replace({'Rural':0,'Urban':1}).astype(np.uint8)
df['work_type'] = df['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':-1,'Never_worked':-2}).astype(np.uint8)

In [67]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [68]:
X  = df[['gender','age','hypertension','heart_disease','work_type','avg_glucose_level','bmi']]
y = df['stroke']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [70]:
# Our data is biased, we can fix this with SMOTE

oversample = SMOTE()
X_train_resh, y_train_resh = oversample.fit_resample(X_train, y_train.ravel())

In [71]:
from sklearn.model_selection import GridSearchCV

n_estimators =[32,64,100,128,175,256]
max_features = [2,3,5,7]
bootstrap = [True,False]
criterion = ["gini", "entropy"]

param_grid = {'RF__n_estimators':n_estimators,
              'RF__max_features':max_features,
              'RF__criterion': criterion,
              'RF__bootstrap':bootstrap}

In [72]:
from sklearn.model_selection import GridSearchCV

In [56]:
rf_pipeline = Pipeline(steps = [('scale',StandardScaler()),('RF',RandomForestClassifier(random_state=42))])

In [57]:
grid = GridSearchCV(rf_pipeline,param_grid, scoring="f1")

grid.fit(X_train_resh,y_train_resh)

GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('RF',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'RF__bootstrap': [True, False],
                         'RF__criterion': ['gini', 'entropy'],
                         'RF__max_features': [2, 3, 5, 7],
                         'RF__n_estimators': [32, 64, 100, 128, 175, 256]},
             scoring='f1')

In [69]:
grid.best_params_

{'RF__bootstrap': False,
 'RF__criterion': 'gini',
 'RF__max_features': 3,
 'RF__n_estimators': 256}

In [73]:
penalty = ['l1','l2', 'elasticnet']
C = [0.001, 0.003, 0.005, 0.007, 0.009, 0.01, 0.1, 1, 10, 100] 
solver = ["lbfgs", "liblinear", "saga"]

log_param_grid = {'LR__penalty': penalty, 
                  'LR__C': C, 'LR__solver':solver}

logreg = LogisticRegression()
logreg_pipeline = Pipeline(steps = [('scale',StandardScaler()),('LR',LogisticRegression())])

grid = GridSearchCV(logreg_pipeline, log_param_grid, scoring="f1")


In [74]:
grid.fit(X_train_resh,y_train_resh)

GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__C': [0.001, 0.003, 0.005, 0.007, 0.009, 0.01, 0.1,
                                   1, 10, 100],
                         'LR__penalty': ['l1', 'l2', 'elasticnet'],
                         'LR__solver': ['lbfgs', 'liblinear', 'saga']},
             scoring='f1')

In [75]:
grid.best_params_

{'LR__C': 0.005, 'LR__penalty': 'l1', 'LR__solver': 'saga'}

In [None]:
svm_param_grid = {'SVM__C': [0.1, 1, 10, 100, 1000],  
             'SVM__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
             'SVM__kernel': ['rbf']} 

svm_pipeline = Pipeline(steps = [('scale',StandardScaler()),('SVM',SVC(C=10,gamma=0.1,kernel='rbf',random_state=42))])

grid = GridSearchCV(svm_pipeline,svm_param_grid)


In [None]:
grid.fit(X_train_resh,y_train_resh)

In [None]:
grid.best_params_