In this notebook we predict the probability of a person having a stroke from the given dataset. The machine learning algorithm used for the prediction is LightGBM, with hyperparameter tuning and cross-validation. <br>
I hope you like the notebook, Happy Reading ;-D

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing the libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
# Silence all warnings for the rest of the notebook.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the stroke dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

### Exploratory Data Analysis :

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Check whether the target is balanced: percentage of rows per stroke label.
K = df.stroke.value_counts(normalize=True).mul(100)
positive_pct, negative_pct = K[1], K[0]
print("Positive outcomes : {:.2f}% \nNegative outcomes : {:.2f}%".format(positive_pct, negative_pct))

In [None]:
# Percentage of rows per stroke label, ordered by label (0, then 1) so the
# hard-coded tick names always line up: 0 -> 'No', 1 -> 'Yes'.
# value_counts() alone orders by frequency, which would silently swap the
# labels if the positive class were ever the majority.
stroke_pct = df.stroke.value_counts(normalize=True).sort_index()*100
plt.bar(['No','Yes'],stroke_pct)
plt.xlabel("Suffered Stroke (YES/NO)")
plt.ylabel("% of People")
plt.title("Check for Imbalance in target variable ")
plt.tight_layout()

As seen from the above plot, the dataset is highly imbalanced.

##### - Handling the work_type column:

In [None]:
# Number of records for each work type.
df["work_type"].value_counts()

In [None]:
# Distribution of records across work types.
# Labels and heights both come from the same value_counts() result so each bar
# carries its own category name. (unique() returns categories in order of
# appearance while value_counts() orders by frequency, so mixing the two
# mislabels the bars.)
work_type_counts = df.work_type.value_counts()
plt.bar(work_type_counts.index,work_type_counts.values)
plt.xlabel("Work Type")
plt.ylabel("Number of People")
plt.title("Check for Imbalance in work_type variable ")
plt.tight_layout()

The median age of the entries with work_type = Never_worked is 16 years, and the upper quantile of the entries with work_type = children is also 16 years, so we relabel the "Never_worked" entries as "children" and merge the two categories.

In [None]:
# Age statistics supporting the merge of "Never_worked" into "children".
children_rows = df.loc[df.work_type=='children']
teen_children_rows = children_rows.loc[(children_rows.age>=13)&(children_rows.age<=19)]
never_worked_rows = df.loc[df.work_type=='Never_worked']

print("Quantiles : \n{}".format(children_rows.age.quantile([0.25,0.50,0.75,1.0])))

print("\nNumber of entries whose work_type=children : ",children_rows.age.count())
print("Number of teenagers whose work_type=children : ",teen_children_rows.age.count())
print("Median age of entries whose work_type=Never_worked : ",never_worked_rows.age.median())

We replace work_type == Never_worked with work_type == children in those rows. In short, we merge two distinct work types into one, for the reason explained above.

In [None]:
# Relabel every "Never_worked" row as "children" with a single masked assignment.
is_never_worked = df["work_type"] == 'Never_worked'
df.loc[is_never_worked, "work_type"] = 'children'

In [None]:
# Work-type counts after the relabelling.
df["work_type"].value_counts()

##### - Handling the gender column:

In [None]:
# Number of records per gender value.
df["gender"].value_counts()

There is only one record where gender = "Other". A single record is not enough for the model to learn anything reliable about that category, so we delete it.

In [None]:
# Show the record(s) whose gender is "Other".
df[df["gender"] == 'Other']

In [None]:
# Remove the rows whose gender is "Other", then renumber the index from 0.
other_gender_index = df[df.gender=='Other'].index
df.drop(index=other_gender_index,inplace=True)
df.reset_index(drop=True,inplace=True)

In [None]:
df.shape # (rows, columns) after dropping the gender == 'Other' rows in the previous cell

In [None]:
# Keep a handle on the target column and report how many labels it holds.
Target = df["stroke"]
print(Target.shape[0])

##### - Handling the columns with object datatype

In [None]:
# Replace every object-dtype column with integer codes from a LabelEncoder
# fitted on that column alone.
from sklearn.preprocessing import LabelEncoder

object_columns = [col for col in df.columns if df[col].dtype=='object']
for col in object_columns:
    df[col] = LabelEncoder().fit_transform(df[col].values)

df.head()


##### - Handling BMI column

In [None]:
# Split the rows by whether BMI is recorded.
bmi_is_missing = df.bmi.isna()

WithoutBMI = df[bmi_is_missing]      # rows whose bmi is NaN

WithBMI = df[~bmi_is_missing]        # rows with a recorded bmi


BMI varies with age, so for every 10-year age range we take the median of the BMI values available in the WithBMI dataframe and use it to replace the NaN BMI values of the rows in the dataframe (df) that fall in that age range.

In [None]:
print(WithBMI.age.max())
AgeRanges=[(0,10),(10,20),(20,30),(30,40),(40,50),(50,60),(60,70),(70,80),(80,WithBMI.age.max())]

# Median BMI per age bracket, in the same order as AgeRanges.
# Each bracket is the half-open interval (low, high]: the upper bound is
# inclusive so that ages sitting exactly on a boundary (10, 20, ..., and the
# maximum age) contribute to a median instead of being dropped, and so that the
# brackets agree with the ones used when filling the missing values
# (age <= 10, 11-20, 21-30, ...).
Replacements=[
    WithBMI.loc[(WithBMI.age>low) & (WithBMI.age<=high)].bmi.median()
    for low,high in AgeRanges
]
Replacements

In [None]:
# Replace every missing value in the frame with the string "None" (in place).
# The later imputation step looks for this placeholder in the bmi column;
# a column that receives the string placeholder needs converting back to float afterwards.
df.fillna("None",inplace=True)

In [None]:
# Fill each missing BMI with the median BMI of the row's age bracket.
#
# The "None" placeholders (and any genuine NaN) are coerced to NaN first so the
# missing rows can be selected with one boolean mask.
bmi_numeric = pd.to_numeric(df.bmi, errors="coerce")
bmi_is_missing = bmi_numeric.isna()

# Bracket index for every row: age <= 10 -> 0, (10, 20] -> 1, ..., (70, 80] -> 7,
# age > 80 -> 8, matching the order of Replacements. Working on the numeric age
# also covers non-integer ages and ages above 90, which a per-row
# `age in range(a, b)` membership test silently skips.
bracket_upper_bounds = [10,20,30,40,50,60,70,80]
age_bracket = np.searchsorted(bracket_upper_bounds, df.age.to_numpy(dtype=float), side="left")
bracket_median = np.asarray(Replacements, dtype=float)[age_bracket]

df.loc[bmi_is_missing,"bmi"] = bracket_median[bmi_is_missing.to_numpy()]

# Number of BMI values still missing. The column is checked after numeric
# coercion, because a plain isnull() does not see leftover "None" strings.
pd.to_numeric(df.bmi, errors="coerce").isna().sum()

In [None]:
# Total number of NaN cells left in the whole frame.
# NOTE(review): string placeholders such as "None" are not NaN, so they are not counted here.
df.isna().sum().sum()

In [None]:
# Inspect the dtype of every column before modelling.
df.dtypes

Features with the object datatype cannot be fed into the ML model, so we convert the bmi column to the float datatype.

In [None]:
# Cast the bmi column to float and confirm the resulting dtype.
df["bmi"] = df["bmi"].astype(float)
print(df["bmi"].dtype)

In [None]:
# Preview the cleaned frame.
df.head(n=5)

### Model Building :

In [None]:
# Target: the stroke label.
Y = df["stroke"]
# Features: every column except the record id and the target.
# drop() returns a new frame and leaves df untouched, so this cell can be
# re-run safely (an in-place drop raises KeyError on the second run because
# the columns are already gone).
X = df.drop(columns=['id','stroke'])

In [None]:
# 70/30 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [None]:
# Handling the data imbalance by randomly oversampling the training split
# (RandomOverSampler; SMOTE is imported but not used here).

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Fixed seed so the resampled training set, and every score computed from it,
# is the same on each run.
oversample=RandomOverSampler(random_state=42)

# Only the training split is resampled; the test split keeps its original class balance.
X_train_sm,y_train_sm=oversample.fit_resample(X_train,y_train)
Count1=Counter(y_train)
Count2=Counter(y_train_sm)
print("Target counts before upsampling : {}".format(Count1))
print("Target counts after upsampling : {}".format(Count2))


We take some of the best-known classifiers and, without any hyperparameter tuning, compare them to pick the best one as a baseline.

In [None]:
# importing the Classifiers
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Default hyperparameters everywhere; only the random seed is fixed so the
# baseline scores are reproducible from run to run.
MODEL_SEED = 42

lgbm=LGBMClassifier(random_state=MODEL_SEED)
cat=CatBoostClassifier(verbose = False, random_seed=MODEL_SEED)
xgb=XGBClassifier(random_state=MODEL_SEED)
rf=RandomForestClassifier(random_state=MODEL_SEED)

# Order matters: scores collected from this list are positional (index 0 = LightGBM).
models=[lgbm,cat,xgb,rf]

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Fit each baseline model on the oversampled training data and record the
# ROC AUC of its positive-class probabilities on the test split.
RocAucScores=[]
confMatrix=[]   # not filled in this cell
for model in models:
    y_pred = model.fit(X_train_sm,y_train_sm).predict_proba(X_test)[:,1]
    RocAucScores.append(roc_auc_score(y_test,y_pred))


In [None]:
# Print the test ROC AUC of every baseline classifier, in the same order as the `models` list.
print(RocAucScores)


As we saw, LightGBM performed the best without any hyperparameter tuning, so we now try to increase its AUC score with hyperparameter tuning. We use Optuna for that purpose.

Hyperparameter Tuning using optuna

In [None]:
%%time
import lightgbm as lgb
def objective(trial,data=X_train_sm, target = y_train_sm):
    """Optuna objective: train LightGBM with sampled hyperparameters and return its ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data, target
        Training features and labels. They default to the oversampled
        training split.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on
        ``X_test`` / ``y_test``.
    """
    # NOTE(review): every trial is scored against X_test, so the test split
    # steers the hyperparameter search and test metrics reported afterwards are
    # optimistic. A validation split carved out of the training data would be
    # the cleaner choice.
    dtrain = lgb.Dataset(data, label=target)
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        # suggest_float(..., log=True) / suggest_float(...) replace the
        # deprecated suggest_loguniform / suggest_uniform calls.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 500),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 200),
        'n_jobs' :-1
    }
    model = lgb.train(param,dtrain)
    # With the binary objective, predict() yields positive-class probabilities.
    # ROC AUC is a ranking metric, so it is computed on these scores directly;
    # rounding them to 0/1 first would discard the ranking information.
    y_score = model.predict(X_test)
    return roc_auc_score(y_test,y_score)

In [None]:
%%time 
import optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Start from the best hyperparameters found by the Optuna study.
params=study.best_params
# LightGBM's name for the ROC AUC metric is 'auc'; 'roc_auc_score' is the
# scikit-learn function name and is not a LightGBM metric.
params['metric'] = 'auc'
params

In [None]:
# Candidate numbers of boosting rounds to try with the tuned hyperparameters.
Estimators=[50,60,70,80,100,120,150,180,200,210,220,250,300,400,500]

# (n_estimators, test ROC AUC) pairs, in the order the candidates were tried.
# NOTE(review): the candidates are scored on X_test, so the test split also
# drives this choice.
EstimatorsAndScores=[]
for n_trees in Estimators:
    params['n_estimators']=n_trees
    LGBM = LGBMClassifier(**params).fit(X_train_sm,y_train_sm)
    y_pred = LGBM.predict_proba(X_test)[:,1]
    auc_for_n_trees = metrics.roc_auc_score(y_test,y_pred)
    EstimatorsAndScores.append((n_trees,auc_for_n_trees))


In [None]:
# Pick the "n_estimators" value that achieved the highest score.
def take2nd(elem):
    """Return the second item of ``elem`` (the score of an (n_estimators, score) pair)."""
    return elem[1]

ranked_by_score = sorted(EstimatorsAndScores,key=take2nd,reverse=True)
BestEstimators, _best_score = ranked_by_score[0]
params['n_estimators']=BestEstimators
params

In [None]:
# Train LightGBM with the tuned hyperparameters on the oversampled training data.
lgbm = LGBMClassifier(**params)
lgbm.fit(X=X_train_sm, y=y_train_sm)

In [None]:
# Positive-class probabilities of the tuned model on the test split.
test_proba = lgbm.predict_proba(X_test)
y_pred2 = test_proba[:, 1]
y_pred2

In [None]:
# Before cross validation
# The untuned LightGBM score is RocAucScores[0] (LightGBM is the first entry of
# `models`). `y_pred` must not be used for the baseline: by this point it holds
# the predictions of the last model fitted in the n_estimators sweep, which
# already uses the tuned hyperparameters.
baseline_auc = RocAucScores[0]
tuned_auc = roc_auc_score(y_test,y_pred2)
print("The AUC score before Hyperparameter tuning : {} ".format(baseline_auc))
print("The AUC score after Hyperparameter tuning : {} ".format(tuned_auc))
print("Improvement after Hyperparameter Tuning : {}".format(tuned_auc-baseline_auc))

##### To improve the results further, we use StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, stratified folds with a fixed seed.
SEED = 1024
N_SPLITS =5
kfold = StratifiedKFold(N_SPLITS, shuffle=True, random_state=SEED)

In [None]:
# Train one model per fold and average the folds' test-set probabilities.
#
# The folds are drawn from the training split only. Splitting the full X would
# put test rows into the training folds, so the models would be evaluated on
# data they were trained on.
pred_test_full =0
lgbmodel=LGBMClassifier(**params)
for i,(train_idx,valid_idx) in enumerate(kfold.split(X_train,y_train),start=1):
    print(' Running {} of KFold {}'.format(i,kfold.n_splits))
    # split() yields positional indices, hence .iloc rather than .loc.
    xtr,xvl = X_train.iloc[train_idx],X_train.iloc[valid_idx]
    ytr,yvl = y_train.iloc[train_idx],y_train.iloc[valid_idx]
    lgbmodel.fit(xtr,ytr)
    score = roc_auc_score(yvl,lgbmodel.predict_proba(xvl)[:,1])
    print('\nROC AUC score: {}\n\n'.format(score))
    pred_test_full += lgbmodel.predict_proba(X_test)[:,1]

# Turn the running sum into the mean so the result is a probability again.
pred_test_full /= kfold.n_splits

In [None]:
# Test-set predictions accumulated over the K-fold models in the previous cell.
y_pred3=pred_test_full
y_pred3

In [None]:
# Test ROC AUC after hyperparameter tuning and cross validation.
roc_auc_score(y_true=y_test, y_score=y_pred3)

In [None]:
# The untuned LightGBM score is RocAucScores[0] (LightGBM is the first entry of
# `models`). `y_pred` must not be used for the baseline: by this point it holds
# the predictions of the last model fitted in the n_estimators sweep, which
# already uses the tuned hyperparameters.
baseline_auc = RocAucScores[0]
tuned_auc = roc_auc_score(y_test,y_pred2)
cv_auc = roc_auc_score(y_test,y_pred3)
print("The AUC score before Hyperparameter tuning : {} ".format(baseline_auc))
print("The AUC score after Hyperparameter tuning : {} ".format(tuned_auc))
print("The AUC score after Hyperparameter tuning & cross validation : {} ".format(cv_auc))
print("Improvement after Hyperparameter Tuning : {}".format(tuned_auc-baseline_auc))
print("Improvement after cross validation : {}".format(cv_auc-tuned_auc))
print("\nOverall Improvement after using Hyperparameter tuning and cross validation from the base model : {}".format(cv_auc-baseline_auc))

<h4 align="center"> ..._/\_ Thank You _/\_...</h4>

If the Notebook was good enough then kindly support me with an upvote :-D<br>
If you want to share your opinion or have any suggestions do let me know in the comments ;-D