In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the stroke-prediction CSV into a DataFrame and show it as the cell output.
data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
data

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column explicitly: an in-place replace()
# called on the `data.bmi` accessor works on an intermediate Series and is not
# guaranteed to write through to `data`.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
# Summary statistics of the frame after the BMI imputation.
data.describe()

# Visualization

In [None]:
# Class balance of the target: share of records per value of `stroke`.
# value_counts() is evaluated once and reused for both labels and slice sizes.
stroke_counts = data['stroke'].value_counts(sort=True)
labels = stroke_counts.index
sizes = stroke_counts

colors = ["lightblue", "red"]
explode = (0.05, 0)  # pull the first (most frequent) slice slightly out of the pie

plt.figure(figsize=(7, 7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)

# The chart shows the `stroke` column, so the title names it.
plt.title('Stroke Breakdown')
plt.show()

In [None]:
# Number of records per gender.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='gender', data=data, ax=ax);

In [None]:
# Two side-by-side scatter plots of average glucose level: against age (left)
# and against BMI (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
panels = [
    ('age', 'green', "Age vs. avg_glucose_level"),
    ('bmi', 'red', "bmi vs. avg_glucose_level"),
]
for panel_ax, (x_col, colour, caption) in zip(axes, panels):
    data.plot(kind='scatter', x=x_col, y='avg_glucose_level', alpha=0.5,
              color=colour, ax=panel_ax, title=caption)
plt.show()

In [None]:
sns.set(style="ticks");
pal = ["#FA5858", "#58D3F7"]

# Pairwise relationships between the columns, coloured by stroke outcome.
sns.pairplot(data, hue="stroke", palette=pal);
# A pair plot is a grid of axes, so plt.title() would caption only one of the
# subplots. suptitle() captions the whole figure; y > 1 keeps it above the grid.
plt.suptitle("stroke", y=1.02);

In [None]:
# Correlation heatmap.
# Only numeric columns are handed to corr(): the categorical columns are still
# strings at this point in the notebook and newer pandas versions raise on them
# instead of silently skipping them.
plt.figure(figsize=(15,7))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True);


In [None]:
# Subset of records with stroke == 1; the count plots in the following cells reuse it.
strok = data[data['stroke'] == 1]

# Marital status among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='ever_married', data=strok, palette='inferno', ax=ax);


In [None]:
# Work type among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='work_type', data=strok, palette='cool', ax=ax);


In [None]:
# Smoking status among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='smoking_status', data=strok, palette='autumn', ax=ax);


In [None]:
# Residence type among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Residence_type', data=strok, palette='Greens', ax=ax);

In [None]:
# Heart-disease flag among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='heart_disease', data=strok, palette='Reds', ax=ax);

In [None]:
# Hypertension flag among the stroke records.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='hypertension', data=strok, palette='Pastel2', ax=ax);

# Preprocessing

In [None]:
# Column groups consumed by the preprocessing transformers.

# categorical features
cat_features = [
    'work_type',
    'gender',
    'Residence_type',
    'smoking_status',
    'ever_married',
]

# numerical features
num_features = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [None]:
# Integer code for every category of every categorical column.
# Each column lists its categories in code order: the first one gets 0, the next 1, ...
_category_order = {
    'work_type': ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
    'gender': ['Male', 'Female'],
    'Residence_type': ['Urban', 'Rural'],
    'smoking_status': ['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
    'ever_married': ['Yes', 'No'],
}
cat_maps = {
    column: {category: code for code, category in enumerate(categories)}
    for column, categories in _category_order.items()
}

In [None]:
# Replace every categorical column by its integer code from cat_maps.
# Categories without an entry in the mapping become NaN (Series.map semantics).
# A column that is already numeric is skipped, so running this cell a second
# time does not map the integer codes again (which would turn them all into NaN).
for column, mapping in cat_maps.items():
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(mapping)

In [None]:
# Show the frame after the categorical columns were mapped to integer codes.
data

In [None]:
# Split into input data and target variable

# Order matters: later cells refer to these columns by position.
features = [
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'gender',
    'work_type',
    'smoking_status',
]

label = ['stroke']

# .copy() makes X an independent frame, so the gender imputation in the next
# cell writes to X unambiguously instead of to a slice of `data`.
X = data[features].copy()
# `label` is a list, so y is a single-column DataFrame rather than a Series.
y = data[label]

In [None]:
# Any gender still NaN (e.g. a value that has no entry in cat_maps['gender'])
# is set to 1, which is the code assigned to 'Female'.
X['gender'] = X['gender'].fillna(1)
# Remaining missing values per input column.
X.isna().sum()

In [None]:
# Hold out 20% of the rows as the test set; the split is stratified on the
# target and seeded so it is repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
"""
Synthetic Minority Oversampling Technique for Nominal and Continuous features (SMOTE-NC) for handling class imbalance.
The categorical features are identified to SMOTENC by column index so that they are not interpolated like the numerical ones

"""

from imblearn.over_sampling import SMOTENC

# SMOTENC takes the positions of the *categorical* columns, not the numerical
# ones. In `features` these are: hypertension (1), heart_disease (2),
# ever_married (3), Residence_type (4), gender (7), work_type (8) and
# smoking_status (9). The remaining columns (age, avg_glucose_level, bmi)
# are treated as continuous.
categorical_idx = [1, 2, 3, 4, 7, 8, 9]

# A fixed random_state makes the synthetic samples, and therefore every score
# computed downstream, reproducible across runs.
smote = SMOTENC(categorical_features=categorical_idx, random_state=42)
# Only the training split is resampled; the test split keeps its original class balance.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Make sure both splits are DataFrames carrying the `features` column names.
X_train, X_test = (
    pd.DataFrame(data=split, columns=features)
    for split in (X_train, X_test)
)

In [None]:
!pip install category_encoders

# Base Models + Hyperparameter Tuning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from matplotlib import pyplot
import category_encoders as ce

In [None]:
# Stratified K-Fold Cross Validation: shuffled folds with a fixed seed, shared
# by every cross-validated step below.
folds = 5
skf = StratifiedKFold(
    n_splits=folds,
    random_state=1001,
    shuffle=True,
)

In [None]:
"""
Numerical features are scaled using Standard Scaling. 
Categorical features are encoded in different ways for different models:
    > For tree-based models, catboost encoding is used
    > For linear models, one hot encoding is used
    > For KNN, only the scaled numerical features are kept; all other columns are dropped
"""


# `cols=cat_features` is passed to the category_encoders encoders explicitly.
# When `cols` is left unset they only encode object/category-typed columns, and
# the categorical features have already been mapped to integer codes, so
# without it both encoders would pass the columns through unchanged.

# Tree-based models: scaled numerics + CatBoost-encoded categoricals; the
# remaining columns are passed through untouched.
tree_mapper = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('ce', ce.CatBoostEncoder(cols=cat_features), cat_features),
    ],
    remainder='passthrough',
)

# Linear models: scaled numerics + one-hot-encoded categoricals; the remaining
# columns are passed through untouched.
linear_mapper = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('ce', ce.OneHotEncoder(cols=cat_features), cat_features),
    ],
    remainder='passthrough',
)

# KNN: scaled numerics only; remainder='drop' discards every other column.
num_mapper = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
    ],
    remainder='drop',
)

In [None]:
# Untuned candidate pipelines. The tuning loop zips this list with
# `model_grids`, so the order here must match the order of the grids.
models = [
    Pipeline([('mapper', linear_mapper), ('classifier', LogisticRegression())]),
    Pipeline([('mapper', num_mapper), ('classifier', KNeighborsClassifier())]),
    # The two naive Bayes models have no mapper step and see the features as they are.
    Pipeline([('classifier', GaussianNB())]),
    Pipeline([('classifier', BernoulliNB())]),
    Pipeline([('mapper', tree_mapper), ('classifier', DecisionTreeClassifier())]),
    Pipeline([('mapper', tree_mapper), ('classifier', RandomForestClassifier())]),
    Pipeline([('mapper', tree_mapper), ('classifier', XGBClassifier())]),
    Pipeline([('mapper', tree_mapper), ('classifier', GradientBoostingClassifier())]),
    Pipeline([('mapper', linear_mapper), ('classifier', SVC(probability=True))]),
    Pipeline([('mapper', tree_mapper), ('classifier', AdaBoostClassifier(
        # The base tree is passed positionally: scikit-learn 1.2 renamed this
        # keyword from `base_estimator` to `estimator`, but it is the first
        # positional parameter under either name.
        DecisionTreeClassifier(criterion='entropy',
                               max_depth=None,
                               max_features=None,
                               min_samples_leaf=1,
                               min_samples_split=2,
                               random_state=0)))]),
]

# Hyperparameter search spaces, one entry per pipeline in `models` and in the
# same order. Every key is prefixed with `classifier__` so it is routed to the
# pipeline's final step.
model_grids = [
               [{'classifier__C':[1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1, 5, 1e1, 5e1, 1e2, 5e2, 1e3],
                 'classifier__random_state':[0]}],                                 #logistic regression

               [{'classifier__n_neighbors':[5,7,9,11, 13, 15, 17, 19],
                 'classifier__metric': ['euclidean', 'manhattan', 'minkowski']}],  #KNN

               [{'classifier__var_smoothing': [1e-10, 1e-09, 1e-8, 1e-7]}],        #GaussianNB

               [{'classifier__alpha': [1e-2, 1e-1, 1, 1e1, 1e2]}],                 #BernoulliNB

               # min_samples_split starts at 2: an integer value of 1 is rejected
               # by scikit-learn, so every sampled candidate using it would fail.
               [{'classifier__criterion':['gini','entropy'],
                 'classifier__random_state':[0],
                 'classifier__max_depth' : [3, 5, 8, 10, 15, None],
                 'classifier__min_samples_split' : [2,5,10,15,30],
                 'classifier__min_samples_leaf': [1,2,5,10],
                 'classifier__max_features': ['log2', 'sqrt', None]}],             #Decision Tree

               [{'classifier__criterion':['gini','entropy'],
                 'classifier__n_estimators': [1000],
                 'classifier__random_state':[0],
                 'classifier__max_depth' : [3, 5, 8, 10, 15, None],
                 'classifier__min_samples_split' : [2,5,10,15,30],
                 'classifier__min_samples_leaf': [1,2,5,10],
                 'classifier__max_features': ['log2', 'sqrt', None]}],             #Random Forest

               # No `criterion` entry: it is not an XGBoost parameter, so searching
               # over it only doubled the space with identical models.
               [{'classifier__n_estimators':[1000],
                 'classifier__random_state':[0],
                 'classifier__max_depth': [3, 5, 8, 10, 15, 30],
                 'classifier__min_child_weight': [2,4,6,8,10],
                 'classifier__gamma': [0, 0.1, 0.2, 0.3],
                 'classifier__reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
                 'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9],
                 'classifier__eta': [0.1, 0.2, 0.3, 0.4, 0.5]}],                   #XGBoost

               # max_features uses the None object (the string 'None' is not a valid
               # value). 'auto' is omitted: for a gradient boosting classifier it
               # meant the same as 'sqrt' and newer scikit-learn no longer accepts it.
               [{'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
                 'classifier__n_estimators': [1000],
                 'classifier__random_state':[0],
                 'classifier__max_depth' : [5, 8, 15, None],
                 'classifier__min_samples_split' : [2,5,10],
                 'classifier__min_samples_leaf': [1,2,5,10],
                 'classifier__max_features': ['log2', 'sqrt', None]}],             #Gradient Boosting Decision Tree

               [{'classifier__C':[1e-1, 1, 1e1] ,
                 'classifier__random_state':[0],
                 'classifier__kernel': ['rbf', 'poly']
                }],                                                                #SVM

               [{'classifier__n_estimators' : [800, 1000, 1200],
                 'classifier__learning_rate' : [1e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1, 1e1],
                 'classifier__random_state':[0]}]                                  #AdaBoost
]
          

In [None]:
# Hyperparameter Tuning. Random Search of 100 iterations is used.
# The search is skipped by default because it refits every pipeline many times;
# set RUN_TUNING = True to run it.
RUN_TUNING = False

if RUN_TUNING:
    for pipeline, param_grid in zip(models, model_grids):
        # random_state fixes which candidates are sampled, so reruns are comparable.
        search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid,
                                    n_iter=100, scoring='f1_weighted', cv=skf,
                                    random_state=0)
        search.fit(X_train, y_train)
        # steps[-1] is the (name, estimator) pair of the pipeline's final step.
        print('{}:\nBest F1 : {:.4f}'.format(pipeline.steps[-1], search.best_score_))
        print('Best Parameters : ', search.best_params_)
        print('')
        print('----------------')
        print('')

# Testing tuned models

In [None]:
# Pipelines with fixed hyperparameters (presumably the values selected by the
# random search; the search output is not part of this notebook - TODO confirm).
models_tuned = [
    Pipeline([('mapper', linear_mapper),
              ('classifier', LogisticRegression(random_state=0))]),

    Pipeline([('mapper', num_mapper),
              ('classifier', KNeighborsClassifier(n_neighbors=5, metric='manhattan'))]),

    # The naive Bayes models have no mapper step and see the features as they are.
    Pipeline([('classifier', GaussianNB(var_smoothing=1e-7))]),
    Pipeline([('classifier', BernoulliNB(alpha=100))]),

    Pipeline([('mapper', linear_mapper),
              ('classifier', SVC(C=1, random_state=0, probability=True))]),

    Pipeline([('mapper', tree_mapper),
              ('classifier', DecisionTreeClassifier(criterion='entropy',
                                                    max_depth=None,
                                                    max_features=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    random_state=0))]),

    Pipeline([('mapper', tree_mapper),
              ('classifier', RandomForestClassifier(n_estimators=1000,
                                                    criterion='entropy',
                                                    max_depth=None,
                                                    max_features='sqrt',
                                                    min_samples_leaf=1,
                                                    min_samples_split=10,
                                                    random_state=0))]),

    Pipeline([('mapper', tree_mapper),
              ('classifier', AdaBoostClassifier(
                  # Base tree passed positionally: scikit-learn 1.2 renamed this
                  # keyword from `base_estimator` to `estimator`, but it is the
                  # first positional parameter under either name.
                  DecisionTreeClassifier(criterion='gini',
                                         max_depth=30,
                                         max_features='log2',
                                         min_samples_leaf=5,
                                         min_samples_split=15,
                                         random_state=0),
                  learning_rate=0.1,
                  n_estimators=500))]),

    # `criterion` is not an XGBoost parameter (it only produced an
    # "unused parameter" warning), so it is not passed.
    Pipeline([('mapper', tree_mapper),
              ('classifier', XGBClassifier(eta=0.1,
                                           max_depth=8,
                                           n_estimators=500,
                                           random_state=0))]),

    # max_features='sqrt' is what 'auto' meant for a gradient boosting
    # classifier; unlike 'auto' it is also accepted by newer scikit-learn.
    Pipeline([('mapper', tree_mapper),
              ('classifier', GradientBoostingClassifier(n_estimators=1000,
                                                        learning_rate=0.2,
                                                        max_depth=8,
                                                        min_samples_split=10,
                                                        min_samples_leaf=1,
                                                        max_features='sqrt',
                                                        random_state=0))]),
]


In [None]:
# Fit every tuned pipeline on the training split, score it on the held-out
# test split, and collect one summary row per model in `lst_1_tuned`.
lst_1_tuned = []

for model in models_tuned:
    model_name = type(model[-1]).__name__  # class name of the pipeline's final step

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # K-Fold validation on the training split (refits clones of the pipeline).
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=skf)

    test_acc = accuracy_score(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Probability of the positive class. ROC AUC is computed from these scores:
    # computing it from the hard 0/1 predictions reduces the curve to one point.
    predicted_probab = model.predict_proba(X_test)[:, 1]
    roc = roc_auc_score(y_test, predicted_probab)

    fpr, tpr, _ = roc_curve(y_test, predicted_probab)
    pyplot.plot(fpr, tpr, marker='.', label=model_name)
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.legend()
    pyplot.show()

    print(model_name, ':')

    print('Accuracy Score: {:.4f}'.format(test_acc))
    print('')

    print("K-Fold Validation Mean Accuracy: {:.4f} %".format(accuracies.mean()*100))
    print('')

    print("Classification report: ")
    print(cr)
    print('')

    print("Confusion matrix: ")
    print(cm)
    print('')

    print('Precision Score: {:.4f}'.format(precision))
    print('')

    print('Recall Score: {:.4f}'.format(recall))
    print('')

    print('F1 score: {:.4f}'.format(f1))
    print('')

    print('ROC AUC: {:.4f}'.format(roc))

    print('-----------------------------------')
    print('')

    # Row layout: model, cross-val accuracy, test accuracy, precision, recall, F1.
    lst_1_tuned.append([model_name, accuracies.mean(), test_acc, precision, recall, f1])

In [None]:
# One row per tuned model, best weighted F1 first.
df_tuned = (
    pd.DataFrame(
        lst_1_tuned,
        columns=['Model', 'Cross-val acc', 'Test Accuracy', 'Precision', 'Recall', 'F1'],
    )
    .sort_values(by=['F1'], ascending=False)
)
df_tuned

# Ensembling

In [None]:
from sklearn.ensemble import StackingClassifier, VotingClassifier, BaggingClassifier

def get_stacking_model():
    """Build the (unfitted) stacking ensemble.

    Level 0 consists of seven pipelines: logistic regression, KNN, SVM, random
    forest, AdaBoost, XGBoost and gradient boosting. Their predictions are
    combined by a random-forest meta learner. The pipelines reuse the
    module-level ``linear_mapper``, ``num_mapper`` and ``tree_mapper``
    transformers, and the ensemble is cross-validated with ``skf``.

    Returns:
        StackingClassifier: the configured stacking classifier.
    """
    # Base models as (name, estimator) tuples.
    # Gaussian NB, Bernoulli NB and the single decision tree are not included
    # as base learners (their entries were commented out in an earlier revision).
    level0 = [
        ('Logistic Regression',
         Pipeline([('mapper', linear_mapper),
                   ('classifier', LogisticRegression(random_state=0))])),

        ('KNN',
         Pipeline([('mapper', num_mapper),
                   ('classifier', KNeighborsClassifier(n_neighbors=5, metric='manhattan'))])),

        ('SVM',
         Pipeline([('mapper', linear_mapper),
                   ('classifier', SVC(C=1, random_state=0, probability=True))])),

        ('Random Forest',
         Pipeline([('mapper', tree_mapper),
                   ('classifier', RandomForestClassifier(n_estimators=1000,
                                                         criterion='entropy',
                                                         max_depth=None,
                                                         max_features='sqrt',
                                                         min_samples_leaf=1,
                                                         min_samples_split=10,
                                                         random_state=0))])),

        ('AdaBoost',
         Pipeline([('mapper', tree_mapper),
                   ('classifier', AdaBoostClassifier(
                       # Base tree passed positionally: scikit-learn 1.2 renamed
                       # this keyword from `base_estimator` to `estimator`, but it
                       # is the first positional parameter under either name.
                       DecisionTreeClassifier(criterion='gini',
                                              max_depth=30,
                                              max_features='log2',
                                              min_samples_leaf=5,
                                              min_samples_split=15,
                                              random_state=0),
                       learning_rate=0.1,
                       n_estimators=500))])),

        # `criterion` is not an XGBoost parameter, so it is not passed.
        ('XGBoost',
         Pipeline([('mapper', tree_mapper),
                   ('classifier', XGBClassifier(eta=0.1,
                                                max_depth=8,
                                                n_estimators=500,
                                                random_state=0))])),

        # max_features='sqrt' is what 'auto' meant for a gradient boosting
        # classifier; unlike 'auto' it is also accepted by newer scikit-learn.
        ('GBT',
         Pipeline([('mapper', tree_mapper),
                   ('classifier', GradientBoostingClassifier(n_estimators=1000,
                                                             learning_rate=0.2,
                                                             max_depth=8,
                                                             min_samples_split=10,
                                                             min_samples_leaf=1,
                                                             max_features='sqrt',
                                                             random_state=0))])),
    ]

    # Meta learner (a KNeighborsClassifier(n_neighbors=7) was an earlier,
    # commented-out alternative).
    level1 = RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=0)

    # Stacking ensemble; base-model predictions for the meta learner are
    # produced with the shared stratified folds.
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=skf)
    return model

In [None]:
# Fit the stacking ensemble on the training split and score it on the test split.
# The ensemble is not cross-validated here; only the hold-out metrics are computed.
model = get_stacking_model()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics; precision, recall and F1 are weighted by class support.
test_acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report: each section is followed by an empty line.
print('Stacking Ensemble:')
print('Accuracy Score: {:.4f}'.format(test_acc), end='\n\n')
print("Classification report: ", cr, '', sep='\n')
print("Confusion matrix: ", cm, '', sep='\n')
print('Precision Score: {:.4f}'.format(precision), end='\n\n')
print('Recall Score: {:.4f}'.format(recall), end='\n\n')
print('F1 score: {:.4f}'.format(f1))
print('-----------------------------------', end='\n\n')

In [None]:
# Summary row for the stacking ensemble, in the same column order as the
# per-model rows: model, cross-val accuracy, test accuracy, precision, recall, F1.
list_stack = [
    "Stacking ensemble",
    np.nan,  # Only testing ensemble on test set
    test_acc,
    precision,
    recall,
    f1,
]

# final_list is the same list object as lst_1_tuned, so the summary cell that
# reads lst_1_tuned sees the new row. A stacking row left over from an earlier
# run of this cell is removed first, so re-running does not duplicate it.
final_list = lst_1_tuned
final_list[:] = [row for row in final_list if row[0] != list_stack[0]]
final_list.append(list_stack)

In [None]:
# Final comparison table (tuned models plus the stacking ensemble), best weighted F1 first.
summary = pd.DataFrame(
    lst_1_tuned,
    columns=['Model', 'Cross-val acc', 'Test Accuracy', 'Precision', 'Recall', 'F1'],
)
df_tuned = summary.sort_values(by=['F1'], ascending=False)
df_tuned