In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every
# file found, so the available dataset files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import EnsembleVoteClassifier
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
# Load the stroke dataset CSV into a DataFrame and preview its first rows.
df = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df.head()

In [None]:
# Drop the ID column. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after 'id' is
# already gone no longer raises a KeyError.
df = df.drop(columns='id', errors='ignore')
df.isnull().sum()                    # Count of null values per column

**We can see that 201 records do not have BMI values. Removing those records is not a good idea because they make up around 4% of our total data.
Let's plot the correlation matrix and check which variable has the strongest correlation with BMI.**

In [None]:
# Pairwise correlations between the numeric columns only. The frame still
# holds string columns at this point, and pandas >= 2.0 raises on them instead
# of silently skipping them, so the numeric subset is selected explicitly.
# Older pandas versions skipped those columns by default, so the matrix is unchanged.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

**It can be observed that AGE has the strongest correlation with BMI (0.33). So I will split age into five groups and compute the mean BMI of each age group. For the 201 records with a missing BMI, I will assign the mean BMI of the record's age group.
In other words, the NULL BMI values are replaced with the mean BMI of the matching age group. Filling them in is better than dropping the records, because otherwise we may lose some important insights.**

In [None]:
# Integer codes 1..5 used to name five equal-width age bins, in ascending age order.
labels = list(range(1, 6))
# Assign each record the code of the age bin it falls into.
df["age_mean"] = pd.cut(df["age"], bins=5, labels=labels)
# Number of records in each age bin.
df["age_mean"].value_counts()

In [None]:
# Mean BMI of the records in each age bin (missing BMI values are skipped by mean()).
grp_bmi = df["bmi"].groupby(df["age_mean"]).mean()
grp_bmi

**Replacing missing BMI values with the mean BMI of the record's age group (`age_mean`).**

In [None]:
def bmi_val(cols, group_means=None):
    """Return the BMI for one record, imputing it from the age group when missing.

    Parameters
    ----------
    cols : sequence
        The record's values in the order ``(bmi, age_mean)``. A pandas Series
        (as passed by ``DataFrame.apply(..., axis=1)``), list or tuple all work.
    group_means : mapping, optional
        Maps an age-group code to the BMI used for imputation (e.g. a dict).
        Defaults to the rounded per-group means hard-coded for groups 1-5.

    Returns
    -------
    The original ``bmi`` when it is present; otherwise the mean BMI of the
    record's age group; NaN when the group is missing or not in ``group_means``.
    """
    if group_means is None:
        group_means = {1: 20.7, 2: 28.6, 3: 31.4, 4: 31.6, 5: 29.4}

    # Unpack by iteration rather than cols[0] / cols[1]: integer keys on a
    # label-indexed Series are deprecated positional lookups in recent pandas.
    bmi, age_mean = list(cols)[:2]

    if not pd.isnull(bmi):
        return bmi
    if pd.isnull(age_mean):
        return np.nan
    # Explicit NaN (instead of an implicit None) for an unrecognised group.
    return group_means.get(age_mean, np.nan)
# Row by row, keep an existing BMI or fill a missing one via bmi_val.
df["bmi"] = df.loc[:, ["bmi", "age_mean"]].apply(bmi_val, axis="columns")

In [None]:
# Re-count missing values per column after the BMI imputation.
df.isna().sum()

**Splitting data into Train and Test:**

In [None]:
# Feature matrix: the five numeric predictors; target: the stroke flag.
x = df.loc[:, ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']]
y = df.loc[:, 'stroke']

# Hold out 25% of the records for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y - confirm whether the class
# balance of `stroke` warrants stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=3,
)

# **Machine Learning Models:**
Five machine learning models are used for this dataset. At the end, we will compare the results of each model and try to find the best model for this dataset.

## **1. KNN (K-Nearest Neighbors Algorithm)**

This model is hyperparameter-tuned. The grid search tries `n_neighbors` values from 1 to 30 and selects the value that gives the best cross-validated result.

In [None]:
# Candidate neighbourhood sizes: every integer from 1 to 30.
k_neigh = np.arange(1, 31)

knn = KNeighborsClassifier()
hyperParam = [dict(n_neighbors=k_neigh)]

# 5-fold cross-validated grid search over n_neighbors, fitted on the training split.
gsv = GridSearchCV(estimator=knn, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(x_train, y_train)
# Predict the held-out split with the best estimator found by the search.
knn_pred = best_model.best_estimator_.predict(x_test)

print("Best HyperParameter: ", gsv.best_params_)
# The score printed here is computed on the held-out test split.
print("Best Accuracy:", best_model.score(x_test, y_test))

**Confusion Matrix:**
Plotting confusion matrix for KNN. We can find results from confusion matrix:

In [None]:
# Draw the confusion matrix of the fitted grid search on the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the scikit-learn version this notebook targets.
plot_confusion_matrix(estimator=gsv, X=x_test, y_true=y_test)

# The same matrix as raw counts, built from the KNN test predictions.
conf_metr = metrics.confusion_matrix(y_true=y_test, y_pred=knn_pred)
print("Confusion Matrix: \n {}".format(conf_metr))

# Per-class report followed by the headline metrics.
print(metrics.classification_report(y_true=y_test, y_pred=knn_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=knn_pred))
print("Recall/Sensitivity/True Positive Rate:", metrics.recall_score(y_true=y_test, y_pred=knn_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=knn_pred))

In [None]:
# ROC curve of the fitted KNN grid search, evaluated on the test split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2 - confirm the scikit-learn version this notebook targets.
metrics.plot_roc_curve(estimator=gsv, X=x_test, y=y_test)

## **2. Logistic Regression:**
This model is hyperparameter-tuned. LogisticRegression uses the 'liblinear' solver, and a few values of C are tried.

In [None]:
# Candidate values of the C parameter tried by the grid search.
c_val = [0.001, 0.01, 0.1, 0.5, 1.0]

logr = LogisticRegression(solver='liblinear')
hyperParam = [dict(C=c_val)]

# 5-fold cross-validated grid search over C, fitted on the training split.
gsv = GridSearchCV(estimator=logr, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(x_train, y_train)
# Predict the held-out split with the best estimator found by the search.
logr_pred = best_model.best_estimator_.predict(x_test)

print("Best HyperParameter: ", gsv.best_params_)
# The score printed here is computed on the held-out test split.
print("Best Accuracy:", best_model.score(x_test, y_test))

**Confusion Matrix:**

In [None]:
# Draw the confusion matrix of the fitted grid search on the test split.
plot_confusion_matrix(estimator=gsv, X=x_test, y_true=y_test)

# The same matrix as raw counts, built from the logistic-regression test predictions.
conf_metr = metrics.confusion_matrix(y_true=y_test, y_pred=logr_pred)
print("Confusion Matrix: \n {}".format(conf_metr))

# Per-class report followed by the headline metrics.
print(metrics.classification_report(y_true=y_test, y_pred=logr_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=logr_pred))
print("Recall/Sensitivity/True Positive Rate:", metrics.recall_score(y_true=y_test, y_pred=logr_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=logr_pred))

In [None]:
# ROC curve of the fitted logistic-regression grid search, evaluated on the test split.
metrics.plot_roc_curve(estimator=gsv, X=x_test, y=y_test)

## **3. Decision Tree:**
This model is hyperparameter-tuned. The unpruned tree is the basic model, whereas the pruned tree is the one with tuned parameters.

***Unpruned Tree:***

In [None]:
# Baseline tree with no depth limit. The fixed seed makes tie-breaking between
# equally good splits repeatable, so this accuracy is the same on every
# Restart & Run All (the seed value matches the one used for the data split).
dtree_up = DecisionTreeClassifier(random_state=3)
dtree_up.fit(x_train, y_train)                  # Fitting model with x_train and y_train
dtree_pred_up = dtree_up.predict(x_test)        # Predicting the results
print("Accuracy is: ",metrics.accuracy_score(y_test, dtree_pred_up))

***Pruned Tree: Hyperparameter Tuning***

In [None]:
# Candidate depth limits for the pruned tree: 1 to 19.
depth = np.arange(1, 20, 1)

# Fixed seed so the grid search picks the same depth on every run; without it
# the comparison against the unpruned tree can change between runs.
dtree_pr = DecisionTreeClassifier(random_state=3)
hyperParam = [{'max_depth':depth}]

# 5-fold cross-validated grid search over max_depth.
gsv = GridSearchCV(dtree_pr,hyperParam,cv=5,verbose=1)
best_model = gsv.fit(x_train,y_train)                          # Fitting model with x_train and y_train
dtree_pred_pr = best_model.best_estimator_.predict(x_test)     # Predicting the results

print("Best HyperParameter: ", gsv.best_params_)
# The score printed here is computed on the held-out test split.
print("Best Accuracy:",best_model.score(x_test,y_test))

**Confusion Matrix:** Pruned Tree (because the pruned tree has better accuracy than the unpruned tree)

In [None]:
# Draw the confusion matrix of the fitted grid search on the test split.
plot_confusion_matrix(estimator=gsv, X=x_test, y_true=y_test)

# The same matrix as raw counts, built from the pruned-tree test predictions.
conf_metr = metrics.confusion_matrix(y_true=y_test, y_pred=dtree_pred_pr)
print("Confusion Matrix: \n {}".format(conf_metr))

# Per-class report followed by the headline metrics.
print(metrics.classification_report(y_true=y_test, y_pred=dtree_pred_pr))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=dtree_pred_pr))
print("Recall/Sensitivity/True Positive Rate:", metrics.recall_score(y_true=y_test, y_pred=dtree_pred_pr))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=dtree_pred_pr))

In [None]:
# ROC curve of the fitted pruned-tree grid search, evaluated on the test split.
metrics.plot_roc_curve(estimator=gsv, X=x_test, y=y_test)

## **4. Random Forest:**
This is a hyperparameter-tuned model. `n_estimators` is used as the tuning parameter, while `max_depth` is fixed at 3.

In [None]:
# Candidate forest sizes tried by the grid search.
estimators = [10, 50, 80, 100, 150, 200, 250, 300]

# Shallow trees (depth 3) with a fixed seed; only the number of trees is tuned.
rf = RandomForestClassifier(random_state=5, max_depth=3)
hyperParam = [dict(n_estimators=estimators)]

# 5-fold cross-validated grid search over n_estimators, fitted on the training split.
gsv = GridSearchCV(estimator=rf, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(x_train, y_train)
# Predict the held-out split with the best estimator found by the search.
rf_pred = best_model.best_estimator_.predict(x_test)

print("Best HyperParameter: ", gsv.best_params_)
# The score printed here is computed on the held-out test split.
print("Best Accuracy:", best_model.score(x_test, y_test))

**Confusion Matrix:**

In [None]:
# Draw the confusion matrix of the fitted grid search on the test split.
plot_confusion_matrix(estimator=gsv, X=x_test, y_true=y_test)

# The same matrix as raw counts, built from the random-forest test predictions.
conf_metr = metrics.confusion_matrix(y_true=y_test, y_pred=rf_pred)
print("Confusion Matrix: \n {}".format(conf_metr))

# Per-class report followed by the headline metrics.
print(metrics.classification_report(y_true=y_test, y_pred=rf_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=rf_pred))
print("Recall/Sensitivity/True Positive Rate:", metrics.recall_score(y_true=y_test, y_pred=rf_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=rf_pred))

In [None]:
# ROC curve of the fitted random-forest grid search, evaluated on the test split.
metrics.plot_roc_curve(estimator=gsv, X=x_test, y=y_test)

# **5. SVC (Support Vector Classifier)**
This is a hyperparameter-tuned model. Several kernels are tried during tuning.

In [None]:
# Kernel functions tried by the grid search.
kernels = ['rbf', 'linear', 'poly', 'sigmoid']

svc = SVC()
hyperParam = [dict(kernel=kernels)]

# 5-fold cross-validated grid search over the kernel, fitted on the training split.
gsv = GridSearchCV(estimator=svc, param_grid=hyperParam, cv=5, verbose=1)
best_model = gsv.fit(x_train, y_train)
# Predict the held-out split with the best estimator found by the search.
svc_pred = best_model.best_estimator_.predict(x_test)

print("Best HyperParameter: ", gsv.best_params_)
# The score printed here is computed on the held-out test split.
print("Best Accuracy :", best_model.score(x_test, y_test))

**Confusion Matrix:**

In [None]:
# Draw the confusion matrix of the fitted grid search on the test split.
plot_confusion_matrix(estimator=gsv, X=x_test, y_true=y_test)

# The same matrix as raw counts, built from the SVC test predictions.
conf_metr = metrics.confusion_matrix(y_true=y_test, y_pred=svc_pred)
print("Confusion Matrix: \n {}".format(conf_metr))

# Per-class report followed by the headline metrics.
print(metrics.classification_report(y_true=y_test, y_pred=svc_pred))
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=svc_pred))
print("Recall/Sensitivity/True Positive Rate:", metrics.recall_score(y_true=y_test, y_pred=svc_pred))
print("Precision:", metrics.precision_score(y_true=y_test, y_pred=svc_pred))

In [None]:
# ROC curve of the fitted SVC grid search, evaluated on the test split.
metrics.plot_roc_curve(estimator=gsv, X=x_test, y=y_test)

# **Bagging with all classifiers using Cross Validation:**
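Now we will cross-validate all 5 models used before, each on its own and wrapped in a BaggingClassifier. After that, a VotingClassifier combines the 5 models by majority vote so that its result can be compared with the individual models.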

In [None]:
# Creating classifiers. The tree-based models are seeded so the cross-validated
# scores printed below are the same on every Restart & Run All.
knn = KNeighborsClassifier()
lg = LogisticRegression()
dt = DecisionTreeClassifier(random_state=3)
rf = RandomForestClassifier(random_state=3)
svc = SVC()

clf_array = [knn, lg, dt, rf,svc]

# An integer max_features must not exceed the number of feature columns.
# The previous hard-coded 10 is larger than the 5 columns of `x`, which makes
# every bagging fit raise a ValueError (surfacing as NaN scores or an error),
# so the value is capped at the real column count.
bagging_max_features = min(10, x.shape[1])

for clf in clf_array:
    # 10-fold cross-validated accuracy of the plain classifier.
    cc_scores = cross_val_score(clf, x, y, cv=10, n_jobs=-1)
    # The same classifier bagged: each member is trained on 25% of the samples.
    bagging_clf = BaggingClassifier(clf, max_samples=0.25, max_features=bagging_max_features, random_state=3)
    bagging_scores = cross_val_score(bagging_clf, x, y, cv=10, n_jobs=-1)

    print("Accuracy of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__,cc_scores.mean(), cc_scores.std()))
    print("Accuracy of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__,bagging_scores.mean(), bagging_scores.std()))

In [None]:
# (label, estimator) pairs shared by the voting ensemble and the scoring loop,
# so the two can no longer drift apart. The former `clf = [...]` list was dead
# code: it was overwritten by the loop variable before ever being read.
named_clfs = [('KNN', knn), ('Logistic Regression', lg), ('Decision Tree', dt), ('Random Forest', rf), ('SVC', svc)]

# Majority ("hard") vote over the five base classifiers.
eclf = VotingClassifier(estimators=named_clfs, voting='hard')

# 10-fold cross-validated accuracy of each base model and of the ensemble,
# computed on the training split.
for label, clf in named_clfs + [('Ensemble', eclf)]:
    scores = cross_val_score(clf, x_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Above, Logistic Regression and SVC turn out to be the best for our dataset. These two show the best result, with an accuracy of 95.1% and a standard deviation of 0.00 (very close to 0).

# **Boosting with all classifiers using Cross Validation:**
Next, we will evaluate boosting methods with cross-validation. After that, we will apply EnsembleVoteClassifier to take a majority vote among the boosting models.
Here, three boosting methods are used: Ada Boost, Gradient Boost, and XG Boost.

In [None]:
# Creating classifiers: the three boosting models compared in this section.
# The five non-boosting classifiers and the `clf` list that used to be
# re-created here were never used by this cell, so they have been removed.
ada_boost = AdaBoostClassifier()
grad_boost = GradientBoostingClassifier()
xgb_boost = XGBClassifier()
boost_array = [ada_boost, grad_boost, xgb_boost]

# Majority ("hard") vote over the three boosting models.
eclf = EnsembleVoteClassifier(clfs=boost_array, voting='hard')
labels = ['Ada Boost', 'Grad Boost', 'XG Boost', 'Ensemble']

# 10-fold cross-validated accuracy of each booster and of the voting ensemble.
for clf, label in zip(boost_array + [eclf], labels):
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print("Accuracy: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Above, Ada Boost turned out to be the best boosting method for this dataset. For Ada Boost, the accuracy is 95% and the standard deviation is 0.001.

### **Conclusion:**
Conclusively, we have a very small dataset of 5100 records, of which around 4% had missing BMI values. We filled those missing values with the mean BMI of the same age group.
After that, 5 machine learning models were used: KNN, Logistic Regression, Decision Tree, Random Forest and SVC. These models are hyperparameter-tuned. This project also applied bagging to all 5 ML models and evaluated three boosting methods. A VotingClassifier was used to take a majority vote among all 5 models, and its accuracy was compared with that of the individual models.
Logistic Regression and SVC turn out to be the best for this dataset, with 95.1% accuracy and a standard deviation of 0.00. However, looking at the ROC curves, it can be clearly observed that the AUC (area under the curve; higher is better) of Logistic Regression is 0.80, whereas the AUC of the SVC model is 0.56. Therefore, once AUC is considered alongside the final accuracy and standard deviation, Logistic Regression turns out to be the best model for this dataset.
Three boosting methods were used: Ada Boost, Gradient Boost, and XG Boost. Ada Boost was the best, with an accuracy of 95% and a standard deviation of 0.001.