### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the stroke dataset CSV into a DataFrame; the path is relative to the notebook's working directory.
data = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
data.head()

### Checking what types of data we are dealing with

In [None]:
data.info()

### Looks like the bmi feature has a lot of missing values

In [None]:
data.isnull().sum()

### Visualizing missing values

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per feature.
null_mask = data.isnull()
plt.figure(figsize=(15, 7))
sns.heatmap(null_mask, cmap="viridis", cbar=False)

In [None]:
data.duplicated().sum()

### Basic descriptive analysis

In [None]:
data.describe().transpose()

# Exploratory Data Analysis

### The feature "stroke" is our target feature/y-variable. Note that its classes are imbalanced.

In [None]:
data["stroke"].value_counts()

In [None]:
# Share of ALL patients who had a stroke, in percent.
# The denominator must be the whole dataset: dividing by the number of
# non-stroke rows yields the stroke:non-stroke ratio instead, which is not
# what the message printed from this value claims to report.
stroke_percent = len(data[data["stroke"] == 1]) / len(data) * 100

### Our target feature dataset is significantly imbalanced. 

In [None]:
# Report the percentage computed in the previous cell.
print(f"The total stroke patient in this dataset accounts for: {stroke_percent}")

### A fast way to check out the relationship with all other features

In [None]:
# Correlation of every numeric feature with the target, strongest first.
# Non-numeric columns (gender, work_type, ...) are filtered out explicitly:
# pandas >= 2.0 no longer drops them silently and corr() raises on strings.
data.select_dtypes(include="number").corr()["stroke"].sort_values(ascending=False)

### Since bmi has many missing values, I need a way to fill them. Age appears to have the highest correlation with bmi, so the missing values will be filled with the mean bmi of the corresponding age group.

In [None]:
# Correlation of every numeric feature with bmi, strongest first.
# Restrict to numeric columns first: pandas >= 2.0 raises if corr() meets
# string columns instead of silently skipping them.
data.select_dtypes(include="number").corr()["bmi"].sort_values(ascending=False)

In [None]:
# Pairwise scatter/histogram grid over the DataFrame, drawn with the PRGn palette.
sns.set_palette("PRGn")
sns.pairplot(data=data)

In [None]:
# Correlation heatmap of the numeric features.
# select_dtypes keeps this working on pandas >= 2.0, where corr() raises on
# the string columns that are still present in the frame at this point.
numeric_corr = data.select_dtypes(include="number").corr()
plt.figure(figsize=(12, 7))
sns.set_context("paper", font_scale=1.5)
sns.heatmap(numeric_corr, cmap="plasma")

In [None]:
data.columns

### Dropping feature "id" as it serves no purpose

In [None]:
# Remove the "id" column; the result replaces `data`.
data = data.drop(columns="id")

### The gender data is roughly balanced

In [None]:
data["gender"].value_counts(normalize=True)*100

In [None]:
# Age distribution per gender, split by stroke outcome.
# `shade` is a kdeplot option, not a violinplot one; recent seaborn forwards
# unknown keywords to matplotlib, where it fails, so it is dropped here.
plt.figure(figsize=(15, 7))
sns.violinplot(x="gender", y="age", data=data, hue="stroke", palette="plasma")

In [None]:
# Kernel density estimate of age.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
plt.figure(figsize=(15, 7))
sns.kdeplot(x=data["age"], fill=True, color="red")

### Only ~5% of the data had pre-existing heart disease

In [None]:
data["heart_disease"].value_counts(normalize=True)*100

In [None]:
data.columns

### Age has the highest correlation with BMI. Thus, it makes sense to replace missing values in BMI with the mean value of the corresponding age group. However, this requires some feature engineering: age groups must be created first so that the mean bmi can be computed per group. The bins will be labeled 1-5.

In [None]:
# Correlation of the numeric features with bmi (unsorted).
# Numeric columns are selected explicitly because pandas >= 2.0 raises when
# corr() encounters the remaining string columns.
data.select_dtypes(include="number").corr()["bmi"]

In [None]:
# Cut age into five bins and tag them with the integer labels 1..5.
labels = list(range(1, 6))
data["age_bin"] = pd.cut(data["age"], bins=5, labels=labels)

In [None]:
data["age_bin"].value_counts()

In [None]:
# Mean bmi per age bin (NaN bmi values are skipped by mean()).
# age_bin is categorical, so `observed` is stated explicitly: this keeps the
# existing "show every category" behaviour and avoids the pandas FutureWarning
# about the default changing.
gb_bmi = data.groupby("age_bin", observed=False)["bmi"].mean()

### Obtaining the mean value of bmi for each age group

In [None]:
gb_bmi

### Replacing missing values on bmi feature

In [None]:
def impute_bmi(cols, bin_means=None):
    """Return the bmi of one row, filling a missing value from its age bin.

    Parameters
    ----------
    cols : sequence of length 2
        ``(bmi, age_bin)`` in that order, e.g. a row Series produced by
        ``DataFrame[["bmi", "age_bin"]].apply(..., axis=1)`` or a plain list.
    bin_means : mapping, optional
        Replacement bmi for each age-bin label. Defaults to the values that
        were previously hard-coded here for bins 1-5.

    Returns
    -------
    The original bmi when it is present; otherwise the replacement for the
    row's age bin, or ``None`` when the bin has no entry in ``bin_means``.
    """
    if bin_means is None:
        bin_means = {1: 20.7, 2: 28.6, 3: 31.4, 4: 31.6, 5: 29.4}

    # Unpack by position. `cols[0]` on a label-indexed Series relies on the
    # deprecated integer-key fallback, which newer pandas no longer supports.
    bmi, age_bin = list(cols)[:2]

    if not pd.isnull(bmi):
        return bmi
    return bin_means.get(age_bin)

In [None]:
# Row-wise pass over (bmi, age_bin); each row is handed to impute_bmi.
data["bmi"] = data.loc[:, ["bmi", "age_bin"]].apply(impute_bmi, axis="columns")

### All missing values were filled

In [None]:
data["bmi"].isnull().sum()

In [None]:
data.head()

In [None]:
data["ever_married"].value_counts()

In [None]:
# Age distribution by marital status, split by stroke outcome; the legend is moved outside the axes.
plt.figure(figsize=(12, 7))
sns.violinplot(data=data, x="ever_married", y="age", hue="stroke", palette="Reds")
plt.legend(bbox_to_anchor=(1.2, 0.5))

In [None]:
data["hypertension"].value_counts() 

In [None]:
# Bar chart of how many rows fall in each hypertension class.
hypertension_flags = data["hypertension"]
plt.figure(figsize=(12, 7))
sns.countplot(x=hypertension_flags, color="Green")

In [None]:
data["work_type"].value_counts() 

In [None]:
# Horizontal bar chart of row counts per work type.
work_types = data["work_type"]
plt.figure(figsize=(12, 7))
sns.countplot(y=work_types, palette="rainbow")

In [None]:
data["Residence_type"].value_counts() 

In [None]:
# Bar chart of row counts per residence type.
residence = data["Residence_type"]
plt.figure(figsize=(9, 5))
sns.countplot(x=residence, color="pink")

In [None]:
data["avg_glucose_level"].value_counts()

### Note that the glucose level feature is clearly right-skewed

In [None]:
# Distribution of average glucose level: density-scaled histogram plus KDE curve.
# sns.distplot is deprecated and removed from newer seaborn releases;
# histplot(kde=True, stat="density") is its documented replacement.
plt.figure(figsize=(15, 7))
sns.histplot(x=data["avg_glucose_level"], bins=30, color="red", kde=True, stat="density")

In [None]:
# Joint distribution of bmi against average glucose level, with semi-transparent markers.
sns.jointplot(data=data, x="bmi", y="avg_glucose_level", alpha=0.5, color="green")

### Note that the BMI feature is clearly right-skewed

In [None]:
# Kernel density estimate of bmi.
# `fill=True` replaces the deprecated `shade=True` keyword.
plt.figure(figsize=(15, 7))
sns.kdeplot(x=data["bmi"], fill=True, color="red")

In [None]:
# One scatter panel per gender: age vs. bmi, coloured by stroke outcome.
sns.set_palette("RdBu")
g = sns.FacetGrid(data, hue="stroke", col="gender")
g.map(plt.scatter, "age", "bmi")
g.add_legend()

In [None]:
# One scatter panel per gender: age vs. average glucose level, coloured by stroke outcome.
g = sns.FacetGrid(data, hue="stroke", col="gender")
g.map(plt.scatter, "age", "avg_glucose_level")
g.add_legend()

In [None]:
data.describe().transpose()

In [None]:
data["smoking_status"].value_counts()

In [None]:
# Age distribution per smoking status, split by stroke outcome; the legend is moved outside the axes.
plt.figure(figsize=(15, 7))
sns.violinplot(data=data, x="smoking_status", y="age", hue="stroke", palette="Spectral_r")
plt.legend(bbox_to_anchor=(1.1, 0.5))

### Setting up the data into binaries for machine learning 

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# Correlation of the numeric features with the target after bmi imputation.
# String and categorical columns are still present here, so numeric columns
# are selected explicitly; pandas >= 2.0 corr() raises on non-numeric data.
data.select_dtypes(include="number").corr()["stroke"].sort_values(ascending=False)

In [None]:
# One-hot encode the categorical columns as 0/1 integers; drop_first omits the first level of each.
categorical_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
dummy_features = pd.get_dummies(data[categorical_cols], dtype=int, drop_first=True)

In [None]:
# Swap the raw categorical columns for their dummy-encoded counterparts.
data = pd.concat(
    [
        data.drop(columns=["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]),
        dummy_features,
    ],
    axis="columns",
)

In [None]:
data.head()

### Log-transforming features due to high skewness

In [None]:
data["avg_glucose_level"].skew()

In [None]:
# Replace avg_glucose_level with its base-10 logarithm (overwrites the column in place).
data["avg_glucose_level"] = data["avg_glucose_level"].pipe(np.log10)

In [None]:
data["bmi"].skew()

In [None]:
# Replace bmi with its base-10 logarithm (overwrites the column in place).
data["bmi"] = data["bmi"].pipe(np.log10)

### The feature age has acceptable skewness, so it will not be log-transformed

In [None]:
data["age"].skew()

In [None]:
data.head()

In [None]:
# Remove the age_bin helper column from the frame.
data = data.drop(columns=["age_bin"])

In [None]:
data.head()

### Final check before machine learning model

In [None]:
data.shape

In [None]:
data.info()

### Setting up X and y variable and importing libraries

In [None]:
# Target vector first, then the feature matrix (every column except the target).
y = data["stroke"]
X = data.drop(columns=["stroke"])

In [None]:
from imblearn import over_sampling 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# k-nearest-neighbours classifier using 5 neighbours.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Logistic regression with the library's default hyperparameters.
log_model = LogisticRegression()

In [None]:
# Decision tree with default hyperparameters.
# Seeded (42, like the split and SMOTE) so the reported scores are
# reproducible on a fresh Restart & Run All.
tree_model = DecisionTreeClassifier(random_state=42)

In [None]:
# Random forest with default hyperparameters.
# Seeded (42, like the split and SMOTE): an unseeded forest gives different
# confusion matrices on every run, so quoted scores could not be reproduced.
random_model = RandomForestClassifier(random_state=42)

In [None]:
# Gaussian naive Bayes with the library's default settings.
g_model = GaussianNB()

In [None]:
# Gradient-boosted trees with default hyperparameters.
# Seeded (42, like the split and SMOTE) so results are reproducible across runs.
xgb_model = XGBClassifier(random_state=42)

In [None]:
# Support vector classifier with the library's default kernel and settings.
svc_model = SVC()

### Splitting train and test data. 80% of the data will be used for training (an arbitrary choice). The scaler is fitted on the training data only; the test data is transformed with the already-fitted scaler (transform, not fit_transform) to prevent data leakage.

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class ratio; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Min-max scaler instance; it is fitted in the following cell.
scaler = MinMaxScaler()

In [None]:
# Learn the scaling range from the training split only, then apply that same scaling to both splits.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

### A simple logistic regression model will be used. 
### Note that this model yields a very high accuracy of 95%. At a glance, this model looks perfect. However, if we look at the classification report for the 1s, the scores are 0 across the board. The confusion matrix tells us that out of the 50 minority-class cases in the test data, the model predicted none correctly. The model failed to predict stroke.

In [None]:
# Baseline: train on the original (not resampled) training split and score on the test split.
log_predict = log_model.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, log_predict))
print(classification_report(y_test, log_predict))

### An accuracy of 95% looks excellent. However, the per-class scores tell us that this model works very well in predicting non-stroke but fails miserably in predicting stroke.

### Regardless of the accuracy, the model doesn't serve its purpose if it can't predict stroke. Similarly, fraud models are built to find and recognize one fraud case out of a million cases; such a model does not serve its purpose if it is not sensitive and cannot predict fraud accurately.

### From the healthcare provider's perspective, it is important for the model to recognize stroke. So in this case, recall is a better measure of the model than accuracy.

### Recall that our target feature is significantly imbalanced. To address this, the oversampling technique SMOTE will be used.

In [None]:
# SMOTE oversampler with a fixed seed.
sm = SMOTE(random_state=42)

In [None]:
# Resample the training split only; X_test / y_test are not passed to SMOTE.
X_sm, y_sm = sm.fit_resample(X_train, y_train)

In [None]:
X_sm.shape

In [None]:
X_train.shape

In [None]:
y_sm.shape

### With SMOTE applied, the random forest model below does much better than the baseline: it correctly predicts 12 out of 50 stroke cases based on the confusion matrix, yielding a recall of 0.24.

In [None]:
# Random forest: train on the SMOTE-resampled data, evaluate on the test split.
random_predict = random_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, random_predict))
print(classification_report(y_test, random_predict))

### The logistic regression model with SMOTE applied yields a recall of 0.80, correctly predicting 40 out of 50 cases. This is a significant improvement over the previous result.

In [None]:
# Logistic regression: refit on the SMOTE-resampled data, evaluate on the test split.
log_predict = log_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, log_predict))
print(classification_report(y_test, log_predict))

In [None]:
# Decision tree: train on the SMOTE-resampled data, evaluate on the test split.
tree_predict = tree_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, tree_predict))
print(classification_report(y_test, tree_predict))

In [None]:
# k-nearest neighbours: train on the SMOTE-resampled data, evaluate on the test split.
knn_predict = knn_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, knn_predict))
print(classification_report(y_test, knn_predict))

### The support vector machine model below has a recall of 0.60, correctly predicting 30 out of 50.

In [None]:
# Support vector classifier: train on the SMOTE-resampled data, evaluate on the test split.
svc_predict = svc_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, svc_predict))
print(classification_report(y_test, svc_predict))

### The naive Bayes model yields by far the highest recall, 0.98, correctly predicting 49 out of 50. Note, however, that its accuracy is only 0.24 (see the comparison table below).

In [None]:
# Gaussian naive Bayes: train on the SMOTE-resampled data, evaluate on the test split.
g_predict = g_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, g_predict))
print(classification_report(y_test, g_predict))

In [None]:
# XGBoost: train on the SMOTE-resampled data, evaluate on the test split.
xgb_predict = xgb_model.fit(X_sm, y_sm).predict(X_test)
print(confusion_matrix(y_test, xgb_predict))
print(classification_report(y_test, xgb_predict))

### Model comparison sorted by accuracy

In [None]:
cols = ["Models", "Accuracy", "Precision", "Recall", "f1_score"]

# Test-set predictions of each SMOTE-trained model, keyed by display name.
# log_predict holds the SMOTE-trained logistic regression at this point,
# because that cell reassigned it after the baseline run.
predictions = {
    "Random Forest": random_predict,
    "Logistic Regression": log_predict,
    "Decision Tree": tree_predict,
    "K-nearest neighbor": knn_predict,
    "Support Vector Machines": svc_predict,
    "GaussianNB": g_predict,
    "XGBoost": xgb_predict,
}

# Derive the table from the predictions instead of hand-typed numbers, so it
# cannot drift from the reports printed above. Precision, recall and F1 use
# the scorers' default positive label (1 = stroke); values are rounded to two
# decimals, the same precision classification_report prints.
report = [
    [
        name,
        round(accuracy_score(y_test, pred), 2),
        round(precision_score(y_test, pred), 2),
        round(recall_score(y_test, pred), 2),
        round(f1_score(y_test, pred), 2),
    ]
    for name, pred in predictions.items()
]

model_comparison = pd.DataFrame(report, columns=cols)
model_comparison.sort_values("Accuracy", ascending=False)

### Model comparison sorted by recall

In [None]:
model_comparison.sort_values("Recall", ascending=False)