# Task Description

Dataset: Stroke Prediction Dataset

Kaggle: https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

In [None]:
# %% === < Global Setting: Time and Seed > ===
import time
import numpy as np
# Take one timestamp so both printed clock values describe the same instant.
now_epoch = time.time()
# gmtime() ignores the host timezone, so the "UTC" label is correct everywhere.
# (localtime() only equals UTC when the machine itself runs in UTC, e.g. on Colab.)
UTC_time = time.strftime("%Y%m%d-%H%M%S", time.gmtime(now_epoch))
local_time = time.strftime("%Y%m%d-%H%M%S", time.gmtime(now_epoch + 8*3600)) # UTC shifted by +8 hours
print("UTC Time:", UTC_time)
print("Local Time (UTC+8):", local_time)
# Fixed seed so the run is reproducible (a randomly drawn seed here was
# immediately overwritten by this constant, so it has been removed).
seed = 2021
print("Seed:", seed)
# Seed NumPy's global generator.
np.random.seed(seed)

## Data Importing and Preprocessing

Use pandas to import the raw data and perform the initial preprocessing.

In [None]:
# %% === < Importing the raw data > ===
import pandas as pd
# Read the stroke CSV from the Kaggle input directory, report its
# (rows, columns) shape, and preview the first rows.
raw_csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data_raw = pd.read_csv(raw_csv_path)
print(data_raw.shape)
data_raw.head()

In [None]:
# %% === < Inspecting the rows that contain null values > ===
# Display only: nothing is dropped in this cell.
rows_with_null = data_raw.isna().any(axis=1)
data_raw[rows_with_null]

In [None]:
# Frequency of each gender value in the raw data.
gender_counts = data_raw["gender"].value_counts()
print(gender_counts)

In [None]:
# Show the raw rows whose gender is recorded as 'Other'.
is_other_gender = data_raw['gender'].eq('Other')
data_raw[is_other_gender]

In [None]:
# Drop every row that has at least one missing value.
data_dropna = data_raw.dropna()
print(data_dropna.shape)
# Remove the 'Other' gender rows by value instead of by a hard-coded index label:
# a fixed label raises KeyError if that row is absent (e.g. already removed by
# dropna) and would miss any additional 'Other' rows. The later 0/1 gender
# recoding only maps 'Male' and 'Female'.
data_dropna = data_dropna[data_dropna["gender"] != "Other"]
print(data_dropna["gender"].value_counts())
print(data_dropna.shape)
data_dropna = data_dropna.drop(columns=['id']) # drop the unused identifier column
print(data_dropna.shape)
data_dropna.head()

## Data preprocessing and Visualization

Use pandas, Matplotlib, and Seaborn to prepare the data for analysis and to visualize it.

In [None]:
# Cast the columns in one call. The dtype decides which group a column lands in
# when the frame is later split with select_dtypes (float64 / object / category).
column_dtypes = {
  "work_type": 'category',
  "smoking_status": 'category',
  "Residence_type": 'object',
  "hypertension": 'object',
  "heart_disease": 'object',
  "stroke": 'int8',
}
data_dropna = data_dropna.astype(column_dtypes)
print(data_dropna.dtypes)

In [None]:
# %% === < Showing correlations between variables > ===
import matplotlib.pyplot as plt
import seaborn as sns
# Shared font sizes for every figure in the notebook.
plt.rcParams.update({
  'axes.titlesize': 16,
  'axes.labelsize': 14,
  'xtick.labelsize': 12,
  'ytick.labelsize': 12,
  'legend.fontsize': 12,
})

plt.figure(figsize=(8,6))
# Correlate numeric columns only. The frame also holds string/category columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them. Selecting numeric dtypes up front works on old and new pandas.
numeric_corr = data_dropna.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap="coolwarm", annot=True, vmin=-1, vmax=1, fmt='.2g')
plt.title('Correlation of Continuous Variables')
# plt.savefig(output_folder+'Correlation of Variables.png', dpi=300)

In [None]:
# Continuous predictors: every float64 column, copied so that later edits
# never write back into data_dropna.
data_conti = data_dropna.select_dtypes(include='float64').copy()
data_conti.head()

In [None]:
# %% === < Violin plots of each continuous variable, split by stroke label > ===
stroke_label = data_dropna['stroke']
for feature in data_conti.columns:
  plt.figure(figsize=(8,6))
  sns.violinplot(x=stroke_label, y=data_conti[feature], palette="Set3")
  plt.title('Violinplot of %s'%feature)
#   plt.savefig(output_folder+'Violinplot of %s.png'%feature, dpi=300)

In [None]:
# Object-dtype columns, copied so the recoding below leaves data_dropna untouched.
data_obj = data_dropna.select_dtypes(include='object').copy()
data_obj.head()

In [None]:
# %% === < Bar plots of the stroke label for each object-dtype variable > ===
stroke_label = data_dropna['stroke']
for feature in data_obj.columns:
  plt.figure(figsize=(8,6))
  sns.barplot(x=data_obj[feature], y=stroke_label)
  plt.title('Barplot of %s'%feature)
#   plt.savefig(output_folder+'Barplot of %s.png'%feature, dpi=300)

In [None]:
# Level frequencies of the two string columns that are recoded to 0/1 next,
# separated by one blank line.
married_counts = data_obj["ever_married"].value_counts()
gender_counts = data_obj["gender"].value_counts()
print(married_counts, gender_counts, sep="\n\n")

In [None]:
# Per-column lookup tables that turn the two-level string columns into 0/1 codes.
cleanup_vars = dict(
  ever_married=dict(No=0, Yes=1),
  gender=dict(Male=0, Female=1),
)
data_obj = data_obj.replace(cleanup_vars)
# Keep every column as object dtype after the replacement.
data_obj = data_obj.astype('object')
data_obj.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns to integer-encode with scikit-learn's LabelEncoder.
label_vars = ["Residence_type"]
encoded_labels = data_obj[label_vars].apply(LabelEncoder().fit_transform)
# Store the codes back as object dtype, like the other columns of data_obj.
data_obj[label_vars] = encoded_labels.astype('object')
data_obj.head()

In [None]:
# Category-dtype columns, copied so they can be processed independently of data_dropna.
data_catg = data_dropna.select_dtypes(include='category').copy()
data_catg.head()

In [None]:
# %% === < Bar plots of the stroke label for each category-dtype variable > ===
stroke_label = data_dropna['stroke']
for feature in data_catg.columns:
  plt.figure(figsize=(8,6))
  sns.barplot(x=data_catg[feature], y=stroke_label)
  plt.title('Barplot of %s'%feature)
#   plt.savefig(output_folder+'Barplot of %s.png'%feature, dpi=300)

### One-Hot-Encoding 

Use one-hot encoding (dummy coding) to expand the categorical variables into sparse indicator columns.

This prepares the data for model training.

Reference:

* https://medium.com/@PatHuang/%E5%88%9D%E5%AD%B8python%E6%89%8B%E8%A8%98-3-%E8%B3%87%E6%96%99%E5%89%8D%E8%99%95%E7%90%86-label-encoding-one-hot-encoding-85c983d63f87

* https://www.kaggle.com/getting-started/27270

* https://pbpython.com/categorical-encoding.html

In [None]:
import pandas as pd
# Expand each category column into one indicator column per level.
data_dummy = pd.get_dummies(data=data_catg)
data_dummy.head()

## Rebuilding Data (concat)

If necessary, drop variables that are not useful.

In [None]:
import pandas as pd
# Put the processed pieces side by side (aligned on the row index),
# with the target column 'stroke' last.
feature_blocks = [data_conti, data_obj, data_dummy, data_dropna["stroke"]]
data = pd.concat(feature_blocks, axis="columns")
print(data.shape)
data.head()
# data.to_excel(output_folder+'Data_StatisticsDummy.xlsx',sheet_name='dummy') 

## Data Balance

Use a resampling method to balance the target classes.

Reference (Imbalanced-Learn):

* https://imbalanced-learn.org/stable/index.html

In [None]:
from sklearn.model_selection import train_test_split # Import train_test_split function

# Separate the predictors from the target column.
dropout_cols = ['stroke']
X = data.drop(columns=dropout_cols) # Predictors
y = data['stroke'] # Target variable
print('Shape of original dummy coding data X and y: ',X.shape,y.shape)
print()

# === Splitting dataset into training set and test set
# random_state pins the split to the notebook seed (set in the first cell), so it
# no longer depends on how much of NumPy's global random stream earlier cells used.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.3, random_state=seed) # 70% training and 30% test
# Class counts use each subset's own values rather than indexing the subset with
# a mask built from the full-length y, which relied on implicit index alignment.
print('Shape of testing data X and y: ',X_test.shape,y_test.shape)
print('Testing data - No stroke: %d'%(y_test==0).sum())
print('Testing data - Yes stroke: %d'%(y_test==1).sum())
print()
print('Shape of training data X and y: ',X_train_raw.shape,y_train_raw.shape)
print('Before over sampling - No stroke: %d'%(y_train_raw==0).sum())
print('Before over sampling - Yes stroke: %d'%(y_train_raw==1).sum())
print()

In [None]:
# === Over sampling to balance the different labels of data
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
# from imblearn.over_sampling import SMOTENC

# Over-sampler class to use; any of the samplers imported above can be swapped in.
resampling_method = BorderlineSMOTE
# Only the training split is resampled, so the test split keeps its original
# class balance. random_state pins the synthetic samples to the notebook seed
# (set in the first cell) instead of relying on NumPy's global random state.
X_train, y_train = resampling_method(sampling_strategy='not majority', random_state=seed).fit_resample(X_train_raw,y_train_raw)
print('Shape of random over sampling dummy coding data X and y: ',X_train.shape,y_train.shape)
print('After over sampling - No stroke: %d'%(y_train==0).sum())
print('After over sampling - Yes stroke: %d'%(y_train==1).sum())
print()

## Model Establishment and Evaluation

Train several classification algorithms on the resampled training data.

Apply several evaluation metrics to check the fitted models and their predictions on the test set.

In [None]:
# %% === < Classifiers: Predicted Results and Confusion Matrices > ===
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

# Candidate models, all with default hyper-parameters. SVC needs probability=True
# so that predict_proba is available for the ROC/AUC computation below.
classifiers = [
  LogisticRegression(),
  KNeighborsClassifier(),
  GaussianNB(),
  SVC(probability=True),
  DecisionTreeClassifier(),
  QuadraticDiscriminantAnalysis(),
  RandomForestClassifier(),
  AdaBoostClassifier(),
  MLPClassifier()
  ]

result_columns = ['Classifiers','Accuracy','F1','Precision','Recall','fpr','tpr','AUC']
# One dict per classifier. The table is built once after the loop because
# DataFrame.append was removed in pandas 2.0 (and copied the frame on every call).
result_rows = []

for classifier in classifiers:
  classifier_name = classifier.__class__.__name__
  model = classifier.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  # Second column of predict_proba: score of the positive class (stroke == 1).
  y_score = model.predict_proba(X_test)[::,1]
  # === Confusion Matrix (raw counts)
  plt.figure(figsize=(8,7))
  # fmt='d' prints the integer counts in full; seaborn's default '.2g' shows
  # large counts in scientific notation.
  sns.heatmap(confusion_matrix(y_test,y_pred,normalize=None), annot=True, fmt='d', cmap='YlGnBu')
  plt.ylabel('True label', fontsize=14)
  plt.xlabel('Predicted label', fontsize=14)
  plt.title('Confusion Matrix (%s)'%classifier_name, fontsize=14)
#   plt.savefig(output_folder+'Confusion Matrix (%s).png'%classifier_name, dpi=300)
  plt.show()
  # === Normalized Confusion Matrix (each row sums to 1)
  plt.figure(figsize=(8,7))
  sns.heatmap(confusion_matrix(y_test,y_pred,normalize='true'), annot=True, cmap='Blues', vmin=0, vmax=1)
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  plt.title('Normalized Confusion Matrix (%s)'%classifier_name)
#   plt.savefig(output_folder+'Normalized Confusion Matrix (%s).png'%classifier_name, dpi=300)
  plt.show()
  # === Result
  fpr, tpr, _ = roc_curve(y_test, y_score)
  # === Table of Result
  result_rows.append({
      'Classifiers':classifier_name,
      'Accuracy':accuracy_score(y_test,y_pred),
      'F1':f1_score(y_test,y_pred),
      'Precision':precision_score(y_test,y_pred),
      'Recall':recall_score(y_test,y_pred),
      'fpr':fpr,
      'tpr':tpr,
      'AUC':roc_auc_score(y_test, y_score)
      })

# Building from records keeps the metric columns numeric; the fpr/tpr cells
# hold the ROC arrays. The classifier name becomes the row index.
result_table = pd.DataFrame(result_rows, columns=result_columns).set_index('Classifiers')

In [None]:
# %% === < Predicted Results Output > ===
# Scalar metrics only, rounded to four decimals for display.
metric_columns = ['Accuracy','F1','Precision','Recall']
df_result = result_table[metric_columns].round(4)
print(df_result)
# df_result.to_excel(output_folder+'Result_%s.xlsx'%local_time,sheet_name='result') 

In [None]:
# %% === < ROC Curves to compare different models > ===
import numpy as np

# One ROC curve per classifier on shared axes, labelled with its AUC.
fig = plt.figure(figsize=(10,9))

for idx in result_table.index:
  # Single .loc[row, column] lookups instead of chained .loc[row][column].
  plt.plot(
    result_table.loc[idx, 'fpr'],
    result_table.loc[idx, 'tpr'],
    label="{}, AUC={:.4f}".format(idx, result_table.loc[idx, 'AUC'])
    )

# Diagonal reference line for comparison with the curves.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate") # axis label typo ("Flase") corrected
plt.ylabel("True Positive Rate")
plt.title('Comparison of ROC Curves')
plt.legend(prop={'size':12}, loc='lower right')
# plt.savefig(output_folder+'ROC Curves %s.png'%local_time, dpi=300)

In [None]:
# %% === < Finish Time > ===
import time
# Take one timestamp so both printed clock values describe the same instant.
finish_epoch = time.time()
# gmtime() ignores the host timezone, so the "UTC" label is correct everywhere.
# (localtime() only equals UTC when the machine itself runs in UTC, e.g. on Colab.)
UTC_time = time.strftime("%Y%m%d-%H%M%S", time.gmtime(finish_epoch))
local_time = time.strftime("%Y%m%d-%H%M%S", time.gmtime(finish_epoch + 8*3600)) # UTC shifted by +8 hours
print("UTC Time:", UTC_time)
print("Local Time (UTC+8):", local_time)