## Stroke Prediction - NYCU Midterm Project

## Import Packages
Import all necessary packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action="ignore")

import statsmodels.formula.api as smf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble, preprocessing
from xgboost.sklearn import XGBClassifier

## Data Preprocess
Read datasets and take a brief look.

In [None]:
# Read the stroke dataset from the Kaggle input directory into `df`.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)

# Column dtypes / non-null counts, then summary statistics for every column.
df.info()
df.describe(include='all')

Convert the categorical columns to the `object` dtype.<br>
Fill missing values in `bmi` with the column mean.<br>
Rename `Residence_type` to `residence_type`.<br>
Drop the `id` column.

In [None]:
# Store these columns with the object dtype so that the dtype-based split
# used for plotting treats them as categorical.
for flag_col in ('stroke', 'hypertension', 'heart_disease'):
    df[flag_col] = df[flag_col].astype(object)

# Missing BMI values are replaced by the mean of the column.
bmi_mean = df["bmi"].mean()
df["bmi"] = df["bmi"].fillna(bmi_mean)

# Rename `Residence_type` so the column name is lower case.
df = df.rename(columns={'Residence_type': 'residence_type'})

# The `id` column is removed from the frame.
df = df.drop(columns=["id"])

## Descriptive Statistics
Use seaborn package to plot all categorical and numerical columns.

In [None]:
# Split the columns by dtype: object columns are plotted as categories,
# every other column as a numerical distribution.
cat_data = [x for x in df.columns if df[x].dtype == "object"]
num_data = [y for y in df.columns if df[y].dtype != "object"]

# One bar chart of value counts per categorical column.
for col in cat_data:
    plt.title(col)
    # Pass the column explicitly as `x`: recent seaborn releases treat the
    # first positional argument as `data`, not as the variable to count.
    sns.countplot(x=df[col])
    plt.show()

# One histogram with a KDE overlay per numerical column.
for col in num_data:
    plt.title(col)
    sns.histplot(df[col], kde=True)
    plt.show()

## Logistic Regression
Perform logistic regression analysis on all fields.<br>
Select the fields whose coefficients are statistically significant for further analysis.

In [None]:
# The target must be numeric to appear on the left-hand side of the formula.
df['stroke'] = df['stroke'].astype(int)
all_col = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']

# Formula of the form "stroke ~ gender+age+...": stroke against every predictor.
Y = 'stroke ~ ' + '+'.join(all_col)

# NOTE(review): this fits an ordinary-least-squares model (`smf.ols`) even
# though the section is titled "Logistic Regression" -- confirm whether
# `smf.logit` was intended before relying on the reported p-values.
results = smf.ols(Y, data=df).fit()
print(results.summary())

## Categorical Feature

In [None]:
# One-hot encode `gender` and `work_type`.
gender = pd.get_dummies(df[['gender']])
work_type = pd.get_dummies(df[['work_type']])

# Map the remaining text columns to integer codes.
ever_married = df['ever_married'].map({'Yes': 1, 'No': 0})
residence_type = df['residence_type'].map({'Urban': 1, 'Rural': 0})
smoking_status = df['smoking_status'].map(
    {'smokes': 3, 'formerly smoked': 2, 'never smoked': 1, 'Unknown': 0}
)

# Full feature set. `stroke` must stay the last column because the modelling
# code takes the last column as the target.
combine_parts = [
    gender,
    df['age'],
    df['hypertension'],
    df['heart_disease'],
    ever_married,
    work_type,
    residence_type,
    df['avg_glucose_level'],
    df['bmi'],
    smoking_status,
    df['stroke'],
]
df_combine = pd.concat(combine_parts, axis=1)

# Reduced feature set, again with `stroke` as the last column.
select_parts = [
    df['age'],
    df['hypertension'],
    df['heart_disease'],
    ever_married,
    work_type,
    df['avg_glucose_level'],
    df['stroke'],
]
df_select = pd.concat(select_parts, axis=1)

## Sampling
To reduce the large imbalance between the number of `stroke = 0` and `stroke = 1` rows,<br>
we keep every `stroke = 1` row and randomly sample twice as many rows from `stroke = 0`.

In [None]:
# Number of rows with stroke == 1; the stroke == 0 sample is twice this size.
stroke_cnt = int((df_combine['stroke'] == 1).sum())
n_negative = stroke_cnt * 2

# Full feature set: sampled stroke == 0 rows followed by all stroke == 1 rows.
combine_neg = df_combine[df_combine['stroke'] == 0].sample(n=n_negative, random_state=1)
combine_pos = df_combine[df_combine['stroke'] == 1]
df_combine_equal = pd.concat([combine_neg, combine_pos], axis=0)

# Reduced feature set: same procedure with the same random seed.
select_neg = df_select[df_select['stroke'] == 0].sample(n=n_negative, random_state=1)
select_pos = df_select[df_select['stroke'] == 1]
df_select_equal = pd.concat([select_neg, select_pos], axis=0)

## Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of
# `df_combine` (features and target).
plt.figure(figsize=(15, 15))
sns.heatmap(df_combine.corr(), annot=True, cmap="rainbow")
# Title spelling corrected ("Correleation" -> "Correlation").
plt.title("Correlation Heatmap", fontsize=20, color="c")
plt.show()

## Evaluation Function
Customized function for result evaluation and result visualization.

In [None]:
def plot_cm(model, Y_test, Y_pred):
    """Show a confusion-matrix heatmap and a table of summary metrics.

    Parameters
    ----------
    model
        Label used as the heatmap title (callers in this file pass the
        model name as a string).
    Y_test
        True labels.
    Y_pred
        Predicted labels.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    cm = metrics.confusion_matrix(Y_test, Y_pred)

    # Use the two axes created by plt.subplots directly instead of requesting
    # them a second time with plt.subplot(211) / plt.subplot(212).
    _, (ax_cm, ax_table) = plt.subplots(2, 1, figsize=(5, 10))

    sns.heatmap(cm, annot=True, fmt='g', ax=ax_cm, cmap="YlGnBu")
    ax_cm.set_title(model)
    ax_cm.set_xlabel('Predicted labels')
    ax_cm.set_ylabel('True labels')

    # F1 is computed with the same (default, binary) averaging as precision
    # and recall. The earlier weighted average over only the predicted labels
    # could report a high F1 next to a precision and recall of 0 when a model
    # never predicted the positive class.
    data = [
        ['Accuracy:', round(metrics.accuracy_score(Y_test, Y_pred), 4)],
        ['Precision:', round(metrics.precision_score(Y_test, Y_pred), 4)],
        ['Recall:', round(metrics.recall_score(Y_test, Y_pred), 4)],
        ['F1 Score:', round(metrics.f1_score(Y_test, Y_pred), 4)],
    ]

    # The lower axes only host the metrics table, so its frame is hidden.
    ax_table.set_position([0.1, 0.13, 0.7, 0.6])
    ax_table.axis('tight')
    ax_table.axis('off')
    ax_table.table(cellText=data, loc="center").scale(1, 1.5)

    plt.show()

## Models
1. XGBoost Classifier
1. Decision Tree Classifier
1. Random Forest Classifier
1. Support Vector Machine
1. Simple Neural Network

All models use the same train/test split, with 20% of the rows held out for testing (`test_size=0.2`).

In [None]:
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

def compile_nn_model(df_for_shape):
    """Build and compile the fully connected network used as a classifier.

    Parameters
    ----------
    df_for_shape
        Any object with a ``shape`` attribute; only ``shape[1]`` is read and
        used as the input width of the first layer.

    Returns
    -------
    A compiled ``Sequential`` model with hidden layers of 16, 32, 64, 32 and
    16 ReLU units and a 2-unit output layer, using the Adam optimizer and
    reporting accuracy.
    """
    model = Sequential()
    model.add(Dense(16, input_dim=df_for_shape.shape[1], activation='relu'))
    for units in (32, 64, 32, 16):
        model.add(Dense(units, activation='relu'))

    # The caller one-hot encodes the labels (to_categorical), i.e. the two
    # outputs are mutually exclusive classes. Softmax with categorical
    # cross-entropy models that directly; independent sigmoid outputs with
    # binary cross-entropy do not constrain the two outputs to sum to 1.
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def combined_model(df_input):
    """Train and evaluate every classifier on one dataset.

    The last column of ``df_input`` is used as the target and all other
    columns as features. The rows are split once into train and test sets
    (``test_size=0.2``, ``random_state=100``); each model is fitted on the
    training part and its test predictions are passed to ``plot_cm``.

    Models, in order: XGBoost, decision tree, random forest, support vector
    machine, and the Keras network built by ``compile_nn_model``. For the
    network the training-loss curve is plotted as well.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric feature columns followed by the target column.
    """
    # Test train split
    X = df_input.iloc[:, :-1].values
    Y = df_input.iloc[:, -1].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

    classical_models = [
        # eval_metric is set on the estimator: passing it to fit() is
        # deprecated (and later removed) in newer XGBoost releases.
        ('XGBClassifier', XGBClassifier(learning_rate=0.01,
                                        n_estimators=1000,
                                        use_label_encoder=False,
                                        eval_metric='auc')),
        ('DecisionTreeClassifier', DecisionTreeClassifier(criterion="entropy",
                                                          max_depth=7)),
        ('RandomForestClassifier', ensemble.RandomForestClassifier(n_estimators=1000)),
        ('SupportVectorMachine', SVC()),
    ]
    for name, estimator in classical_models:
        estimator.fit(X_train, Y_train)
        plot_cm(name, Y_test, estimator.predict(X_test))

    # SimpleNeuralNetwork
    # Reuse the same split and the same feature columns as the other models.
    # The previous `iloc[:, 1:-1]` slice silently dropped the first feature
    # for the network only.
    X_train_nn = X_train.astype('float32')
    X_test_nn = X_test.astype('float32')
    Y_train_nn = np_utils.to_categorical(Y_train)

    nn_model = compile_nn_model(X_test_nn)
    # Monitor the training loss: it is the only quantity besides accuracy that
    # is logged here (no validation data, no f1 metric), so monitoring 'f1'
    # could never trigger early stopping.
    callback = EarlyStopping(monitor='loss', patience=3)
    history = nn_model.fit(X_train_nn, Y_train_nn, epochs=150, batch_size=10,
                           verbose=0, callbacks=[callback])
    plt.plot(history.history['loss'])
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.show()

    # The predicted class is the output unit with the highest score.
    Y_pred = np.argmax(nn_model.predict(X_test_nn), axis=1)
    plot_cm('SimpleNeuralNetwork', Y_test, Y_pred)

## Results

### Results for `df_combine` dataset

In [None]:
# Train and evaluate every model on the full feature set, without resampling.
combined_model(df_combine)

### Results for `df_select` dataset

In [None]:
# Train and evaluate every model on the reduced feature set, without resampling.
combined_model(df_select)

### Results for `df_combine_equal` dataset

In [None]:
# Train and evaluate every model on the full feature set after under-sampling
# the stroke == 0 rows to twice the number of stroke == 1 rows.
combined_model(df_combine_equal)

### Results for `df_select_equal` dataset

In [None]:
# Train and evaluate every model on the reduced feature set after
# under-sampling the stroke == 0 rows to twice the number of stroke == 1 rows.
combined_model(df_select_equal)