In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import keras 
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from xgboost import plot_tree, plot_importance
from keras.models import Sequential, Model
from keras import optimizers
from keras.layers import Dense, Dropout, Input, Embedding, InputLayer
from keras.layers.merge import concatenate
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

# Objective
This notebook aims to classify whether patients suffer a stroke, using the given data. To approach this classification task, four different models will be trained: an ordinary neural network, an embedding neural network, an XGBoost classifier and a random forest. All of them are trained using stratified k-fold cross-validation and SMOTE to overcome the challenge of handling the heavily imbalanced dataset.

# Initial Data Exploration

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory.
stroke_csv_path = r'/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(stroke_csv_path)

## First Inspection

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Show the dtype pandas inferred for every column.
df.dtypes

## Checking for missing values

In [None]:
# Count the missing entries of every column except the id and show them as a bar chart.
missing_values = df.drop(columns='id').isna().sum()
color2 = px.colors.qualitative.Bold[7]

missing_bar = go.Bar(
    x=list(missing_values.index),
    y=missing_values.values,
    marker={'color': color2},
)
missing_layout = go.Layout(
    title=go.layout.Title(text="Missing values per column/feature"),
    yaxis_title="Count",
)
fig = go.Figure(data=[missing_bar], layout=missing_layout)
fig.show()

### Conclusion drawn from first inspection 
Besides the id column, the dataset contains 11 columns. Seven of them (gender, hypertension, heart_disease, ever_married, work_type, Residence_type, smoking_status) can be considered categorical features, three (age, avg_glucose_level, bmi) numerical features, and one column (stroke) contains the target. All data types match their content. Only one feature column (bmi) contains missing values. The overall objective is to come up with an appropriate model for stroke prediction; this notebook therefore deals with a binary classification problem.

# Data Visualization

In [None]:
# Column groups used throughout the notebook.
categorical_data = [
    'gender', 'hypertension', 'heart_disease',
    'work_type', 'Residence_type', 'smoking_status', 'ever_married',
]
target_data = ['stroke']

# Every column that is neither categorical, the id, nor the target is numerical.
non_numerical_columns = categorical_data + ['id', 'stroke']
numerical_data = df.columns.drop(non_numerical_columns).tolist()

### Visualization of Categorical Data

In [None]:
# One parallel-categories dimension per categorical feature plus the target.
# The 0/1 encoded columns get readable tick labels via categoryarray/ticktext.
binary_categories = dict(categoryarray=[0, 1])

gender_dim = go.parcats.Dimension(values=df['gender'], label='Gender')
hypertension_dim = go.parcats.Dimension(
    values=df['hypertension'], label='Hypertension',
    ticktext=['No Hypertension', 'Hypertension'], **binary_categories)
heartdisease_dim = go.parcats.Dimension(
    values=df['heart_disease'], label='Heart Disease',
    ticktext=['No Heart Disease', 'Heart Disease'], **binary_categories)
evermarried_dim = go.parcats.Dimension(values=df['ever_married'], label='Ever Married')
worktype_dim = go.parcats.Dimension(values=df['work_type'], label='Work Type')
residence_dim = go.parcats.Dimension(values=df['Residence_type'], label='Residence Type')
smoking_dim = go.parcats.Dimension(values=df['smoking_status'], label='Smoking')
stroke_dim = go.parcats.Dimension(
    values=df['stroke'], label='Stroke',
    ticktext=['No Stroke', 'Stroke'], **binary_categories)

In [None]:
# Numeric gender code used only to colour the ribbons
# (Male -> 0, Female -> 1, every other value -> NaN).
# It is kept in a standalone Series so that this plotting cell never modifies
# df: no temporary column has to be added and dropped again, and an
# interrupted run cannot leave a stray column behind.
color = df['gender'].map({'Male': 0, 'Female': 1}).astype(float)

fig = go.Figure(data = [go.Parcats(dimensions=[hypertension_dim, heartdisease_dim, evermarried_dim,
                                              worktype_dim, residence_dim, smoking_dim, stroke_dim, gender_dim],
        line={'color': color, 'colorscale': 'Earth'},
        hoveron='color', hoverinfo='count+probability',
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},
        arrangement='freeform')],
               layout=go.Layout(title=go.layout.Title(text="Overview about bias of each feature with respect to gender")))


fig.update_layout(
    autosize=False,
    width=1200,
    height=600
)

fig.show()

In [None]:
# Same parallel-categories plot, this time with the ribbons coloured by the target.
color = df['stroke']
# NOTE(review): this custom colour scale is defined but not passed to the
# trace below, which uses the built-in 'Earth' scale.
colorscale = [[0, 'blue'], [1, 'yellow']]

target_dimensions = [
    gender_dim, hypertension_dim, heartdisease_dim, evermarried_dim,
    worktype_dim, residence_dim, smoking_dim, stroke_dim,
]
target_parcats = go.Parcats(
    dimensions=target_dimensions,
    line={'color': color, 'colorscale': 'Earth'},
    hoveron='color',
    hoverinfo='count+probability',
    labelfont={'size': 18, 'family': 'Times'},
    tickfont={'size': 16, 'family': 'Times'},
    arrangement='freeform',
)
target_layout = go.Layout(
    title=go.layout.Title(text="Overview about bias of each feature with respect to target"))

fig = go.Figure(data=[target_parcats], layout=target_layout)
fig.update_layout(autosize=False, width=1200, height=600)
fig.show()

### Conclusion drawn based on visualization of categorical features
Besides the fact that there is slightly more data for females, there seems to be no significant difference in the distribution of the categorical features with respect to gender. From the second parcats plot it becomes clear that the dataset is imbalanced with respect to the target variable, so resampling (downsampling and/or upsampling) needs to be considered.

### Visualization of numerical/continuous data

In [None]:
# Summary statistics of the numerical feature columns.
df[numerical_data].describe()

In [None]:
# Pearson and Spearman correlation between the numerical features and the target.
corr_columns = numerical_data + ['stroke']
corr_frame = df[corr_columns]
pearson_corr = corr_frame.corr(method='pearson')
spearman_corr = corr_frame.corr(method='spearman')

# Axis labels for the heatmaps (a plain copy of the column names).
X = list(corr_columns)

In [None]:
# Draw both correlation matrices side by side; only the right-hand heatmap
# shows a colour bar.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Pearson Correlation Matrix', 'Spearman Correlation Matrix'),
    shared_yaxes=True,
)

heatmap_specs = [
    (pearson_corr, 1, dict(showscale=False)),
    (spearman_corr, 2, dict(colorbar_thickness=20, colorbar_ticklen=3)),
]
for corr_matrix, subplot_col, extra_options in heatmap_specs:
    heatmap = go.Heatmap(
        z=corr_matrix,
        x=X,
        y=X,
        xgap=1, ygap=1,
        colorscale='Earth',
        hovertext=corr_matrix.round(2),
        hoverinfo='text',
        **extra_options,
    )
    fig.add_trace(heatmap, row=1, col=subplot_col)

fig.update_layout(title_text="Correlation Plots")
fig.show()

In [None]:
# Scatter-plot matrix of the first three numerical features; marker colour
# and hover text are taken from the stroke column.
splom_labels = ['Age', 'Average Glucose Level', 'Body-Mass-Index']
splom_dimensions = [
    dict(label=axis_label, values=df[numerical_data[position]])
    for position, axis_label in enumerate(splom_labels)
]

splom_trace = go.Splom(
    dimensions=splom_dimensions,
    showupperhalf=False,
    text=df['stroke'],
    marker=dict(color=df['stroke'], showscale=False, colorscale='Earth'),
)
fig = go.Figure(data=splom_trace)

title = 'Scatter Plot of Numerical Features'
fig.update_layout(title_text=title, autosize=False, width=800, height=800)

fig.show()

### Conclusion drawn based on correlation and scatter plot
The heatmap of the Pearson correlation coefficients shows that there are only weak correlations between avg_glucose_level and age as well as between avg_glucose_level and bmi, and a moderate correlation between bmi and age. This goes hand in hand with the Spearman correlation coefficients, which also take non-linear (monotonic) relationships into consideration. The scatter plots indicate a tendency for the risk of a stroke to increase with increasing age, avg_glucose_level and bmi. Furthermore, age and avg_glucose_level seem to have a bigger effect on stroke risk.

In [None]:
# One colour per gender category, reused by the histogram cells.
color1 = px.colors.qualitative.Dark2[6]
color2 = px.colors.qualitative.Bold[7]
color3 = px.colors.qualitative.Light24[0]

# BMI distribution split by gender, with a rug plot in the margin.
fig = px.histogram(
    df,
    x="bmi",
    color="gender",
    marginal="rug",
    hover_data=df.columns,
    title='Histogram Body-Mass-Index',
    color_discrete_sequence=[color1, color2, color3],
)
fig.show()

In [None]:
# Average glucose level distribution split by gender, with a rug plot in the margin.
glucose_hist_options = dict(
    x="avg_glucose_level",
    color="gender",
    marginal="rug",
    hover_data=df.columns,
    title='Histogram Average Glucose Level',
    color_discrete_sequence=[color1, color2, color3],
)
fig = px.histogram(df, **glucose_hist_options)
fig.show()

In [None]:
# Age distribution split by gender, with a rug plot in the margin.
age_hist_options = dict(
    x="age",
    color="gender",
    marginal="rug",
    hover_data=df.columns,
    title="Histogram Age",
    color_discrete_sequence=[color1, color2, color3],
)
fig = px.histogram(df, **age_hist_options)
fig.show()

### Conclusions drawn based on histograms
The distributions of avg_glucose_level and bmi follow a slightly right-skewed normal distribution (longer right tail). Allowing for its higher variance, the same seems to hold for age. No difference between the genders is recognizable. Because there are only very few samples, the data for other genders is going to be neglected.

# Data Cleaning

In [None]:
# Keep only the rows whose gender is not 'Other'.
is_not_other = df['gender'].ne('Other')
df = df[is_not_other]

# Feature Engineering (Feature Imputation, Scaling and Preparation)

### Split Data into numerical and categorical data

In [None]:
# Separate views of the categorical and the numerical feature columns.
df_categorical, df_numerical = df[categorical_data], df[numerical_data]

### Impute Numerical Data

In [None]:
# Fit the iterative imputer on the numerical features, fill their missing
# values and round the result to one decimal place.
imp = IterativeImputer(random_state=0).fit(df_numerical)
imputed_data = np.round(imp.transform(df_numerical), 1)

### Scale Numerical Data

In [None]:
# Two alternative scalings of the imputed numerical features:
# standardisation and min-max scaling.
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

standard_scaled_data = standard_scaler.fit(imputed_data).transform(imputed_data)
minmax_scaled_data = minmax_scaler.fit(imputed_data).transform(imputed_data)

### Preparing Dataset with Nominal Encoded Data

In [None]:
# Label-encode every categorical column (the encoder is re-fitted per column).
lab_enc = LabelEncoder()
lab_enc_features = df_categorical.apply(lab_enc.fit_transform)

# Layout of the prepared frames: categorical codes first, then the scaled
# numerical features, and finally the target.
feature_columns = categorical_data + numerical_data
encoded_block = np.array(lab_enc_features)
stroke_values = df['stroke'].values

df_prepared_nominal_standard = pd.DataFrame(
    data=np.hstack((encoded_block, standard_scaled_data)),
    columns=feature_columns,
)
df_prepared_nominal_standard['stroke'] = stroke_values

df_prepared_nominal_minmax = pd.DataFrame(
    data=np.hstack((encoded_block, minmax_scaled_data)),
    columns=feature_columns,
)
df_prepared_nominal_minmax['stroke'] = stroke_values

### Preparing Dataset with One Hot Encoded Data

In [None]:
# One-hot encode the categorical columns and append the standard-scaled
# numerical features, giving one flat feature vector per row.
onehot_enc = OneHotEncoder()
onehot_matrix = onehot_enc.fit_transform(df_categorical).toarray()
onehot_features = onehot_matrix.tolist()
# Stack once instead of converting the scaled array to a list for every row.
onehot_feature_vector = np.hstack((onehot_matrix, standard_scaled_data)).tolist()

# Work on an independent copy: assigning new columns to df_categorical itself
# would alter the frame the encoders are fitted on (breaking a re-run of this
# cell) and triggers pandas' SettingWithCopyWarning.
df_prepared_onehot = df_categorical.copy()
for position, column_name in enumerate(numerical_data):
    df_prepared_onehot[column_name] = standard_scaled_data[:, position]
df_prepared_onehot['onehot_feature_vector'] = onehot_feature_vector
df_prepared_onehot['stroke'] = df['stroke'].values

## Resampling pipeline for training datasets to compensate huge bias using SMOTE

In [None]:
# Resampling applied to the training part of every fold: SMOTE first
# (sampling_strategy=0.7), then random undersampling (sampling_strategy=1).
RESAMPLING_SEED = 33
oversampling = SMOTE(sampling_strategy=0.7, random_state=RESAMPLING_SEED)
undersampling = RandomUnderSampler(sampling_strategy=1, random_state=RESAMPLING_SEED)

steps = [
    ('o', oversampling),
    ('u', undersampling),
]
pipline = Pipeline(steps=steps)

# Design, Training and Evaluation of different models

## Embedding Neural Network

### Model Architecture

In [None]:
# Stop training once the training loss has not improved for 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)

# One scalar input and one 5-dimensional embedding per categorical feature;
# the embedding vocabulary is the number of distinct codes in that column.
in_layers = []
em_layers = []
for category in categorical_data:
    vocabulary_size = df_prepared_nominal_standard[category].nunique()
    categorical_input = Input(shape=(1,))
    categorical_embedding = Embedding(vocabulary_size, 5)(categorical_input)
    in_layers.append(categorical_input)
    em_layers.append(categorical_embedding)

# The three numerical features enter through a single input of shape (1, 3).
in_layer_num = Input(shape=(1,3,))

# Join all embeddings, append the numerical input on the last axis and
# classify with two hidden layers and a sigmoid output.
merge_first = concatenate(em_layers)
merge_second = concatenate([merge_first, in_layer_num], axis=2)
hidden = Dense(16, activation='relu')(merge_second)
hidden = Dense(16, activation='relu')(hidden)
output = Dense(1, activation='sigmoid')(hidden)

in_layers.append(in_layer_num)

model = Model(inputs=in_layers, outputs=output)

In [None]:
# Render the network graph (with tensor shapes) to embedding_model.png.
plot_model(
    model,
    to_file='embedding_model.png',
    show_shapes=True,
)

### Model training with Resampling and Stratified-KFold using nominal encoded and standard scaled features

In [None]:
# Stratified 5-fold cross-validation of the embedding network on the
# label-encoded, standard-scaled features. Only the training part of a fold is
# resampled; the validation part is used as it is.
skf = StratifiedKFold(n_splits=5)

X = np.array(df_prepared_nominal_standard[categorical_data+numerical_data])
y = np.array(df_prepared_nominal_standard['stroke'])

# Column layout of X: the label-encoded categorical features come first,
# followed by the scaled numerical features.
n_categorical = len(categorical_data)
rule = '------------------------------------------------------------------------'

acc_per_fold_embedding_dnn = list()
loss_per_fold_embedding_dnn = list()
auc_per_fold_embedding_dnn = list()

for fold, (train_index, test_index) in enumerate (skf.split(X, y)):
    features_nominal_resampled, target_nominal_resampled = pipline.fit_resample(X[train_index], y[train_index])
    # The resampler may create synthetic rows with fractional category codes.
    # Round every categorical column (not just a fixed number of them) back to
    # whole codes before they are fed to the embedding layers.
    features_nominal_resampled[:, :n_categorical] = np.around(features_nominal_resampled[:, :n_categorical], 0)

    ### input for training ###
    # one 1-D array per categorical input, then the numerical block with an
    # extra axis inserted at position 1
    training_input = [features_nominal_resampled[:, i] for i in range(n_categorical)]
    training_input.append(np.expand_dims(features_nominal_resampled[:, n_categorical:], axis=1))
    training_target = target_nominal_resampled.reshape(-1,1,1)

    ### input for validation ###
    features_nominal_validation = X[test_index]
    validation_input = [features_nominal_validation[:, i] for i in range(n_categorical)]
    validation_input.append(np.expand_dims(features_nominal_validation[:, n_categorical:], axis=1))
    validation_target = y[test_index].reshape(-1,1,1)

    ### fresh model for this fold ###
    # clone_model rebuilds the architecture with newly initialised weights.
    # Continuing to train the network fitted in the previous fold would leak
    # this fold's validation rows (they were training rows before) into the
    # evaluation.
    model = keras.models.clone_model(model)
    opt = optimizers.Adam(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    ### model training ###
    print('')
    print(rule)
    print(rule)
    print(f'Training for Embedding Neural Network fold {fold} ...')
    print(rule)
    print(rule)
    print('')

    model.fit(x=training_input, y=training_target, batch_size=16, epochs=150, verbose=0, callbacks=[callback])

    scores = model.evaluate(x=validation_input, y=validation_target, verbose=0)

    acc_per_fold_embedding_dnn.append(scores[1] * 100)
    loss_per_fold_embedding_dnn.append(scores[0])

    # keep the probabilities for the ROC curve; threshold them at 0.5 for the
    # confusion matrix and the classification report
    probabilities_embedding_dnn = model.predict(validation_input).reshape(-1,1)
    predictions_embedding_dnn = (probabilities_embedding_dnn > 0.5).astype(int)

    #model evaluation
    validation_labels = validation_target.reshape(-1)

    print(f'Score for fold {fold} using evaluate function: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    print('')

    print(f'Confusion matrix {confusion_matrix(validation_labels, predictions_embedding_dnn.ravel())} for fold {fold}')
    print('')

    print(f'Classification report {classification_report(validation_labels, predictions_embedding_dnn.ravel())} for fold {fold}')
    print('')

    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point.
    fpr, tpr, thresholds = roc_curve(validation_labels, probabilities_embedding_dnn.ravel())
    roc_auc = auc(fpr, tpr)
    auc_per_fold_embedding_dnn.append(roc_auc*100)

    print(f'Area Under the ROC Curve (AUC) {roc_auc} for fold {fold}')
    print('')
    print(rule)
    print(rule)
    print('')

# averages over all folds (both are on a 0-100 scale)
accuracy_embedding_dnn_overall_standard = sum(acc_per_fold_embedding_dnn)/len(acc_per_fold_embedding_dnn)
auc_embedding_dnn_overall_standard = sum(auc_per_fold_embedding_dnn)/len(auc_per_fold_embedding_dnn)


print('')
print(rule)
print(rule)
print(f'Results for Embedding Neural Network')
print(rule)
print('')
print(f'Average accuracy over all folds for Embedding Neural Network is {accuracy_embedding_dnn_overall_standard}%')
print(f'Average area under curve (AUC) over all folds for Embedding Neural Network is {auc_embedding_dnn_overall_standard}')
print(rule)
print(rule)
print('')

### Model training with Resampling and Stratified-KFold using nominal encoded and minmax scaled features

In [None]:
# Same architecture as before, rebuilt for the min-max scaled dataset.
# Training stops once the training loss has not improved for 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)

EMBEDDING_WIDTH = 5

in_layers, em_layers = [], []
for category in categorical_data:
    # vocabulary of the embedding = number of distinct codes in the column
    distinct_codes = df_prepared_nominal_minmax[category].nunique()
    code_input = Input(shape=(1,))
    in_layers.append(code_input)
    em_layers.append(Embedding(distinct_codes, EMBEDDING_WIDTH)(code_input))

# single input carrying the three numerical features
in_layer_num = Input(shape=(1,3,))

merge_first = concatenate(em_layers)
merge_second = concatenate([merge_first, in_layer_num], axis=2)

# two hidden ReLU layers followed by a sigmoid output unit
dense = merge_second
for _ in range(2):
    dense = Dense(16, activation='relu')(dense)
output = Dense(1, activation='sigmoid')(dense)

in_layers.append(in_layer_num)

model = Model(inputs=in_layers, outputs=output)

In [None]:
# Stratified 5-fold cross-validation of the embedding network on the
# label-encoded, min-max scaled features. Only the training part of a fold is
# resampled; the validation part is used as it is.
skf = StratifiedKFold(n_splits=5)

X = np.array(df_prepared_nominal_minmax[categorical_data+numerical_data])
y = np.array(df_prepared_nominal_minmax['stroke'])

# Column layout of X: the label-encoded categorical features come first,
# followed by the scaled numerical features.
n_categorical = len(categorical_data)
rule = '------------------------------------------------------------------------'

acc_per_fold_embedding_dnn = list()
loss_per_fold_embedding_dnn = list()
auc_per_fold_embedding_dnn = list()

for fold, (train_index, test_index) in enumerate (skf.split(X, y)):
    features_nominal_resampled, target_nominal_resampled = pipline.fit_resample(X[train_index], y[train_index])
    # The resampler may create synthetic rows with fractional category codes.
    # Round every categorical column (not just a fixed number of them) back to
    # whole codes before they are fed to the embedding layers.
    features_nominal_resampled[:, :n_categorical] = np.around(features_nominal_resampled[:, :n_categorical], 0)

    ### input for training ###
    # one 1-D array per categorical input, then the numerical block with an
    # extra axis inserted at position 1
    training_input = [features_nominal_resampled[:, i] for i in range(n_categorical)]
    training_input.append(np.expand_dims(features_nominal_resampled[:, n_categorical:], axis=1))
    training_target = target_nominal_resampled.reshape(-1,1,1)

    ### input for validation ###
    features_nominal_validation = X[test_index]
    validation_input = [features_nominal_validation[:, i] for i in range(n_categorical)]
    validation_input.append(np.expand_dims(features_nominal_validation[:, n_categorical:], axis=1))
    validation_target = y[test_index].reshape(-1,1,1)

    ### fresh model for this fold ###
    # clone_model rebuilds the architecture with newly initialised weights.
    # Continuing to train the network fitted in the previous fold would leak
    # this fold's validation rows (they were training rows before) into the
    # evaluation.
    model = keras.models.clone_model(model)
    opt = optimizers.Adam(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    ### model training ###
    print('')
    print(rule)
    print(rule)
    print(f'Training for Embedding Neural Network fold {fold} ...')
    print(rule)
    print(rule)
    print('')

    model.fit(x=training_input, y=training_target, batch_size=16, epochs=150, verbose=0, callbacks=[callback])

    scores = model.evaluate(x=validation_input, y=validation_target, verbose=0)

    acc_per_fold_embedding_dnn.append(scores[1] * 100)
    loss_per_fold_embedding_dnn.append(scores[0])

    # keep the probabilities for the ROC curve; threshold them at 0.5 for the
    # confusion matrix and the classification report
    probabilities_embedding_dnn = model.predict(validation_input).reshape(-1,1)
    predictions_embedding_dnn = (probabilities_embedding_dnn > 0.5).astype(int)

    #model evaluation
    validation_labels = validation_target.reshape(-1)

    print(f'Score for fold {fold} using evaluate function: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    print('')

    print(f'Confusion matrix {confusion_matrix(validation_labels, predictions_embedding_dnn.ravel())} for fold {fold}')
    print('')

    print(f'Classification report {classification_report(validation_labels, predictions_embedding_dnn.ravel())} for fold {fold}')
    print('')

    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point.
    fpr, tpr, thresholds = roc_curve(validation_labels, probabilities_embedding_dnn.ravel())
    roc_auc = auc(fpr, tpr)
    auc_per_fold_embedding_dnn.append(roc_auc*100)

    print(f'Area Under the ROC Curve (AUC) {roc_auc} for fold {fold}')
    print('')
    print(rule)
    print(rule)
    print('')

# averages over all folds (both are on a 0-100 scale)
accuracy_embedding_dnn_overall_minmax = sum(acc_per_fold_embedding_dnn)/len(acc_per_fold_embedding_dnn)
auc_embedding_dnn_overall_minmax = sum(auc_per_fold_embedding_dnn)/len(auc_per_fold_embedding_dnn)


print('')
print(rule)
print(rule)
print(f'Results for Embedding Neural Network')
print(rule)
print('')
print(f'Average accuracy over all folds for Embedding Neural Network is {accuracy_embedding_dnn_overall_minmax}%')
print(f'Average area under curve (AUC) over all folds for Embedding Neural Network is {auc_embedding_dnn_overall_minmax}')
print(rule)
print(rule)
print('')

## XGBoost Model for Binary Classification (using One-Hot Encoded Data)

### Model Architecture

In [None]:
# Gradient-boosted tree classifier: 100 shallow trees (depth 3), learning rate 0.01.
xgb_hyperparameters = {
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 100,
}
xg_model = XGBClassifier(**xgb_hyperparameters)

### Model training using Resampling and Stratified-KFold 

In [None]:
# Stratified 5-fold cross-validation of the XGBoost classifier on the one-hot
# encoded feature vectors. Only the training part of a fold is resampled.
skf = StratifiedKFold(n_splits=5)

X = np.array(df_prepared_onehot['onehot_feature_vector'].tolist())
y = np.array(df_prepared_onehot['stroke'])

# Each feature vector holds the one-hot columns first and the scaled numerical
# features last.
n_onehot = X.shape[1] - len(numerical_data)
rule = '------------------------------------------------------------------------'

acc_per_fold_xg = list()
loss_per_fold_xg = list()
auc_per_fold_xg = list()

for fold, (train_index, test_index) in enumerate (skf.split(X, y)):
    features_onehot_resampled, target_onehot_resampled = pipline.fit_resample(X[train_index], y[train_index])
    # The resampler may create synthetic rows with fractional indicator
    # values. Round all one-hot columns (not only the first few) back to 0/1.
    # NOTE(review): rounding each indicator on its own does not guarantee
    # exactly one active category per feature in a synthetic row.
    features_onehot_resampled[:, :n_onehot] = np.around(features_onehot_resampled[:, :n_onehot], 0)

    # fit() trains the classifier again on this fold's training data
    xg_model.fit(np.array(features_onehot_resampled), target_onehot_resampled)

    target_predictions_model_xg = xg_model.predict(X[test_index])
    # positive-class probability, needed for a meaningful ROC curve
    target_probabilities_model_xg = xg_model.predict_proba(X[test_index])[:, 1]

    accuracy_xg = accuracy_score(y[test_index], target_predictions_model_xg)
    acc_per_fold_xg.append(accuracy_xg * 100.0)

    print('')
    print('')
    print('')
    print(rule)
    print(rule)
    print(f'Training for XGBoost fold {fold} ...')
    print(rule)
    print(rule)
    print('')

    # model evaluation

    print(f'Accuracy for fold {fold}: {accuracy_xg * 100.0}%')
    print('')

    print(f'Confusion matrix {confusion_matrix(y[test_index], target_predictions_model_xg)} for fold {fold}')
    print('')

    print(f'Classification report {classification_report(y[test_index], target_predictions_model_xg)} for fold {fold}')
    print('')

    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point.
    fpr, tpr, thresholds = roc_curve(y[test_index], target_probabilities_model_xg)
    roc_auc = auc(fpr, tpr)
    auc_per_fold_xg.append(roc_auc * 100)

    print(f'Area Under the ROC Curve (AUC) {roc_auc} for fold {fold}')
    print('')
    print(rule)
    print(rule)
    print('')

# averages over all folds (both are on a 0-100 scale)
accuracy_xg_overall = sum(acc_per_fold_xg)/len(acc_per_fold_xg)
auc_xg_overall = sum(auc_per_fold_xg)/len(auc_per_fold_xg)


print('')
print(rule)
print(rule)
print(f'Results for XGBoost Classifier')
print(rule)
print('')
print(f'Average accuracy over all folds for XGBoost is {accuracy_xg_overall}%')
print(f'Average area under curve (AUC) over all folds for XGBoost is {auc_xg_overall}')
print(rule)
print(rule)
print('')

## Deep Neural Network for Binary Classification (using One-Hot Encoded Data)

### Model Architecture

In [None]:
# Stop training once the training loss has not improved for 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)

# Width of the input layer = length of one combined feature vector
# (one-hot columns + numerical features). Deriving it from the prepared data
# keeps the model in sync if the set of categories changes.
n_input_features = len(df_prepared_onehot['onehot_feature_vector'].iloc[0])

# Plain feed-forward network: two hidden ReLU layers and a sigmoid output.
dnn_model = Sequential()
dnn_model.add(Dense(16, input_dim=n_input_features, activation='relu'))
dnn_model.add(Dense(16, activation='relu'))
dnn_model.add(Dense(1, activation='sigmoid'))

dnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Render the network graph (with tensor shapes) to dnn_model.png.
plot_model(
    dnn_model,
    to_file='dnn_model.png',
    show_shapes=True,
)

### Model training using Resampling and Stratified-KFold 

In [None]:
# Stratified 5-fold cross-validation of the feed-forward network on the
# one-hot encoded feature vectors. Only the training part of a fold is
# resampled.
skf = StratifiedKFold(n_splits=5)

X = np.array(df_prepared_onehot['onehot_feature_vector'].tolist())
y = np.array(df_prepared_onehot['stroke'])

# Each feature vector holds the one-hot columns first and the scaled numerical
# features last.
n_onehot = X.shape[1] - len(numerical_data)
rule = '------------------------------------------------------------------------'

acc_per_fold_dnn = list()
loss_per_fold_dnn = list()
auc_per_fold_dnn = list()

for fold, (train_index, test_index) in enumerate (skf.split(X, y)):
    features_onehot_resampled, target_onehot_resampled = pipline.fit_resample(X[train_index], y[train_index])
    # The resampler may create synthetic rows with fractional indicator
    # values. Round all one-hot columns (not only the first few) back to 0/1.
    # NOTE(review): rounding each indicator on its own does not guarantee
    # exactly one active category per feature in a synthetic row.
    features_onehot_resampled[:, :n_onehot] = np.around(features_onehot_resampled[:, :n_onehot], 0)

    # Start every fold from newly initialised weights. Continuing to train the
    # network fitted in the previous fold would leak this fold's validation
    # rows (they were training rows before) into the evaluation.
    dnn_model = keras.models.clone_model(dnn_model)
    dnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    dnn_model.fit(np.array(features_onehot_resampled), target_onehot_resampled, epochs=150, batch_size=16, verbose=0, callbacks=[callback])

    scores = dnn_model.evaluate(x=X[test_index], y=y[test_index], verbose=0)

    acc_per_fold_dnn.append(scores[1] * 100)
    loss_per_fold_dnn.append(scores[0])

    # keep the probabilities for the ROC curve; threshold them at 0.5 for the
    # confusion matrix and the classification report
    target_probabilities_model_dnn = dnn_model.predict(X[test_index])
    target_predictions_model_dnn = (target_probabilities_model_dnn > 0.5).astype(int)

    print('')
    print('')
    print('')
    print(rule)
    print(rule)
    print(f'Training for Deep Neural Network Fold {fold} ...')
    print(rule)
    print(rule)
    print('')

    #model evaluation

    print(f'Score for fold {fold} using evaluate function: {dnn_model.metrics_names[0]} of {scores[0]}; {dnn_model.metrics_names[1]} of {scores[1]*100}%')
    print('')

    print(f'Confusion matrix {confusion_matrix(y[test_index], target_predictions_model_dnn.ravel())} for fold {fold}')
    print('')

    print(f'Classification report {classification_report(y[test_index], target_predictions_model_dnn.ravel())} for fold {fold}')
    print('')

    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point.
    fpr, tpr, thresholds = roc_curve(y[test_index], target_probabilities_model_dnn.ravel())
    roc_auc = auc(fpr, tpr)
    auc_per_fold_dnn.append(roc_auc*100)

    print(f'Area Under the ROC Curve (AUC) {roc_auc} for fold {fold}')
    print('')
    print(rule)
    print(rule)
    print('')


# averages over all folds (both are on a 0-100 scale)
accuracy_dnn_overall = sum(acc_per_fold_dnn)/len(acc_per_fold_dnn)
auc_dnn_overall = sum(auc_per_fold_dnn)/len(auc_per_fold_dnn)


print('')
print(rule)
print(rule)
print(f'Results for Deep Neural Network')
print(rule)
print('')
print(f'Average accuracy over all folds for Deep Neural Network is {accuracy_dnn_overall}%')
print(f'Average area under curve (AUC) over all folds for Deep Neural Network is {auc_dnn_overall}')
print(rule)
print(rule)
print('')

## Random Forest Classifier for Binary Classification

### Model Architecture

In [None]:
# Random forest with 100 trees of depth <= 5. The seed is fixed so that the
# cross-validation results are reproducible from run to run (33 matches the
# seed already used for the resampling steps).
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, verbose=1, random_state=33)

### Model training using Resampling and Stratified-KFold 

In [None]:
# Stratified 5-fold cross-validation of the random forest on the
# label-encoded, standard-scaled features. Only the training part of a fold is
# resampled.
skf = StratifiedKFold(n_splits=5)

X = np.array(df_prepared_nominal_standard[categorical_data+numerical_data])
y = np.array(df_prepared_nominal_standard['stroke'])

# Column layout of X: the label-encoded categorical features come first,
# followed by the scaled numerical features.
n_categorical = len(categorical_data)
rule = '------------------------------------------------------------------------'

acc_per_fold_rf = list()
loss_per_fold_rf = list()
auc_per_fold_rf = list()

for fold, (train_index, test_index) in enumerate (skf.split(X, y)):
    features_nominal_resampled, target_nominal_resampled = pipline.fit_resample(X[train_index], y[train_index])
    # The resampler may create synthetic rows with fractional category codes.
    # Round every categorical column (not just a fixed number of them) back to
    # whole codes.
    features_nominal_resampled[:, :n_categorical] = np.around(features_nominal_resampled[:, :n_categorical], 0)

    # fit() trains the forest again on this fold's training data
    rf_model.fit(np.array(features_nominal_resampled), target_nominal_resampled)

    target_predictions_model_rf = rf_model.predict(X[test_index])
    # positive-class probability, needed for a meaningful ROC curve
    target_probabilities_model_rf = rf_model.predict_proba(X[test_index])[:, 1]

    accuracy_rf = accuracy_score(y[test_index], target_predictions_model_rf)
    acc_per_fold_rf.append(accuracy_rf * 100.0)

    print('')
    print('')
    print('')
    print(rule)
    print(rule)
    print(f'Training for Random Forest fold {fold} ...')
    print(rule)
    print(rule)
    print('')

    #model evaluation

    print(f'Accuracy for fold {fold}: {accuracy_rf * 100.0}%')
    print('')

    print(f'Confusion matrix {confusion_matrix(y[test_index], target_predictions_model_rf)} for fold {fold}')
    print('')

    print(f'Classification report {classification_report(y[test_index], target_predictions_model_rf)} for fold {fold}')
    print('')

    # The ROC curve needs continuous scores: with hard 0/1 predictions it
    # collapses to a single operating point.
    fpr, tpr, thresholds = roc_curve(y[test_index], target_probabilities_model_rf)
    roc_auc = auc(fpr, tpr)
    auc_per_fold_rf.append(roc_auc*100)

    print(f'Area Under the ROC Curve (AUC) {roc_auc} for fold {fold}')
    print('')
    print(rule)
    print(rule)
    print('')


# averages over all folds (both are on a 0-100 scale)
accuracy_rf_overall = sum(acc_per_fold_rf)/len(acc_per_fold_rf)
auc_rf_overall = sum(auc_per_fold_rf)/len(auc_per_fold_rf)


print('')
print(rule)
print(rule)
print(f'Results for Random Forest Classifier')
print(rule)
print('')
print(f'Average accuracy over all folds for Random Forest is {accuracy_rf_overall}%')
print(f'Average area under curve (AUC) over all folds for Random Forest is {auc_rf_overall}')
print(rule)
print(rule)
print('')


# Model Comparison and Conclusion

In [None]:
# Cross-validated mean accuracy per model, keyed by display name.
accuracy_dict = {
    'XGBoost': accuracy_xg_overall,
    'Random Forest': accuracy_rf_overall,
    'Deep Neural Network': accuracy_dnn_overall,
    'Embedding Neural Network (standard scaled)': accuracy_embedding_dnn_overall_standard,
    'Embedding Neural Network (minmax scaled)': accuracy_embedding_dnn_overall_minmax,
}

# Cross-validated mean AUC per model, same keys and order as accuracy_dict.
auc_dict = {
    'XGBoost': auc_xg_overall,
    'Random Forest': auc_rf_overall,
    'Deep Neural Network': auc_dnn_overall,
    'Embedding Neural Network (standard scaled)': auc_embedding_dnn_overall_standard,
    'Embedding Neural Network (minmax scaled)': auc_embedding_dnn_overall_minmax,
}

# Model names and AUC values, both ordered from best to worst AUC.
sorted_models = sorted(auc_dict.keys(), key=lambda model_name: auc_dict[model_name], reverse=True)
sorted_auc = sorted(auc_dict.values(), reverse=True)

In [None]:
# Horizontal grouped bar chart comparing accuracy and AUC of all models.
color1 = px.colors.qualitative.Dark2[6]
color2 = px.colors.qualitative.Bold[7]

fig = make_subplots(rows=1, cols=1, shared_xaxes=True, shared_yaxes=True, vertical_spacing=0.001, horizontal_spacing=0.1)

# add_trace(..., row=, col=) replaces the deprecated Figure.append_trace.
fig.add_trace(
    go.Bar(
        x=list(accuracy_dict.values()),
        y=list(accuracy_dict.keys()),
        name='Accuracy',
        orientation='h',
        marker_color=color1,
    ),
    row=1, col=1,
)

fig.add_trace(
    go.Bar(
        x=list(auc_dict.values()),
        y=list(auc_dict.keys()),
        name='AUC',
        orientation='h',
        marker_color=color2,
    ),
    row=1, col=1,
)

fig.update_layout(
    title='Comparison of different models',
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
    ),
    # Fixed 0-100 axis: the plotted values are expected to be percentages.
    xaxis_range=[0, 100],
)


fig.show()

# Print the AUC-based ranking of all models followed by the written conclusion.
print('------------------------------------------------------------------------')
print('------------------------------------------------------------------------')
print(f'Ranking of the trained classifiers:')
# Number the entries from the list itself so that every model gets its own
# rank (the fifth entry used to be labelled "4." a second time).
for rank, model_name in enumerate(sorted_models, start=1):
    print(f'{rank}. {model_name}')
print('------------------------------------------------------------------------')
print('------------------------------------------------------------------------')
print('')

# AUC of the top-ranked model. Looking it up by name keeps the reported number
# tied to the model that is recommended, instead of to whichever entry happens
# to be first in auc_dict.
best_model = sorted_models[0]
best_auc = round(auc_dict[best_model], 2)

print(f'Conclusion:')
print(f'Due to the very biased dataset high accuracy alone cannot be considered as a feasible metric to evaluate the proposed models.')
print(f'Even a missclassification of all true positive occasions of strokes would lead to a very high accuracy of >95% since the minority class covers less than 5% of the samples.')
print(f'Also when dealing with medical data one should never rely on accurcy or precision only since recall is a very crucial metric provding the necessary insight about the proportion of actual positives identified correctly.')
print(f'For that reason to compare the different here proposed models the AUC ROC was determined as a performance measurement for the classification problem.')
print(f'Based on that with an AUC ROC of {best_auc} the {best_model} can be recommended as the best performing model for the here addressed classification task.')
print(f'One should keep in mind that with an AUC ROC of {best_auc} the model should still be improved before taking it into inference.')
print(f'Nevertheless the model can already provdide an assistance to indicate the stroke risk.')