In [2]:
#!/usr/bin/python3

import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
import missingno
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, average_precision_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.utils import shuffle

from imblearn.under_sampling import RandomUnderSampler

from tensorflow_addons import losses
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report

# Load the stroke-prediction dataset from the csv file into a dataframe
df = pandas.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

# Display summary of the dataframe.
# DataFrame.info() writes the summary itself and returns None, so it must not be
# wrapped in print() (that would emit an extra 'None' line after the summary).
df.info()

# By manual inspection, the 'id' column was judged not to contain any relevant information,
# so it is removed. The result is re-bound instead of mutating the frame in place.
df = df.drop(columns = 'id')

# Distribution of the 'gender' attribute (shown as the cell output)
df['gender'].value_counts()

In [3]:
# Look at the 'age' attribute values in ascending order
df['age'].sort_values()
# Ages of the rows whose 'stroke' label is 1
temp_df_age = df.loc[df['stroke'] == 1, 'age']

# Six smallest ages among those rows
temp_df_age.nsmallest(6)

# Very small ages (such as 1.32) look invalid, so only the rows with an age above 13 are kept
df = df.loc[df['age'] > 13]

In [4]:
# Check the correlation among the numeric features.
# Only the numeric columns are handed to corr(): the dataframe still contains string-valued
# columns at this point (e.g. 'gender'), and pandas >= 2.0 raises on such columns instead of
# silently skipping them as older versions did.
seaborn.heatmap(df.select_dtypes('number').corr(), annot = True)

In [5]:
# Only 1 row has the gender 'Other'; it is not of much relevance to the analysis and prediction, so it is dropped
df = df.drop(index = df.index[df['gender'] == 'Other'])

def find_columns_with_null_value(dataframe):
    '''
    Checks which columns in the dataset have at least one null (missing) value.

    For every such column, a line with its number of missing values is printed.

    Parameters:
        dataframe: the pandas DataFrame to inspect

    Returns:
        A list with the labels of the columns that contain at least one null value,
        in the same order as the columns appear in the dataframe.
    '''
    # Count the nulls of every column exactly once (instead of re-counting the whole frame
    # on each loop iteration) and pair them with the labels by position, so that integer
    # column labels can never be confused with positions.
    null_counts = dataframe.isna().sum().tolist()
    result = []
    for column_label, n_missing in zip(dataframe.columns, null_counts):
        if n_missing > 0:
            result.append(column_label)
            print('Number of missing values in ' + str(column_label) + ' = ' + str(n_missing))
    return result

def fill_missing_values_by_median(dataframe, column_name):
    '''
    Fills the missing values in a column by the median value of that column.

    Parameters:
        dataframe: the pandas DataFrame to modify (the column is replaced on this object)
        column_name: label of the column whose missing values are filled

    Returns:
        The same dataframe object, with the given column filled.

    Reference: https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns
    '''
    # Use the 'column_name' parameter; the name 'column' is not defined inside this function
    # and would only resolve if a global of that name happened to exist.
    column_median = dataframe[column_name].median()
    # Assign the filled column back: fillna(inplace = True) on the selected column can act on
    # a temporary object and leave the dataframe itself unchanged.
    dataframe[column_name] = dataframe[column_name].fillna(column_median)
    return dataframe

# Relation between gender and stroke: number of rows per gender, split by the 'stroke' label
seaborn.countplot(data = df, x = 'gender', hue = 'stroke', palette = ['yellow', 'red'])

# Size of the dataframe, followed by the columns that contain missing values
print(f'Number of rows in the dataframe: {len(df)}')
print(f'Column(s) with atleast one NULL value (missing value): {find_columns_with_null_value(df)}')

In [6]:
def identify_categorical_columns(dataframe):
    '''
    Identifies the categorical columns of the dataframe, i.e. the columns that are not numeric.

    This function assumes that there does not exist any column in the dataset which has
    numerical values and which is categorical.

    Parameters:
        dataframe: the pandas DataFrame to inspect

    Returns:
        A list with the labels of the non-numeric columns, each label once, in the order
        in which the columns appear in the dataframe.

    Reference: https://stackoverflow.com/questions/29803093/check-which-columns-in-dataframe-are-categorical
    '''
    numerical_columns = set(dataframe.select_dtypes('number').columns)
    # dict.fromkeys removes repeated labels while keeping the column order, so the result is
    # reproducible (iterating over a set of strings yields a different order from run to run).
    unique_columns = dict.fromkeys(dataframe.columns)
    return [column for column in unique_columns if column not in numerical_columns]

# Report which columns hold categorical values
print(f'Categorical Columns: {identify_categorical_columns(df)}')

In [7]:
# Randomly shuffling the dataframe (currently disabled)
# df = shuffle(df, random_state = 42)

# Encoding plan:
# - 'work_type' and 'smoking_status' have many possible values, so these columns are
#   encoded using one-hot encoding.
# - 'gender', 'ever_married', and 'Residence_type' are transformed into binary features
#   using the LabelEncoder.
#
# References:
# 1. https://towardsdatascience.com/handling-categorical-data-the-right-way-9d1279956fc6
# 2. https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
# 3. https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# One-hot columns for 'work_type'; the original column is then removed from the dataframe
temp_columns = pandas.get_dummies(df['work_type'])
df = df.drop(columns = 'work_type')

# One-hot columns for 'smoking_status'
temp_df2 = pandas.get_dummies(df['smoking_status'])

# The remaining columns are picked by position (after 'work_type' was removed):
# the first five columns, and the columns at positions 5 to 7.
# NOTE(review): this relies on the column order of the csv file - confirm if the input changes.
temp_df3 = df.iloc[:, 5:8]

# Assemble the encoded dataframe: first five columns, 'work_type' dummies, columns 5 to 7,
# 'smoking_status' dummies and finally the 'stroke' label
temp_df1 = pandas.concat(
    [df.iloc[:, :5], temp_columns, temp_df3, temp_df2, df['stroke']],
    axis = 1,
)

# Label-encode 'gender', 'ever_married' and 'Residence_type'; the same encoder object is
# re-fitted for every column
label_encoder = LabelEncoder()
for binary_column in ('gender', 'ever_married', 'Residence_type'):
    temp_df1[binary_column] = label_encoder.fit_transform(temp_df1[binary_column])

df = temp_df1

df

In [8]:
# Check whether the dataset is balanced or imbalanced in terms of output labels.

# The counts are sorted by label value and the wedge labels are built from that same index,
# so every wedge is guaranteed to carry the label of its own class. value_counts() alone
# orders by frequency, which matches a fixed ['stroke = 0', 'stroke = 1'] list only as long
# as 0 is the more frequent value.
total_instances_per_value = df['stroke'].value_counts().sort_index()
data_balance_check_labels = ['stroke = ' + str(label_value) for label_value in total_instances_per_value.index]
pie_chart_colors = ['orange', 'red']
plt.figure(figsize = (6, 6))
# explode = (0.1, 0) pulls the first wedge slightly out of the pie
plt.pie(total_instances_per_value, labels = data_balance_check_labels, shadow = True, explode = (0.1, 0),
        autopct = '%1.2f%%', colors = pie_chart_colors)
plt.title('Percentage of rows in the dataset where stroke = 0 and stroke = 1')
plt.show()

In [9]:
# This is the main problem in the dataset - the dataset is imbalanced. This might lead to
# overfitting, and the resulting model will be biased towards predicting 'no stroke' (stroke = 0).

# Splitting the dataset into the train data (80%) and the test data (20%)
# Reference: https://stackoverflow.com/questions/42191717/scikit-learn-random-state-in-splitting-dataset
train_data, test_data = train_test_split(df, test_size = 0.2, random_state = 42)

# Separating the input features and the output label from the training data and the test data.
# The labels are kept as single-column dataframes.
X_train_data = train_data.drop('stroke', axis = 1)
y_train_data = train_data[['stroke']]
X_test_data = test_data.drop('stroke', axis = 1)
y_test_data = test_data[['stroke']]

# If there are any missing values in any column, fill them by the median value of that column.
# If we do this before the train-test split, there is a possibility of data leakage.
# For the same reason the medians are computed on the training data only and then applied to
# both splits, so the test data receives exactly the preprocessing learned from the training data.
# References:
# 1. https://www.analyticsvidhya.com/blog/2021/05/dealing-with-missing-values-in-python-a-complete-guide/
# 2. https://www.analyticsvidhya.com/blog/2021/07/data-leakage-and-its-effect-on-the-performance-of-an-ml-model/
train_null_columns = find_columns_with_null_value(X_train_data)
test_null_columns = find_columns_with_null_value(X_test_data)
columns_to_impute = train_null_columns + [
    column_label for column_label in test_null_columns if column_label not in train_null_columns
]
if columns_to_impute:
    # fillna() with a Series fills each column with the value stored under that column's label
    train_medians = X_train_data[columns_to_impute].median()
    X_train_data = X_train_data.fillna(train_medians)
    X_test_data = X_test_data.fillna(train_medians)

# Number of rows with stroke = 1 in the training data and in the test data
print(y_train_data['stroke'].sum())
print(y_test_data['stroke'].sum())


In [10]:
# Fit the min-max scaler on the training features, then scale both the train and the test
# features with that fitted scaler
data_scaler = MinMaxScaler().fit(X_train_data)
X_train_data_scaled = data_scaler.transform(X_train_data)
X_test_data_scaled = data_scaler.transform(X_test_data)

In [11]:
# Balance the training data by oversampling with the Synthetic Minority Oversampling Technique (SMOTE).
# Only the training data is resampled.
# Reference: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
smote_for_oversampling = SMOTE(random_state = 42)
X_train_oversampled, y_train_oversampled = smote_for_oversampling.fit_resample(
    X = X_train_data_scaled,
    y = y_train_data,
)
print(y_train_oversampled.value_counts())

# Balance the training data the other way round: random under-sampling randomly selects
# rows of the majority class
# NOTE(review): replacement = True presumably draws the kept rows with replacement - confirm this is intended
random_under_sampler = RandomUnderSampler(replacement = True, random_state = 42)
X_train_undersampled, y_train_undersampled = random_under_sampler.fit_resample(
    X = X_train_data_scaled,
    y = y_train_data,
)
print(y_train_undersampled.value_counts())

In [12]:
# References:
# 1. https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# 2. https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976
# 3. https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9
# 4. https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html

# Logistic regression fitted on the oversampled training data and evaluated on the scaled test data
logistic_reg = LogisticRegression(random_state = 42)
logistic_reg.fit(X_train_oversampled, y_train_oversampled.to_numpy().ravel())
y_predicted_lr = logistic_reg.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_lr = confusion_matrix(y_test_data, y_predicted_lr)
print('Confusion Matrix: ')
print(confusion_matrix_lr)

# score() yields the accuracy as a fraction, hence the factor of 100
print('Accuracy (in %):', logistic_reg.score(X_test_data_scaled, y_test_data) * 100)

precision_lr = precision_score(y_test_data, y_predicted_lr)
print('Precision:', precision_lr)

recall_lr = recall_score(y_test_data, y_predicted_lr)
print('Recall:', recall_lr)

f1_lr = f1_score(y_test_data, y_predicted_lr)
print('F1 score:', f1_lr)

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(
    confusion_matrix_lr,
    cmap = 'Blues',
    annot = True,
    fmt = 'd',
    linewidths = 3,
    cbar = True,
    annot_kws = {'fontsize': 17},
    xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'],
    yticklabels = ['No stroke', 'Stroke'],
)
plt.yticks(rotation = 0)
plt.show()

In [13]:
# Training and testing a SVM model on the oversampled training data
svc = SVC(random_state = 42)
svc.fit(X_train_oversampled, y_train_oversampled.values.ravel())
y_predicted_svc = svc.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_svc = confusion_matrix(y_test_data, y_predicted_svc)
print('Confusion Matrix: ')
print(confusion_matrix_svc)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(svc.score(X_test_data_scaled, y_test_data) * 100))

precision_svc = precision_score(y_test_data, y_predicted_svc)
print('Precision: ' + str(precision_svc))

recall_svc = recall_score(y_test_data, y_predicted_svc)
print('Recall: ' + str(recall_svc))

f1_svc = f1_score(y_test_data, y_predicted_svc)
print('F1 score: ' + str(f1_svc))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_svc, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()

In [14]:
# Logistic regression fitted on the undersampled training data and evaluated on the scaled test data
logistic_reg_undersampled = LogisticRegression(random_state = 42)
logistic_reg_undersampled.fit(X_train_undersampled, y_train_undersampled.to_numpy().ravel())
y_predicted_lr_us = logistic_reg_undersampled.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_lr_us = confusion_matrix(y_test_data, y_predicted_lr_us)
print('Confusion Matrix: ')
print(confusion_matrix_lr_us)

# score() yields the accuracy as a fraction, hence the factor of 100
print('Accuracy (in %):', logistic_reg_undersampled.score(X_test_data_scaled, y_test_data) * 100)

precision_lr_us = precision_score(y_test_data, y_predicted_lr_us)
print('Precision:', precision_lr_us)

recall_lr_us = recall_score(y_test_data, y_predicted_lr_us)
print('Recall:', recall_lr_us)

f1_lr_us = f1_score(y_test_data, y_predicted_lr_us)
print('F1 score:', f1_lr_us)

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(
    confusion_matrix_lr_us,
    cmap = 'Blues',
    annot = True,
    fmt = 'd',
    linewidths = 3,
    cbar = True,
    annot_kws = {'fontsize': 17},
    xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'],
    yticklabels = ['No stroke', 'Stroke'],
)
plt.yticks(rotation = 0)
plt.show()

In [15]:
# Training and testing a SVM model on the undersampled training data
svc_undersampled = SVC(random_state = 42)
svc_undersampled.fit(X_train_undersampled, y_train_undersampled.values.ravel())
y_predicted_svc_us = svc_undersampled.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_svc_us = confusion_matrix(y_test_data, y_predicted_svc_us)
print('Confusion Matrix: ')
print(confusion_matrix_svc_us)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(svc_undersampled.score(X_test_data_scaled, y_test_data) * 100))

precision_svc_us = precision_score(y_test_data, y_predicted_svc_us)
print('Precision: ' + str(precision_svc_us))

recall_svc_us = recall_score(y_test_data, y_predicted_svc_us)
print('Recall: ' + str(recall_svc_us))

f1_svc_us = f1_score(y_test_data, y_predicted_svc_us)
print('F1 score: ' + str(f1_svc_us))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_svc_us, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()

In [None]:
# Training and testing neural network on the oversampled training data

# Fix TensorFlow's global seed so that repeated runs start from the same random state,
# in line with the random_state = 42 used for the other models
tf.random.set_seed(42)

# The input width is read from the training matrix instead of being hard-coded, so the
# network keeps working if the number of encoded feature columns changes
nn_model = keras.Sequential([
    keras.layers.Dense(17, input_dim=X_train_oversampled.shape[1], activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.001, beta_1 = 0.9, beta_2=0.99 , epsilon=1e-05,amsgrad=False,name='Adam'
        ), loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_oversampled, y_train_oversampled.values.ravel(), epochs=500)

# evaluate() returns the loss followed by the compiled metrics (here: accuracy)
print(nn_model.evaluate(X_test_data_scaled, y_test_data))

# The sigmoid output is a value between 0 and 1; values above 0.5 are mapped to class 1.
# The result is flattened to a 1-D integer array so that it has the same form as the labels.
y_predicted_nn_model = nn_model.predict(X_test_data_scaled)
y_predicted_nn_model = (y_predicted_nn_model > 0.5).astype(int).ravel()

print("Classification Report: \n", classification_report(y_test_data, y_predicted_nn_model))

In [None]:
# Training and testing neural network on the undersampled training data

# Fix TensorFlow's global seed so that repeated runs start from the same random state,
# in line with the random_state = 42 used for the other models
tf.random.set_seed(42)

# The input width is read from the training matrix instead of being hard-coded, so the
# network keeps working if the number of encoded feature columns changes
nn_model_us = keras.Sequential([
    keras.layers.Dense(17, input_dim=X_train_undersampled.shape[1], activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

nn_model_us.compile(optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.001, beta_1 = 0.9, beta_2=0.99 , epsilon=1e-05,amsgrad=False,name='Adam'
        ), loss='binary_crossentropy', metrics=['accuracy'])
nn_model_us.fit(X_train_undersampled, y_train_undersampled.values.ravel(), epochs=500)

# evaluate() returns the loss followed by the compiled metrics (here: accuracy)
print(nn_model_us.evaluate(X_test_data_scaled, y_test_data))

# The sigmoid output is a value between 0 and 1; values above 0.5 are mapped to class 1.
# The result is flattened to a 1-D integer array so that it has the same form as the labels.
y_predicted_nn_model_us = nn_model_us.predict(X_test_data_scaled)
y_predicted_nn_model_us = (y_predicted_nn_model_us > 0.5).astype(int).ravel()

print("Classification Report: \n", classification_report(y_test_data, y_predicted_nn_model_us))

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [22]:
# Training and testing decision trees on the oversampled training data
decision_tree_model = DecisionTreeClassifier(criterion="gini", random_state=42, min_samples_leaf=5, max_depth=3)
decision_tree_model.fit(X_train_oversampled, y_train_oversampled.values.ravel())
y_predicted_dt = decision_tree_model.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_dt = confusion_matrix(y_test_data, y_predicted_dt)
print('Confusion Matrix: ')
print(confusion_matrix_dt)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(decision_tree_model.score(X_test_data_scaled, y_test_data) * 100))

precision_dt = precision_score(y_test_data, y_predicted_dt)
print('Precision: ' + str(precision_dt))

recall_dt = recall_score(y_test_data, y_predicted_dt)
print('Recall: ' + str(recall_dt))

f1_dt = f1_score(y_test_data, y_predicted_dt)
print('F1 score: ' + str(f1_dt))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_dt, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()

In [23]:
# Training and testing decision trees on the undersampled training data
decision_tree_model_us = DecisionTreeClassifier(criterion="gini", random_state=42, min_samples_leaf=5, max_depth=3)
decision_tree_model_us.fit(X_train_undersampled, y_train_undersampled.values.ravel())
y_predicted_dt_us = decision_tree_model_us.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_dt_us = confusion_matrix(y_test_data, y_predicted_dt_us)
print('Confusion Matrix: ')
print(confusion_matrix_dt_us)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(decision_tree_model_us.score(X_test_data_scaled, y_test_data) * 100))

precision_dt_us = precision_score(y_test_data, y_predicted_dt_us)
print('Precision: ' + str(precision_dt_us))

recall_dt_us = recall_score(y_test_data, y_predicted_dt_us)
print('Recall: ' + str(recall_dt_us))

f1_dt_us = f1_score(y_test_data, y_predicted_dt_us)
print('F1 score: ' + str(f1_dt_us))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_dt_us, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()

In [24]:
from sklearn.ensemble import AdaBoostClassifier

# Training and testing adaboost on the oversampled training data.
# random_state is fixed to 42 like for the other models, so that repeated runs are reproducible.
adaboost_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)
# Train Adaboost Classifier (fit() returns the fitted estimator)
adaboost_model = adaboost_classifier.fit(X_train_oversampled, y_train_oversampled.values.ravel())

y_predicted_adaboost = adaboost_model.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_adaboost = confusion_matrix(y_test_data, y_predicted_adaboost)
print('Confusion Matrix: ')
print(confusion_matrix_adaboost)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(adaboost_model.score(X_test_data_scaled, y_test_data) * 100))

precision_adaboost = precision_score(y_test_data, y_predicted_adaboost)
print('Precision: ' + str(precision_adaboost))

recall_adaboost = recall_score(y_test_data, y_predicted_adaboost)
print('Recall: ' + str(recall_adaboost))

f1_adaboost = f1_score(y_test_data, y_predicted_adaboost)
print('F1 score: ' + str(f1_adaboost))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_adaboost, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()

In [25]:
# Training and testing adaboost on the undersampled training data.
# random_state is fixed to 42 like for the other models, so that repeated runs are reproducible.
adaboost_classifier_us = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)
# Train Adaboost Classifier (fit() returns the fitted estimator)
adaboost_model_us = adaboost_classifier_us.fit(X_train_undersampled, y_train_undersampled.values.ravel())

y_predicted_adaboost_us = adaboost_model_us.predict(X_test_data_scaled)

# Confusion matrix of the test labels against the predictions
confusion_matrix_adaboost_us = confusion_matrix(y_test_data, y_predicted_adaboost_us)
print('Confusion Matrix: ')
print(confusion_matrix_adaboost_us)

# score() returns the accuracy as a fraction between 0 and 1; it is scaled by 100 so that the
# printed value matches the '(in %)' label (and the logistic regression cells)
print('Accuracy (in %): ' + str(adaboost_model_us.score(X_test_data_scaled, y_test_data) * 100))

precision_adaboost_us = precision_score(y_test_data, y_predicted_adaboost_us)
print('Precision: ' + str(precision_adaboost_us))

recall_adaboost_us = recall_score(y_test_data, y_predicted_adaboost_us)
print('Recall: ' + str(recall_adaboost_us))

f1_adaboost_us = f1_score(y_test_data, y_predicted_adaboost_us)
print('F1 score: ' + str(f1_adaboost_us))

# Confusion matrix drawn as an annotated heatmap
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize = (10, 7))
seaborn.heatmap(confusion_matrix_adaboost_us, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 3, cbar = True, annot_kws = {'fontsize': 17},
                xticklabels = ['No stroke (predicted)', 'Stroke (predicted)'], yticklabels = ['No stroke', 'Stroke'])
plt.yticks(rotation = 0)
plt.show()