In [541]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pickle

In [542]:
# Load the cleaned dataset from the Excel workbook and preview its first rows.
DATA_PATH = "data_cleaned.xlsx"
df = pd.read_excel(DATA_PATH)
df.head()

Unnamed: 0,S/N,Gender,Age,Education,Occupation,Marital Status,Ethnicity,Smoking,Diabetes_Type1,Diabetes_Type2,...,Insulin,Others,Drug Resistance,Neuropathy,Nephropathy,Retinopathy,CVD,Stroke,Peripheral Artery Disease,Mortality
0,1,F,59,Primary,Business,Married,Ijaw,Nil,Yes,,...,-,Yes,Yes,Nil,Yes,Nil,Nil,Nil,Nil,1.0
1,2,M,73,Secondary,Electrician,Married,Isoko,Yes,Yes,,...,Yes,-,Yes,Yes,Yes,Yes,Nil,Nil,Nil,1.0
2,3,M,51,Primary,B. Man,Married,Igbo,Nil,Yes,,...,-,Yes,-,Nil,Yes,Yes,Nil,Nil,Nil,1.0
3,4,F,84,Tertiary,B. Woman,Widow,Ishekiri,Nil,Yes,Yes,...,Yes,-,-,Yes,Yes,Nil,Nil,Yes,Nil,1.0
4,5,M,56,Secondary,Farmer,Married,Urhobo,Yes,Yes,,...,-,Yes,Yes,Nil,Yes,Nil,Nil,Nil,Yes,1.0


In [543]:
# Remove the "Ethnicity" column from the working frame (mutates df in place).
df.drop(columns=["Ethnicity"], inplace=True)

In [544]:
# Normalise casing: lower-case every object-dtype column.
# NOTE(review): `.str.lower()` yields NaN for entries that are not strings, so
# an object column holding mixed types would lose its non-string values here --
# confirm that every object column is purely textual.
all_cols = df.columns
text_cols = [name for name in all_cols if df[name].dtype == 'object']
for name in text_cols:
    df[name] = df[name].str.lower()

In [545]:
# Treat the placeholder strings "nil" and "-" as missing values.
df.replace(["nil", "-"], np.nan, inplace=True)

In [546]:
# Count missing values per column after the placeholder replacement.
df.isnull().sum()

S/N                            0
Gender                         0
Age                            0
Education                      5
Occupation                     4
Marital Status                 1
Smoking                       84
Diabetes_Type1                43
Diabetes_Type2                94
Gestational                  100
Diagnosis Date                37
Diagnosis Age                 94
Last Visit Date               25
Systolic BP                    0
Diastolic BP                   0
BMI                           50
Haemoglobin A1c                0
Insulin                       92
Others                         3
Drug Resistance               77
Neuropathy                    72
Nephropathy                   63
Retinopathy                   84
CVD                           88
Stroke                        95
Peripheral Artery Disease     92
Mortality                      1
dtype: int64

In [547]:
# Preview the frame after lower-casing and placeholder replacement.
df.head(n=5)

Unnamed: 0,S/N,Gender,Age,Education,Occupation,Marital Status,Smoking,Diabetes_Type1,Diabetes_Type2,Gestational,...,Insulin,Others,Drug Resistance,Neuropathy,Nephropathy,Retinopathy,CVD,Stroke,Peripheral Artery Disease,Mortality
0,1,f,59,primary,business,married,,yes,,,...,,yes,yes,,yes,,,,,1.0
1,2,m,73,secondary,electrician,married,yes,yes,,,...,yes,,yes,yes,yes,yes,,,,1.0
2,3,m,51,primary,b. man,married,,yes,,,...,,yes,,,yes,yes,,,,1.0
3,4,f,84,tertiary,b. woman,widow,,yes,yes,,...,yes,,,yes,yes,,,yes,,1.0
4,5,m,56,secondary,farmer,married,yes,yes,,,...,,yes,yes,,yes,,,,yes,1.0


In [548]:
# Fill missing values with "no" on the yes/no indicator columns: a blank in
# these columns is treated as a negative answer.
fill_negative_columns = ["Smoking", 'Diabetes_Type1', 'Diabetes_Type2', 'Gestational', 'Insulin', 'Others', 'Drug Resistance', 'Neuropathy', 'Nephropathy', 'Retinopathy', 'CVD', "Stroke", 'Peripheral Artery Disease']
for col in fill_negative_columns:
    # Assign the result back rather than calling fillna(..., inplace=True) on
    # `df[col]`: the in-place form mutates a temporary Series, is deprecated
    # in pandas 2.x, and leaves `df` unchanged once Copy-on-Write is active.
    df[col] = df[col].fillna("no")

In [549]:
# Re-count missing values per column after filling the yes/no columns.
df.isnull().sum()

S/N                           0
Gender                        0
Age                           0
Education                     5
Occupation                    4
Marital Status                1
Smoking                       0
Diabetes_Type1                0
Diabetes_Type2                0
Gestational                   0
Diagnosis Date               37
Diagnosis Age                94
Last Visit Date              25
Systolic BP                   0
Diastolic BP                  0
BMI                          50
Haemoglobin A1c               0
Insulin                       0
Others                        0
Drug Resistance               0
Neuropathy                    0
Nephropathy                   0
Retinopathy                   0
CVD                           0
Stroke                        0
Peripheral Artery Disease     0
Mortality                     1
dtype: int64

In [550]:
# Discard rows whose target value ("Mortality") is missing.
df.dropna(subset=["Mortality"], inplace=True)

In [551]:
# Drop the serial-number, date and diagnosis-age columns.
df.drop(columns=["S/N", "Diagnosis Date", "Diagnosis Age", "Last Visit Date"], inplace=True)

# Drop every remaining column that holds exactly one distinct non-null value.
single_valued = [name for name in df.columns if df[name].nunique() == 1]
df.drop(columns=single_valued, inplace=True)

In [552]:
# Fill missing values in the categorical demographic columns with each
# column's most frequent value (the first mode when there are ties).
cols_to_fill_mode = ["Education", "Occupation", "Marital Status"]
for col in cols_to_fill_mode:
    # Assign back instead of fillna(..., inplace=True) on `df[col]`: the
    # in-place form acts on a temporary Series, is deprecated in pandas 2.x,
    # and does not update `df` under Copy-on-Write.
    df[col] = df[col].fillna(df[col].mode().iloc[0])

In [553]:
# Preview the frame after dropping columns and filling missing values.
df.head(n=5)

Unnamed: 0,Gender,Age,Education,Occupation,Marital Status,Smoking,Diabetes_Type1,Diabetes_Type2,Systolic BP,Diastolic BP,...,Insulin,Others,Drug Resistance,Neuropathy,Nephropathy,Retinopathy,CVD,Stroke,Peripheral Artery Disease,Mortality
0,f,59,primary,business,married,no,yes,no,178,121,...,no,yes,yes,no,yes,no,no,no,no,1.0
1,m,73,secondary,electrician,married,yes,yes,no,192,87,...,yes,no,yes,yes,yes,yes,no,no,no,1.0
2,m,51,primary,b. man,married,no,yes,no,163,80,...,no,yes,no,no,yes,yes,no,no,no,1.0
3,f,84,tertiary,b. woman,widow,no,yes,yes,172,91,...,yes,no,no,yes,yes,no,no,yes,no,1.0
4,m,56,secondary,farmer,married,yes,yes,no,100,60,...,no,yes,yes,no,yes,no,no,no,yes,1.0


In [554]:
# Strip the "mg/dl" unit suffix from Haemoglobin A1c and store the values as integers.
hba1c_text = df['Haemoglobin A1c'].str.replace("mg/dl", "", regex=False)
df['Haemoglobin A1c'] = hba1c_text.astype('int')

In [555]:
# Class balance of the target variable.
df["Mortality"].value_counts(dropna=True)

1.0    85
0.0    14
Name: Mortality, dtype: int64

In [556]:
# One-hot encode the predictor columns (target column excluded).
predictors = df.drop(columns=["Mortality"])
ohe_df = pd.get_dummies(predictors)

In [557]:
# Min-max scale every encoded feature and extract the target series.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set value ranges influence how the training data is
# scaled -- confirm this is acceptable for the evaluation.
y = df["Mortality"]
scaler = MinMaxScaler()
X_scaled = scaler.fit(ohe_df).transform(ohe_df)

In [558]:
# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=9,
    stratify=y,
)

In [559]:
# Oversample the training split with SMOTE to balance the classes;
# the test split is left untouched.
sm = SMOTE(random_state=9)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

In [560]:
# Random Forest: fit on the SMOTE-resampled training data, report on the test split.
rfc_model = RandomForestClassifier(random_state=9).fit(X_res, y_res)
y_pred = rfc_model.predict(X_test)
rfc_report = classification_report(y_test, y_pred)
print(rfc_report)

              precision    recall  f1-score   support

         0.0       0.17      0.33      0.22         3
         1.0       0.86      0.71      0.77        17

    accuracy                           0.65        20
   macro avg       0.51      0.52      0.50        20
weighted avg       0.75      0.65      0.69        20



In [561]:
# Gradient Boosting: fit on the SMOTE-resampled training data, report on the test split.
gbc_model = GradientBoostingClassifier(random_state=9).fit(X_res, y_res)
y_pred = gbc_model.predict(X_test)
gbc_report = classification_report(y_test, y_pred)
print(gbc_report)

              precision    recall  f1-score   support

         0.0       0.25      0.67      0.36         3
         1.0       0.92      0.65      0.76        17

    accuracy                           0.65        20
   macro avg       0.58      0.66      0.56        20
weighted avg       0.82      0.65      0.70        20



In [562]:
# Decision Tree: fit on the SMOTE-resampled training data, report on the test split.
dtc_model = DecisionTreeClassifier(random_state=9).fit(X_res, y_res)
y_pred = dtc_model.predict(X_test)
dtc_report = classification_report(y_test, y_pred)
print(dtc_report)

              precision    recall  f1-score   support

         0.0       0.25      0.67      0.36         3
         1.0       0.92      0.65      0.76        17

    accuracy                           0.65        20
   macro avg       0.58      0.66      0.56        20
weighted avg       0.82      0.65      0.70        20



In [563]:
# Support Vector Machine: fit on the SMOTE-resampled training data, report on the test split.
svc_model = SVC(random_state=9).fit(X_res, y_res)
y_pred = svc_model.predict(X_test)
svc_report = classification_report(y_test, y_pred)
print(svc_report)

              precision    recall  f1-score   support

         0.0       0.29      0.67      0.40         3
         1.0       0.92      0.71      0.80        17

    accuracy                           0.70        20
   macro avg       0.60      0.69      0.60        20
weighted avg       0.83      0.70      0.74        20



In [564]:
# Multi-layer perceptron (up to 750 iterations): fit on the SMOTE-resampled
# training data, report on the test split.
nn_model = MLPClassifier(max_iter=750, random_state=9).fit(X_res, y_res)
y_pred = nn_model.predict(X_test)
nn_report = classification_report(y_test, y_pred)
print(nn_report)

              precision    recall  f1-score   support

         0.0       0.17      0.33      0.22         3
         1.0       0.86      0.71      0.77        17

    accuracy                           0.65        20
   macro avg       0.51      0.52      0.50        20
weighted avg       0.75      0.65      0.69        20



In [565]:
# K-nearest neighbours (default settings): fit on the SMOTE-resampled
# training data, report on the test split.
knn_model = KNeighborsClassifier().fit(X_res, y_res)
y_pred = knn_model.predict(X_test)
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

         0.0       0.22      0.67      0.33         3
         1.0       0.91      0.59      0.71        17

    accuracy                           0.60        20
   macro avg       0.57      0.63      0.52        20
weighted avg       0.81      0.60      0.66        20



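Best Model: Random Forest Classifier (note: on the 20-row test split above, the SVM report shows the highest accuracy, 0.70, and macro-average F1, 0.60, versus 0.65 and 0.50 for the Random Forest).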

In [566]:
# Refit the Random Forest on the complete scaled dataset (no SMOTE, no hold-out).
# NOTE: the report printed here is computed on the same rows the model was
# fitted on, so it reflects training fit rather than held-out performance.
rfc_model = RandomForestClassifier(random_state=9).fit(X_scaled, y)
y_pred = rfc_model.predict(X_scaled)
final_report = classification_report(y, y_pred)
print(final_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        14
         1.0       1.00      1.00      1.00        85

    accuracy                           1.00        99
   macro avg       1.00      1.00      1.00        99
weighted avg       1.00      1.00      1.00        99



In [567]:
# Persist the artefacts written by this notebook: the raw feature column
# names, the one-hot-encoded column names, the fitted scaler and the final model.

with open('df_col_names.pkl', "wb") as f:
    pickle.dump(df.drop("Mortality", axis=1).columns, f)

with open("ohe_df_col_names.pkl", "wb") as f:
    pickle.dump(ohe_df.columns, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save `rfc_model` -- the Random Forest chosen as the best model and refitted
# on the full dataset -- rather than any of the other candidate models.
with open('model.pkl', 'wb') as f:
    pickle.dump(rfc_model, f)