In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="subprocess.run")

%run utility_functions.ipynb

In [2]:
df_original = pd.read_csv("Original_Preprocessed_data.csv")
df_syn = pd.read_csv("Synthetic-TVAE.csv")

# Train test split

In [3]:
from sklearn.model_selection import train_test_split

X = df_original.drop('Healthy', axis=1)
y = df_original['Healthy']

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)

In [4]:
# X_test.to_csv("X_test.csv", index=None)
# y_test.to_csv("y_test.csv", index=None)

# Concat X_temp and Synthetic

In [5]:
train_original = pd.concat([X_temp, y_temp], axis=1)

In [6]:
total_train = pd.concat([train_original, df_syn])

In [7]:
X = total_train.drop('Healthy', axis=1)
y = total_train['Healthy']
y_train = y.copy()

# Scaling

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import f1_score, classification_report

rf_classifier = LogisticRegression(random_state=42)
rf_classifier.fit(X_train, y_train)
print("============================= Test result ===========================")
y_pred = rf_classifier.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.9709208400646203
AUC Score: 0.9317755255255254
F1 Score: 0.984641638225256
Specificity: 0.8888888888888888
Sensitivity: 0.9746621621621622
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.89      0.73        27
           1       0.99      0.97      0.98       592

    accuracy                           0.97       619
   macro avg       0.81      0.93      0.86       619
weighted avg       0.98      0.97      0.97       619



# RandomForest

In [10]:
from sklearn.ensemble import RandomForestClassifier

clf_random = RandomForestClassifier(n_estimators=300, random_state=42)
clf_random.fit(X_train, y_train)

print("============================= Test result ===========================")
y_pred = clf_random.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.9935379644588045
AUC Score: 0.9612737737737737
F1 Score: 0.9966216216216216
Specificity: 0.9259259259259259
Sensitivity: 0.9966216216216216
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        27
           1       1.00      1.00      1.00       592

    accuracy                           0.99       619
   macro avg       0.96      0.96      0.96       619
weighted avg       0.99      0.99      0.99       619



# TabNet

In [11]:
from pytorch_tabnet.tab_model import TabNetClassifier

clf_tabnet = TabNetClassifier()
clf_tabnet.fit(
  X_train, y_train,
  eval_set=[(X_test, y_test)],
  eval_metric=['auc']
)

epoch 0  | loss: 0.8674  | val_0_auc: 0.44904 |  0:00:00s
epoch 1  | loss: 0.62225 | val_0_auc: 0.48111 |  0:00:00s
epoch 2  | loss: 0.49351 | val_0_auc: 0.51846 |  0:00:00s
epoch 3  | loss: 0.42708 | val_0_auc: 0.57658 |  0:00:01s
epoch 4  | loss: 0.3566  | val_0_auc: 0.70126 |  0:00:01s
epoch 5  | loss: 0.33345 | val_0_auc: 0.78278 |  0:00:01s
epoch 6  | loss: 0.27899 | val_0_auc: 0.80868 |  0:00:02s
epoch 7  | loss: 0.24195 | val_0_auc: 0.81313 |  0:00:02s
epoch 8  | loss: 0.22743 | val_0_auc: 0.81694 |  0:00:02s
epoch 9  | loss: 0.19006 | val_0_auc: 0.79861 |  0:00:02s
epoch 10 | loss: 0.19053 | val_0_auc: 0.79292 |  0:00:03s
epoch 11 | loss: 0.19117 | val_0_auc: 0.75507 |  0:00:03s
epoch 12 | loss: 0.16116 | val_0_auc: 0.76079 |  0:00:03s
epoch 13 | loss: 0.12599 | val_0_auc: 0.76739 |  0:00:04s
epoch 14 | loss: 0.13928 | val_0_auc: 0.79104 |  0:00:04s
epoch 15 | loss: 0.1427  | val_0_auc: 0.8268  |  0:00:04s
epoch 16 | loss: 0.1127  | val_0_auc: 0.84162 |  0:00:04s
epoch 17 | los

In [12]:
y_pred = clf_tabnet.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.9483037156704361
AUC Score: 0.7785598098098097
F1 Score: 0.9727427597955707
Specificity: 0.5925925925925926
Sensitivity: 0.964527027027027
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.59      0.50        27
           1       0.98      0.96      0.97       592

    accuracy                           0.95       619
   macro avg       0.71      0.78      0.74       619
weighted avg       0.96      0.95      0.95       619



## Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.9886914378029079
AUC Score: 0.9940878378378378
F1 Score: 0.994052676295667
Specificity: 1.0
Sensitivity: 0.9881756756756757
Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.89        27
           1       1.00      0.99      0.99       592

    accuracy                           0.99       619
   macro avg       0.90      0.99      0.94       619
weighted avg       0.99      0.99      0.99       619

