In [16]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="subprocess.run")

%run utility_functions.ipynb

In [17]:
df_original = pd.read_csv("data/Original_Preprocessed_data.csv")
df_original.shape

(1547, 25)

# Train test split

In [24]:
from sklearn.model_selection import train_test_split

X = df_original.drop('Healthy', axis=1)
y = df_original['Healthy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, stratify=y, random_state=42)

In [25]:
X_train.shape

(1005, 24)

In [26]:
y_train.value_counts()

Healthy
1    961
0     44
Name: count, dtype: int64

# SMOTE

In [27]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)

# # Upsample the minority class
# X_train, y_train = smote.fit_resample(X_train, y_train)

In [28]:
# X_test.to_csv("X_test.csv", index=None)
# y_test.to_csv("y_test.csv", index=None)

# Scaling

In [29]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import f1_score, classification_report

rf_classifier = LogisticRegression(random_state=42)
rf_classifier.fit(X_train, y_train)
print("============================= Test result ===========================")
y_pred = rf_classifier.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.985239852398524
AUC Score: 0.8468626958197202
F1 Score: 0.9923371647509579
Specificity: 0.6956521739130435
Sensitivity: 0.9980732177263969
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.70      0.80        23
           1       0.99      1.00      0.99       519

    accuracy                           0.99       542
   macro avg       0.96      0.85      0.90       542
weighted avg       0.98      0.99      0.98       542



# RandomForest

In [31]:
from sklearn.ensemble import RandomForestClassifier

clf_random = RandomForestClassifier(n_estimators=300, random_state=42)
clf_random.fit(X_train, y_train)

print("============================= Test result ===========================")
y_pred = clf_random.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 0.9981549815498155
AUC Score: 0.9782608695652174
F1 Score: 0.9990375360923965
Specificity: 0.9565217391304348
Sensitivity: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        23
           1       1.00      1.00      1.00       519

    accuracy                           1.00       542
   macro avg       1.00      0.98      0.99       542
weighted avg       1.00      1.00      1.00       542



In [38]:
from pytorch_tabnet.tab_model import TabNetClassifier

clf_tabnet = TabNetClassifier()
clf_tabnet.fit(
  X_train, y_train,
  eval_set=[(X_test, y_test)],
  eval_metric=['auc']
)

y_pred = clf_tabnet.predict(X_test)
evaluate_classifier(y_test, y_pred) 



epoch 0  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 1  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 2  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 3  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 4  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 5  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 6  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 7  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 8  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 9  | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s
epoch 10 | loss: 0.0     | val_0_auc: 0.42971 |  0:00:00s

Early stopping occurred at epoch 10 with best_epoch = 0 and best_val_0_auc = 0.42971
Accuracy: 0.6512915129151291
AUC Score: 0.4231800284828684
F1 Score: 0.7869222096956031
Specificity: 0.17391304347826086
Sensitivity: 0.6724470134874759
Classification Report:
              precision    recall  f1-score   support

           0       0.02      0.17      0.04  



In [34]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
evaluate_classifier(y_test, y_pred) 

Accuracy: 1.0
AUC Score: 1.0
F1 Score: 1.0
Specificity: 1.0
Sensitivity: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00       519

    accuracy                           1.00       542
   macro avg       1.00      1.00      1.00       542
weighted avg       1.00      1.00      1.00       542



## TabFPN

In [35]:
from tabpfn import TabPFNClassifier

classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=32)

classifier.fit(X_train, y_train)
y_pred, p_eval = classifier.predict(X_test, return_winning_probability=True)



In [37]:
y_pred = classifier.predict(X_test)
evaluate_classifier(y_test, y_pred) 



Accuracy: 0.9870848708487084
AUC Score: 0.8686018262545028
F1 Score: 0.9932885906040269
Specificity: 0.7391304347826086
Sensitivity: 0.9980732177263969
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.74      0.83        23
           1       0.99      1.00      0.99       519

    accuracy                           0.99       542
   macro avg       0.97      0.87      0.91       542
weighted avg       0.99      0.99      0.99       542

