In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="subprocess.run")

%run utility_functions.ipynb

In [2]:
df_original = pd.read_csv("data/Original_Preprocessed_data.csv")
df_syn = pd.read_csv("data/Synthetic.csv")

# Train test split

In [3]:
from sklearn.model_selection import train_test_split

X = df_original.drop('Healthy', axis=1)
y = df_original['Healthy']

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:
# X_test.to_csv("X_test.csv", index=None)
# y_test.to_csv("y_test.csv", index=None)

# Concat X_temp and Synthetic

In [5]:
train_original = pd.concat([X_temp, y_temp], axis=1)

In [6]:
total_train = pd.concat([train_original, df_syn])

In [7]:
X = total_train.drop('Healthy', axis=1)
y = total_train['Healthy']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scaling

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import f1_score, classification_report

rf_classifier = LogisticRegression(random_state=42)
rf_classifier.fit(X_train, y_train)
print("============================= Test result ===========================")
y_pred = rf_classifier.predict(X_test)
evaluate_classifier(y_test, y_pred) 
print("============================= Validation result ===========================")
y_pred = rf_classifier.predict(X_val)
evaluate_classifier(y_val, y_pred) 

Accuracy: 0.7
AUC Score: 0.8066563066563067
F1 Score: 0.8151093439363817
Specificity: 0.9230769230769231
Sensitivity: 0.6902356902356902
Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.92      0.21        13
           1       1.00      0.69      0.82       297

    accuracy                           0.70       310
   macro avg       0.56      0.81      0.51       310
weighted avg       0.96      0.70      0.79       310

Accuracy: 0.5586592178770949
AUC Score: 0.5
F1 Score: 0.0
Specificity: 1.0
Sensitivity: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.56      1.00      0.72       300
           1       0.00      0.00      0.00       237

    accuracy                           0.56       537
   macro avg       0.28      0.50      0.36       537
weighted avg       0.31      0.56      0.40       537



# RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier

clf_random = RandomForestClassifier(n_estimators=300, random_state=42)
clf_random.fit(X_train, y_train)

print("============================= Test result ===========================")
y_pred = clf_random.predict(X_test)
evaluate_classifier(y_test, y_pred) 
print("============================= Validation result ===========================")
y_pred = clf_random.predict(X_val)
evaluate_classifier(y_val, y_pred) 

Accuracy: 0.9580645161290322
AUC Score: 0.9413364413364413
F1 Score: 0.9777015437392796
Specificity: 0.9230769230769231
Sensitivity: 0.9595959595959596
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.92      0.65        13
           1       1.00      0.96      0.98       297

    accuracy                           0.96       310
   macro avg       0.75      0.94      0.81       310
weighted avg       0.98      0.96      0.96       310

Accuracy: 0.5586592178770949
AUC Score: 0.5
F1 Score: 0.0
Specificity: 1.0
Sensitivity: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.56      1.00      0.72       300
           1       0.00      0.00      0.00       237

    accuracy                           0.56       537
   macro avg       0.28      0.50      0.36       537
weighted avg       0.31      0.56      0.40       537

