In [4]:
# Imports 
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, accuracy_score, classification_report
from sklearn.preprocessing import label_binarize

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import os

Loading Preprocessed Data

In [3]:
DATA_DIR = "../Data/train_test_data/"

# Load datasets from Parquet files
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
# Replace invalid characters in column names
train_df.columns = [col.replace('<', '').replace('[', '').replace(']', '').replace(',', '') for col in train_df.columns]
val_df.columns = [col.replace('<', '').replace('[', '').replace(']', '').replace(',', '') for col in val_df.columns]
test_df.columns = [col.replace('<', '').replace('[', '').replace(']', '').replace(',', '') for col in test_df.columns]

# Separate features and targets
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_val = val_df.drop('target', axis=1)
y_val = val_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

# Convetntional Benchmark Using GridSearch for hyper param tuning
# Separate features and targets
X_train = pd.concat([X_train, X_val], axis=0)
y_train = pd.concat([y_train, y_val], axis=0)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (3526, 130)
X_test shape: (882, 130)


In [5]:
def calculate_auc(y_true, y_prob):
  n_classes = len(np.unique(y_true))
  y_true_binarized = label_binarize(y_true.values, classes=np.arange(n_classes))

  auc_scores = roc_auc_score(y_true_binarized, y_prob, multi_class='ovr', average=None)
  for i, auc_score in enumerate(auc_scores):
    print(f"Class {i} AUC: {auc_score:.4f}")
  overall_auc_macro = roc_auc_score(y_true_binarized, y_prob, multi_class='ovr', average='macro')
  print(f"Overall AUC (Macro-average): {overall_auc_macro:.4f}")
  return

In [6]:
def perform_grid_search(model, param_grid, X_train, y_train, X_test, y_test):
    """
    Performs grid search to find the best model parameters and evaluates the model.

    Args:
    model (estimator): The machine learning model to tune.
    param_grid (dict): Dictionary with parameters names (str) as keys and lists of parameter settings to try as values.
    X_train (array-like): Training data features.
    y_train (array-like): Training data labels.
    X_test (array-like): Test data features.
    y_test (array-like): Test data labels.

    Returns:
    dict: A dictionary containing the best parameters, best cross-validation score, test accuracy, and classification report.
    """
    # Initialize the GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Output the best parameters and the best cross-validation score
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}%".format(grid_search.best_score_ * 100))

    # Evaluate on the test set with the best found parameters
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(X_test)
    y_test_proba = best_model.predict_proba(X_test)
    calculate_auc(y_test, y_test_proba)
    class_report = classification_report(y_test, y_test_pred, zero_division=0)
    print(class_report)

    return

In [7]:
# SVM
svm = SVC(probability=True, random_state=42)
param_grid = {
    'C': [1, 10, 100]
}
perform_grid_search(svm, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'C': 1}
Best cross-validation score: 83.58%
Class 0 AUC: 0.7687
Class 1 AUC: 0.6762
Class 2 AUC: 0.8082
Class 3 AUC: 0.9470
Class 4 AUC: 0.8529
Overall AUC (Macro-average): 0.8106
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       734
           1       0.00      0.00      0.00         7
           2       0.31      0.04      0.07       107
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00         3

    accuracy                           0.83       882
   macro avg       0.23      0.21      0.19       882
weighted avg       0.73      0.83      0.76       882



In [8]:
# Decision Tree
dt_clf = DecisionTreeClassifier(random_state=2023)
param_grid = {'max_depth': [None, 10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
perform_grid_search(dt_clf, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best cross-validation score: 80.54%
Class 0 AUC: 0.5788
Class 1 AUC: 0.5737
Class 2 AUC: 0.6637
Class 3 AUC: 0.4650
Class 4 AUC: 0.4613
Overall AUC (Macro-average): 0.5485
              precision    recall  f1-score   support

           0       0.86      0.91      0.88       734
           1       0.00      0.00      0.00         7
           2       0.36      0.29      0.32       107
           3       0.29      0.16      0.21        31
           4       0.00      0.00      0.00         3

    accuracy                           0.80       882
   macro avg       0.30      0.27      0.28       882
weighted avg       0.77      0.80      0.78       882



In [9]:
# Random Forest
param_grid = {
    'n_estimators': [10, 30],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],    # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]       # Minimum number of samples required at each leaf node
}
rf = RandomForestClassifier(random_state=42)
perform_grid_search(rf, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 30}
Best cross-validation score: 84.80%
Class 0 AUC: 0.7970
Class 1 AUC: 0.8476
Class 2 AUC: 0.8273
Class 3 AUC: 0.9546
Class 4 AUC: 0.6968
Overall AUC (Macro-average): 0.8247
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       734
           1       0.00      0.00      0.00         7
           2       0.57      0.07      0.13       107
           3       1.00      0.03      0.06        31
           4       0.00      0.00      0.00         3

    accuracy                           0.84       882
   macro avg       0.48      0.22      0.22       882
weighted avg       0.80      0.84      0.78       882



In [10]:
# Gaussian Naive Bayes
gnb = GaussianNB()
param_grid = {'var_smoothing': [1e-10, 1e-9, 1e-8]}
perform_grid_search(gnb, param_grid, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters: {'var_smoothing': 1e-08}
Best cross-validation score: 10.21%
Class 0 AUC: 0.5142
Class 1 AUC: 0.5985
Class 2 AUC: 0.5949
Class 3 AUC: 0.5389
Class 4 AUC: 0.5246
Overall AUC (Macro-average): 0.5542
              precision    recall  f1-score   support

           0       0.88      0.05      0.10       734
           1       0.01      0.57      0.02         7
           2       0.17      0.22      0.19       107
           3       0.07      0.29      0.11        31
           4       0.00      0.33      0.01         3

    accuracy                           0.09       882
   macro avg       0.23      0.29      0.09       882
weighted avg       0.76      0.09      0.11       882

