In [10]:
import tqdm
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from pyod.utils.utility import standardizer
from scipy.io import loadmat

from pyod.models.iforest import IForest
from pyod.models.dif import DIF
from pyod.models.loda import LODA

In [None]:
### Balanced Accuracy and ROC AUC ###

def _confusion_stats(y_true, y_pred):
    """Return ``(recall, specificity)`` for a pair of binary label arrays.

    Both inputs are converted to NumPy arrays and flattened. Label ``1`` is
    counted as the positive class and label ``0`` as the negative class; any
    other value contributes to none of the four confusion-matrix cells.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, with the same number of elements as ``y_true``.

    Returns
    -------
    tuple
        ``(recall, specificity)``. A rate whose denominator is zero is
        reported as ``0.0`` instead of raising or producing NaN.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    # Boolean masks, built once and combined below.
    actual_pos, actual_neg = truth == 1, truth == 0
    pred_pos, pred_neg = guess == 1, guess == 0

    true_pos = np.sum(actual_pos & pred_pos)
    true_neg = np.sum(actual_neg & pred_neg)
    false_pos = np.sum(actual_neg & pred_pos)
    false_neg = np.sum(actual_pos & pred_neg)

    pos_total = true_pos + false_neg
    neg_total = true_neg + false_pos

    if pos_total > 0:
        recall = true_pos / pos_total
    else:
        recall = 0.0

    if neg_total > 0:
        specificity = true_neg / neg_total
    else:
        specificity = 0.0

    return recall, specificity


def bal_acc(y_true, y_pred):
    """Balanced accuracy: the mean of recall and specificity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (0 / 1).
    y_pred : array-like
        Predicted binary labels (0 / 1).

    Returns
    -------
    float
        ``0.5 * (recall + specificity)`` as computed by ``_confusion_stats``.
    """
    sensitivity, true_negative_rate = _confusion_stats(y_true, y_pred)
    return 0.5 * (sensitivity + true_negative_rate)

def roc_auc(y_true, y_scores):
    """Compute the ROC curve and the area under it.

    Both inputs are converted to NumPy arrays and flattened before being
    handed to ``roc_curve``; the area is then obtained with ``auc``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_scores : array-like
        Scores for each sample, as accepted by ``roc_curve``.

    Returns
    -------
    tuple
        ``(auc_score, fpr, tpr)`` -- the area under the curve followed by the
        false-positive and true-positive rates returned by ``roc_curve``.
    """
    labels = np.array(y_true).ravel()
    scores = np.array(y_scores).ravel()

    # The thresholds returned by roc_curve are not needed by callers.
    fpr, tpr, _ = roc_curve(labels, scores)

    return auc(fpr, tpr), fpr, tpr

In [14]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# --- Experiment configuration ------------------------------------------------
DATA_PATH = 'shuttle.mat'
TEST_SIZE = 0.3
# NOTE(review): in the recorded run the mean balanced accuracy is far below the
# mean ROC AUC for IForest and DIF, which may mean this fixed threshold rate
# does not match the data's real outlier rate -- confirm before relying on BA.
CONTAMINATION = 0.02
MODEL_NAMES = ['IForest', 'LODA', 'DIF']


def _build_detector(name, seed):
    """Return a fresh, unfitted detector for ``name``.

    ``seed`` is forwarded as ``random_state`` to the detectors that are
    constructed with one here (IForest and DIF). LODA is constructed without
    a ``random_state`` argument, so its reproducibility relies on the caller
    seeding NumPy's global RNG.

    Raises
    ------
    ValueError
        If ``name`` is not one of the entries in ``MODEL_NAMES``.
    """
    if name == 'IForest':
        return IForest(contamination=CONTAMINATION, random_state=seed)
    if name == 'LODA':
        return LODA(contamination=CONTAMINATION)
    if name == 'DIF':
        return DIF(contamination=CONTAMINATION, random_state=seed)
    raise ValueError(f"Unknown detector: {name}")


# --- Load data ----------------------------------------------------------------
shuttle_data = loadmat(DATA_PATH)

X = shuttle_data['X']
y = shuttle_data['y'].ravel()

# --- Repeated train/test evaluation -------------------------------------------
splits = 10
ba_scores = {name: [] for name in MODEL_NAMES}
roc_auc_scores = {name: [] for name in MODEL_NAMES}
fitted_models = {}  # last fitted detector per model name

for split in tqdm.tqdm(range(splits), desc="Processing splits"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=split
    )

    # Fit the scaler on the training part only so no test statistics leak in.
    # RobustScaler (imported above) can be swapped in here as an alternative.
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)

    for name in MODEL_NAMES:
        # LODA is built without a random_state, so seed NumPy's global RNG
        # before every detector to make each split repeatable on re-run.
        np.random.seed(split)

        detector = _build_detector(name, split)
        detector.fit(X_train_norm)

        scores = detector.decision_function(X_test_norm)
        y_pred = detector.predict(X_test_norm)

        ba_scores[name].append(bal_acc(y_test, y_pred))
        auc_score, _, _ = roc_auc(y_test, scores)
        roc_auc_scores[name].append(auc_score)

        fitted_models[name] = detector

# Keep the historical variable names bound to the last split's fitted models.
clf, loda, dif = (fitted_models[name] for name in MODEL_NAMES)

# --- Summary ------------------------------------------------------------------
for model in MODEL_NAMES:
    print(f"{model}: Mean BA = {np.mean(ba_scores[model]):.4f}, Mean ROC AUC = {np.mean(roc_auc_scores[model]):.4f}")

Processing splits: 100%|██████████| 10/10 [09:26<00:00, 56.68s/it]

IForest: Mean BA = 0.6370, Mean ROC AUC = 0.9968
LODA: Mean BA = 0.5597, Mean ROC AUC = 0.6870
DIF: Mean BA = 0.5017, Mean ROC AUC = 0.9691



