In [7]:
from sklearn.model_selection import train_test_split
from pyod.models.lof import LOF
from pyod.models.combination import average, maximization
from pyod.utils.utility import standardizer
from scipy.io import loadmat
import numpy as np

In [9]:
### Balanced Accuracy ###

def confusion_stats(y_true, y_pred):
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    return TP, TN, FP, FN

def metrics_from_confusion(TP, TN, FP, FN):
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    balanced_accuracy = 0.5 * (recall + specificity)
    return balanced_accuracy

In [83]:
from sklearn.preprocessing import StandardScaler, RobustScaler

cardio_data = loadmat('cardio.mat')

X = cardio_data['X']
y = cardio_data['y'].ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42, stratify = y
)

scaler = StandardScaler()
# scaler = RobustScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [84]:
### now onto the ensemble itself ###
contamination = np.sum(y_train == 1) / np.size(y_train)
k_neighbors = range(30, 120, 3)

train_scores = []
test_scores = []

for i, neighbors in enumerate(k_neighbors):
    lof = LOF(n_neighbors=neighbors, contamination=contamination)
    lof.fit(X_train)

    y_train_pred = lof.predict(X_train)
    y_test_pred = lof.predict(X_test)

    # TP_train, TN_train, FP_train, FN_train = confusion_stats(y_train, y_train_pred)
    # TP_test, TN_test, FP_test, FN_test = confusion_stats(y_test, y_test_pred)
    # balanced_accuracy_train = metrics_from_confusion(TP_train, TN_train, FP_train, FN_train)
    # balanced_accuracy_test = metrics_from_confusion(TP_test, TN_test, FP_test, FN_test)

    decision_fun_train = lof.decision_function(X_train)
    decision_fun_test = lof.decision_function(X_test)

    train_scores.append(decision_fun_train)
    test_scores.append(decision_fun_test)

train_scores = np.vstack(train_scores)
test_scores  = np.vstack(test_scores)
train_scores = train_scores.T
test_scores  = test_scores.T


print("train_scores.shape:", train_scores.shape)
print("test_scores.shape:", test_scores.shape)

train_scores.shape: (1373, 30)
test_scores.shape: (458, 30)


In [85]:
### AVERAGE METHOD ###

train_scores_s = standardizer(train_scores)
test_scores_s  = standardizer(test_scores)
train_avg = average(train_scores_s)
test_avg  = average(test_scores_s)

# print(train_avg)
# print(test_avg)

thr = np.quantile(train_avg, 1-contamination)
y_train_ens = (train_avg >= thr).astype(int)
y_test_ens  = (test_avg  >= thr).astype(int)

print("true outliers (test):", int(y_test.sum()), "pred outliers (test):", int(y_test_ens.sum()))
TP, TN, FP, FN = confusion_stats(y_test, y_test_ens)
print("TP,TN,FP,FN:", TP, TN, FP, FN)
print("balanced accuracy (test):", metrics_from_confusion(TP, TN, FP, FN))

true outliers (test): 44 pred outliers (test): 31
TP,TN,FP,FN: 8 391 23 36
balanced accuracy (test): 0.5631313131313131


In [86]:
### MAXIMIZATION METHOD ###

train_max = maximization(train_scores_s)
test_max  = maximization(test_scores_s)

# print(train_avg)
# print(test_avg)

thr = np.quantile(train_max, 1-contamination)
y_train_ens = (train_max >= thr).astype(int)
y_test_ens  = (test_max  >= thr).astype(int)

print("true outliers (test):", int(y_test.sum()), "pred outliers (test):", int(y_test_ens.sum()))
TP, TN, FP, FN = confusion_stats(y_test, y_test_ens)
print("TP,TN,FP,FN:", TP, TN, FP, FN)
print("balanced accuracy (test):", metrics_from_confusion(TP, TN, FP, FN))

true outliers (test): 44 pred outliers (test): 31
TP,TN,FP,FN: 10 393 21 34
balanced accuracy (test): 0.5882740447957839


Final observations:
- Used LOF ensemble method
- Maximization is best when StandardScaler is used, but Average becomes best if RobustScaler is used