# Import Python packages

In [4]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat


# Import PyOD Packages and Methods

In [6]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging


# Import Metrics Packages

In [8]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define data file and read X and y

In [9]:
# Benchmark datasets, listed by base name; each is stored as a MATLAB
# .mat file and is read later from the 'Anamoly_detec_data' directory.
mat_file_list = [
    stem + '.mat'
    for stem in (
        'arrhythmia',
        'cardio',
        'glass',
        'ionosphere',
        'letter',
        'lympho',
        'mnist',
        'musk',
        'optdigits',
        'pendigits',
        'pima',
        'satellite',
        'satimage-2',
        'shuttle',
        'vertebral',
        'vowels',
        'wbc',
    )
]

# Define ten outlier detection tools to be compared

In [12]:
# Column layout shared by the three result tables: four dataset descriptors
# followed by one column per outlier detector.
df_columns = (
    ['Data', '#Samples', '# Dimensions', 'Outlier Perc']
    + ['ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA']
)

# Three independent, initially empty tables: ROC AUC, precision @ rank n,
# and execution time.
roc_df, prn_df, time_df = (pd.DataFrame(columns=df_columns) for _ in range(3))

# Input (Independent) Feature Shape in Mat File Format

In [15]:
# Load a single dataset up front to inspect what is stored under the 'X' key
data = loadmat('Anamoly_detec_data/arrhythmia.mat')
(type(data['X']), data['X'].shape)

(numpy.ndarray, (452, 274))

# Dependent/ Target/ Output Feature shape

In [17]:
# Container type and shape of the label array stored under the 'y' key
(type(data['y']), data['y'].shape)

(numpy.ndarray, (452, 1))

# ROC (Receiver Operating Characteristic) Performance Evaluation Table

In [19]:
# Show the ROC AUC table; it only has column headers until the benchmark loop fills it
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# precision_n_scores - Performance Evaluation Table

In [21]:
# Show the precision @ rank n table; it only has column headers until the benchmark loop fills it
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Time dataframe

In [23]:
# Show the execution-time table; it only has column headers until the benchmark loop fills it
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Exploring all Mat files

In [47]:
from time import time

# A single seeded generator is shared by the train/test split and by every
# detector that accepts `random_state`. Its state advances with each use, so
# the reported numbers depend on the order of datasets and detectors.
random_state = np.random.RandomState(42)

# Detector labels (as printed in the log) whose result column in df_columns
# has a different name. Any other label is used as the column name directly.
column_for_label = {'FeatureBagging': 'FB', 'Isolation Forest': 'IForest'}

# One record per dataset. The result tables are built once after the loop
# instead of being grown with pd.concat on every iteration, which was
# quadratic, gave every row index 0, and duplicated rows when the cell was
# re-run.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_file_list:
    print("\n...Processing", mat_file, '....')
    mat = loadmat(os.path.join('Anamoly_detec_data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    # Share of non-zero labels; passed to every detector as `contamination`
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Descriptor columns common to all three result tables
    dataset_info = {
        'Data': os.path.splitext(mat_file)[0],
        '#Samples': X.shape[0],
        '# Dimensions': X.shape[1],
        'Outlier Perc': outliers_percentage,
    }
    roc_row = dict(dataset_info)
    prn_row = dict(dataset_info)
    time_row = dict(dataset_info)

    # Hold out 40% of the samples for scoring
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)

    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Fresh detector instances for every dataset, keyed by the label used in
    # the progress log
    classifiers = {
        'ABOD': ABOD(contamination=outliers_fraction),
        'CBLOF': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state),
        'FeatureBagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
        'HBOS': HBOS(contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
        'KNN': KNN(contamination=outliers_fraction),
        'LOF': LOF(contamination=outliers_fraction),
        'MCD': MCD(contamination=outliers_fraction, random_state=random_state),
        'OCSVM': OCSVM(contamination=outliers_fraction),
        'PCA': PCA(contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        # Resolve the result column by name rather than by position, so that
        # reordering the detectors cannot silently mislabel the scores.
        column = column_for_label.get(clf_name, clf_name)
        if column not in df_columns:
            raise KeyError('No result column for detector {!r}'.format(clf_name))

        # The timing covers fitting on the training split plus scoring the
        # test split
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        time_row[column] = duration
        roc_row[column] = roc
        prn_row[column] = prn

    time_rows.append(time_row)
    roc_rows.append(roc_row)
    prn_rows.append(prn_row)

# Build each table in one step. Rows get a unique 0..n-1 index, and score
# columns get numeric dtypes rather than object.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)




...Processing arrhythmia.mat ....
ABOD ROC:0.7687, precision @ rank n:0.3571, execution time: 0.5766s
CBLOF ROC:0.7789, precision @ rank n:0.4643, execution time: 0.2618s
FeatureBagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.6266s
HBOS ROC:0.8511, precision @ rank n:0.5714, execution time: 0.0969s




Isolation Forest ROC:0.8637, precision @ rank n:0.6071, execution time: 0.4794s
KNN ROC:0.782, precision @ rank n:0.5, execution time: 0.0919s
LOF ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0674s




MCD ROC:0.8228, precision @ rank n:0.4286, execution time: 1.0993s
OCSVM ROC:0.7986, precision @ rank n:0.5, execution time: 0.044s
PCA ROC:0.8, precision @ rank n:0.5, execution time: 0.068s

...Processing cardio.mat ....
ABOD ROC:0.5892, precision @ rank n:0.1918, execution time: 0.6846s
CBLOF ROC:0.8845, precision @ rank n:0.4932, execution time: 0.2149s
FeatureBagging ROC:0.6385, precision @ rank n:0.1781, execution time: 1.1763s
HBOS ROC:0.8373, precision @ rank n:0.4521, execution time: 0.027s




Isolation Forest ROC:0.951, precision @ rank n:0.6027, execution time: 0.9384s
KNN ROC:0.734, precision @ rank n:0.3562, execution time: 0.3308s
LOF ROC:0.588, precision @ rank n:0.1507, execution time: 0.1739s




MCD ROC:0.8534, precision @ rank n:0.411, execution time: 1.0883s
OCSVM ROC:0.9478, precision @ rank n:0.5342, execution time: 0.1299s
PCA ROC:0.9616, precision @ rank n:0.6849, execution time: 0.028s

...Processing glass.mat ....
ABOD ROC:0.6951, precision @ rank n:0.25, execution time: 0.0979s
CBLOF ROC:0.811, precision @ rank n:0.25, execution time: 0.068s
FeatureBagging ROC:0.7073, precision @ rank n:0.25, execution time: 0.06s
HBOS ROC:0.7073, precision @ rank n:0.0, execution time: 0.003s




Isolation Forest ROC:0.7134, precision @ rank n:0.25, execution time: 0.2918s
KNN ROC:0.8384, precision @ rank n:0.25, execution time: 0.012s
LOF ROC:0.7043, precision @ rank n:0.25, execution time: 0.005s
MCD ROC:0.8293, precision @ rank n:0.0, execution time: 0.0755s
OCSVM ROC:0.6585, precision @ rank n:0.25, execution time: 0.001s
PCA ROC:0.686, precision @ rank n:0.25, execution time: 0.0078s

...Processing ionosphere.mat ....




ABOD ROC:0.9181, precision @ rank n:0.8431, execution time: 0.1499s
CBLOF ROC:0.9176, precision @ rank n:0.8039, execution time: 0.068s
FeatureBagging ROC:0.9303, precision @ rank n:0.8039, execution time: 0.076s
HBOS ROC:0.6052, precision @ rank n:0.3922, execution time: 0.013s




Isolation Forest ROC:0.8516, precision @ rank n:0.6078, execution time: 0.3518s
KNN ROC:0.932, precision @ rank n:0.8824, execution time: 0.031s
LOF ROC:0.9227, precision @ rank n:0.7843, execution time: 0.008s
MCD ROC:0.9669, precision @ rank n:0.8627, execution time: 0.1279s
OCSVM ROC:0.8257, precision @ rank n:0.6863, execution time: 0.01s
PCA ROC:0.7941, precision @ rank n:0.5686, execution time: 0.004s

...Processing letter.mat ....
ABOD ROC:0.8783, precision @ rank n:0.4375, execution time: 0.7056s
CBLOF ROC:0.7783, precision @ rank n:0.1875, execution time: 0.2109s
FeatureBagging ROC:0.8947, precision @ rank n:0.4062, execution time: 1.2352s
HBOS ROC:0.6063, precision @ rank n:0.0938, execution time: 0.014s




Isolation Forest ROC:0.6279, precision @ rank n:0.0625, execution time: 0.4447s
KNN ROC:0.8573, precision @ rank n:0.3125, execution time: 0.1869s
LOF ROC:0.8765, precision @ rank n:0.3438, execution time: 0.1049s
MCD ROC:0.8061, precision @ rank n:0.1875, execution time: 2.4145s
OCSVM ROC:0.5927, precision @ rank n:0.125, execution time: 0.1399s
PCA ROC:0.5216, precision @ rank n:0.125, execution time: 0.013s

...Processing lympho.mat ....
ABOD ROC:0.9831, precision @ rank n:0.0, execution time: 0.079s
CBLOF ROC:1.0, precision @ rank n:1.0, execution time: 0.0969s
FeatureBagging ROC:1.0, precision @ rank n:1.0, execution time: 0.059s
HBOS ROC:1.0, precision @ rank n:1.0, execution time: 0.009s




Isolation Forest ROC:1.0, precision @ rank n:1.0, execution time: 0.4437s
KNN ROC:1.0, precision @ rank n:1.0, execution time: 0.015s
LOF ROC:1.0, precision @ rank n:1.0, execution time: 0.004s
MCD ROC:1.0, precision @ rank n:1.0, execution time: 0.1249s
OCSVM ROC:1.0, precision @ rank n:1.0, execution time: 0.003s
PCA ROC:1.0, precision @ rank n:1.0, execution time: 0.004s

...Processing 



mnist.mat ....
ABOD ROC:0.7628, precision @ rank n:0.3367, execution time: 11.4239s
CBLOF ROC:0.8389, precision @ rank n:0.3912, execution time: 1.7279s
FeatureBagging ROC:0.7157, precision @ rank n:0.3741, execution time: 74.8768s
HBOS ROC:0.5766, precision @ rank n:0.1361, execution time: 0.1489s




Isolation Forest ROC:0.7915, precision @ rank n:0.2687, execution time: 4.8727s
KNN ROC:0.8498, precision @ rank n:0.432, execution time: 10.0817s
LOF ROC:0.7195, precision @ rank n:0.3673, execution time: 13.4412s




MCD ROC:0.8713, precision @ rank n:0.2653, execution time: 9.4811s
OCSVM ROC:0.854, precision @ rank n:0.3946, execution time: 6.9063s
PCA ROC:0.8534, precision @ rank n:0.3878, execution time: 0.2181s

...Processing musk.mat ....
ABOD ROC:0.2161, precision @ rank n:0.1, execution time: 3.5624s
CBLOF ROC:1.0, precision @ rank n:1.0, execution time: 0.5377s
FeatureBagging ROC:0.473, precision @ rank n:0.125, execution time: 26.2046s
HBOS ROC:0.9999, precision @ rank n:0.975, execution time: 0.1039s




Isolation Forest ROC:1.0, precision @ rank n:1.0, execution time: 2.037s
KNN ROC:0.8009, precision @ rank n:0.175, execution time: 2.38s
LOF ROC:0.4629, precision @ rank n:0.125, execution time: 2.1205s
MCD ROC:1.0, precision @ rank n:1.0, execution time: 45.3393s
OCSVM ROC:1.0, precision @ rank n:1.0, execution time: 1.3562s
PCA ROC:1.0, precision @ rank n:1.0, execution time: 0.1829s

...Processing optdigits.mat ....
ABOD ROC:0.4894, precision @ rank n:0.0152, execution time: 5.3887s
CBLOF ROC:0.7901, precision @ rank n:0.0, execution time: 1.2652s
FeatureBagging ROC:0.5062, precision @ rank n:0.0303, execution time: 20.8161s
HBOS ROC:0.8774, precision @ rank n:0.2121, execution time: 0.047s




Isolation Forest ROC:0.686, precision @ rank n:0.0303, execution time: 1.584s
KNN ROC:0.406, precision @ rank n:0.0, execution time: 2.5794s
LOF ROC:0.5277, precision @ rank n:0.0303, execution time: 2.0487s




MCD ROC:0.3822, precision @ rank n:0.0, execution time: 1.7558s
OCSVM ROC:0.5171, precision @ rank n:0.0, execution time: 1.7659s
PCA ROC:0.526, precision @ rank n:0.0, execution time: 0.058s

...Processing pendigits.mat ....
ABOD ROC:0.667, precision @ rank n:0.0526, execution time: 3.266s
CBLOF ROC:0.8082, precision @ rank n:0.1579, execution time: 0.3598s
FeatureBagging ROC:0.4889, precision @ rank n:0.0526, execution time: 4.9899s
HBOS ROC:0.9348, precision @ rank n:0.2632, execution time: 0.012s




Isolation Forest ROC:0.939, precision @ rank n:0.3333, execution time: 0.8954s
KNN ROC:0.7371, precision @ rank n:0.0702, execution time: 1.1716s
LOF ROC:0.4965, precision @ rank n:0.0702, execution time: 1.0284s
MCD ROC:0.8204, precision @ rank n:0.0877, execution time: 2.8133s
OCSVM ROC:0.9235, precision @ rank n:0.3158, execution time: 1.5331s
PCA ROC:0.9309, precision @ rank n:0.3158, execution time: 0.0104s

...Processing pima.mat ....
ABOD ROC:0.7163, precision @ rank n:0.5253, execution time: 0.2958s
CBLOF ROC:0.67, precision @ rank n:0.4949, execution time: 0.2129s
FeatureBagging ROC:0.6448, precision @ rank n:0.4444, execution time: 0.1309s
HBOS ROC:0.711, precision @ rank n:0.5354, execution time: 0.004s




Isolation Forest ROC:0.6829, precision @ rank n:0.5253, execution time: 0.3118s
KNN ROC:0.7395, precision @ rank n:0.5859, execution time: 0.075s
LOF ROC:0.6574, precision @ rank n:0.4646, execution time: 0.027s
MCD ROC:0.7175, precision @ rank n:0.5152, execution time: 0.0979s
OCSVM ROC:0.6561, precision @ rank n:0.5051, execution time: 0.021s
PCA ROC:0.6762, precision @ rank n:0.5354, execution time: 0.003s

...Processing satellite.mat ....
ABOD ROC:0.5653, precision @ rank n:0.3962, execution time: 4.0945s
CBLOF ROC:0.7241, precision @ rank n:0.5412, execution time: 0.5706s
FeatureBagging ROC:0.572, precision @ rank n:0.4, execution time: 11.4229s
HBOS ROC:0.7486, precision @ rank n:0.57, execution time: 0.041s




Isolation Forest ROC:0.6838, precision @ rank n:0.5812, execution time: 1.2932s
KNN ROC:0.6853, precision @ rank n:0.4988, execution time: 1.9728s
LOF ROC:0.572, precision @ rank n:0.395, execution time: 1.571s
MCD ROC:0.8055, precision @ rank n:0.6762, execution time: 3.9965s
OCSVM ROC:0.6478, precision @ rank n:0.5225, execution time: 2.8782s
PCA ROC:0.5923, precision @ rank n:0.465, execution time: 0.1419s

...Processing satimage-2.mat ....
ABOD ROC:0.8432, precision @ rank n:0.2333, execution time: 21.1379s
CBLOF ROC:0.9998, precision @ rank n:0.9333, execution time: 0.9574s
FeatureBagging ROC:0.5235, precision @ rank n:0.1667, execution time: 12826.9263s
HBOS ROC:0.9784, precision @ rank n:0.6, execution time: 0.039s




Isolation Forest ROC:0.9955, precision @ rank n:0.8667, execution time: 1.7279s
KNN ROC:0.9515, precision @ rank n:0.4333, execution time: 1.4141s
LOF ROC:0.5257, precision @ rank n:0.1667, execution time: 1.7519s
MCD ROC:0.9963, precision @ rank n:0.6667, execution time: 5.7145s
OCSVM ROC:0.9997, precision @ rank n:0.9, execution time: 2.0667s
PCA ROC:0.9816, precision @ rank n:0.7333, execution time: 0.031s

...Processing shuttle.mat ....
ABOD ROC:0.6171, precision @ rank n:0.2003, execution time: 32.7257s
CBLOF ROC:0.6273, precision @ rank n:0.2025, execution time: 1.555s
FeatureBagging ROC:0.4725, precision @ rank n:0.0257, execution time: 240.265s
HBOS ROC:0.9871, precision @ rank n:0.9985, execution time: 0.071s




Isolation Forest ROC:0.9976, precision @ rank n:0.9501, execution time: 12.5653s
KNN ROC:0.6507, precision @ rank n:0.212, execution time: 46.2263s
LOF ROC:0.5556, precision @ rank n:0.1548, execution time: 57.2963s






MCD ROC:0.9899, precision @ rank n:0.7395, execution time: 60.2875s
OCSVM ROC:0.9934, precision @ rank n:0.956, execution time: 169.2807s
PCA ROC:0.9915, precision @ rank n:0.9516, execution time: 0.0515s

...Processing vertebral.mat ....
ABOD ROC:0.5366, precision @ rank n:0.2143, execution time: 0.0989s
CBLOF ROC:0.439, precision @ rank n:0.0714, execution time: 0.07s
FeatureBagging ROC:0.5279, precision @ rank n:0.1429, execution time: 0.056s
HBOS ROC:0.3506, precision @ rank n:0.0, execution time: 0.002s




Isolation Forest ROC:0.3789, precision @ rank n:0.0, execution time: 0.3028s
KNN ROC:0.4573, precision @ rank n:0.0714, execution time: 0.016s
LOF ROC:0.4983, precision @ rank n:0.1429, execution time: 0.006s
MCD ROC:0.4103, precision @ rank n:0.0714, execution time: 0.066s
OCSVM ROC:0.4686, precision @ rank n:0.0714, execution time: 0.002s
PCA ROC:0.4085, precision @ rank n:0.0, execution time: 0.002s

...Processing vowels.mat ....
ABOD ROC:0.9616, precision @ rank n:0.6316, execution time: 0.5244s
CBLOF ROC:0.8963, precision @ rank n:0.3158, execution time: 0.1335s
FeatureBagging ROC:0.9365, precision @ rank n:0.3684, execution time: 0.4187s
HBOS ROC:0.6876, precision @ rank n:0.1579, execution time: 0.009s




Isolation Forest ROC:0.8214, precision @ rank n:0.1579, execution time: 0.7721s
KNN ROC:0.9734, precision @ rank n:0.4737, execution time: 0.2322s
LOF ROC:0.9398, precision @ rank n:0.3684, execution time: 0.054s
MCD ROC:0.7243, precision @ rank n:0.1053, execution time: 1.4263s
OCSVM ROC:0.8163, precision @ rank n:0.2632, execution time: 0.057s
PCA ROC:0.6297, precision @ rank n:0.1579, execution time: 0.004s

...Processing wbc.mat ....
ABOD ROC:0.921, precision @ rank n:0.375, execution time: 0.1639s
CBLOF ROC:0.9149, precision @ rank n:0.375, execution time: 0.072s
FeatureBagging ROC:0.9271, precision @ rank n:0.375, execution time: 0.0929s
HBOS ROC:0.9479, precision @ rank n:0.5, execution time: 0.013s




Isolation Forest ROC:0.9418, precision @ rank n:0.625, execution time: 0.2901s
KNN ROC:0.9444, precision @ rank n:0.5, execution time: 0.027s
LOF ROC:0.9227, precision @ rank n:0.375, execution time: 0.009s
MCD ROC:0.9288, precision @ rank n:0.5, execution time: 0.1119s
OCSVM ROC:0.9358, precision @ rank n:0.375, execution time: 0.007s
PCA ROC:0.9262, precision @ rank n:0.375, execution time: 0.004s
