# Import Python Package

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Install Pyod

In [None]:
!pip install pyod 

# Import pyod Package for Anomaly Detection

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



# Import Metrics Packages

In [4]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define Data File and Read X, Y

In [5]:
mat_file_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

In [6]:
# Visualise the List
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

# Import the Mat Files in

In [7]:
from scipy.io import loadmat

In [10]:
#Importing the Data
data=loadmat('C:/-----/data/cardio.mat') # Path Cleared Down Post Running

#Checking the data

data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [20]:
#Checking other features of the one loaded file
print("Length of the Data :",len(data))
print("\n")
print("Keys in the Data (Stored as Dictionary) :\n",data.keys())
print("\n")
print("Visualising the Values Stored: \n",data.values())

Length of the Data  5


Keys in the Data (Stored as Dictionary) :
 dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])


Visualising the Values Stored: 
 dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])


# Input(Independent Variable) Feature Shape in Mat file format

In [21]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# Output(Dependent Variable) Feature Shape in Mat file format

In [22]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [23]:
#Defining Column Headers in the Final File

df_columns = ['Data', '#Samples', '# Dimensions', 'Outlier Perc',
              'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA']

# ROC Performance Evaluation Table
[An ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds]

In [24]:
#Defining
roc_df = pd.DataFrame(columns=df_columns)

#Visualising
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Precision Scrore  Evaluation Table 

precision (also called positive predictive value) is the fraction of relevant instances among the retrieved instances, while recall (also known as sensitivity) is the fraction of the total amount of relevant instances that were actually retrieved. Both precision and recall are therefore based on an understanding and measure of relevance.

In [25]:
#Defining
prn_df = pd.DataFrame(columns=df_columns)

#Visualising
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Time Data Frame

    Storing for viewing the performance of both the scores

In [26]:
#Defining
time_df = pd.DataFrame(columns=df_columns)

#Visualising
time_df


Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Defining a Program and Reading all the Mat Files

In [27]:
from time import time
random_state = np.random.RandomState(42) # To have the same random state as Displayed in the Class

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...') # For Visual Display of the Code
    mat = loadmat(os.path.join('data', mat_file)) # Sequential Input of the file

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 1.3402s




Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 1.3516s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 0.5328s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 1.4572s
Isolation Forest ROC:0.8478, precision @ rank n:0.5357, execution time: 0.3828s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.0781s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.0625s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 0.5984s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0546s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.0429s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 0.3478s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.121s




Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 0.6889s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.0156s
Isolation Forest ROC:0.9316, precision @ rank n:0.4531, execution time: 0.3519s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.1301s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.0915s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 0.4555s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.0937s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.0156s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.0312s




Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.0642s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.0346s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.0s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 0.2791s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.0156s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.0s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank n:0.0, execution time: 0.0469s
One-class SVM (OCSVM) ROC:0.6189, precision @ rank n:0.25, execution time: 0.0s
Principal Component Analysis (PCA) ROC:0.622, precision @ rank n:0.25, execution time: 0.0s

... Processing ionosphere.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9004, precision @ rank n:0.8214, execution time: 0.0781s




Cluster-based Local Outlier Factor ROC:0.8952, precision @ rank n:0.8036, execution time: 0.0806s
Feature Bagging ROC:0.8933, precision @ rank n:0.75, execution time: 0.0617s
Histogram-base Outlier Detection (HBOS) ROC:0.5195, precision @ rank n:0.3393, execution time: 0.0156s
Isolation Forest ROC:0.8294, precision @ rank n:0.6607, execution time: 0.2959s
K Nearest Neighbors (KNN) ROC:0.9134, precision @ rank n:0.8393, execution time: 0.0156s
Local Outlier Factor (LOF) ROC:0.8989, precision @ rank n:0.75, execution time: 0.0156s
Minimum Covariance Determinant (MCD) ROC:0.9399, precision @ rank n:0.8571, execution time: 0.0545s
One-class SVM (OCSVM) ROC:0.8372, precision @ rank n:0.7143, execution time: 0.005s
Principal Component Analysis (PCA) ROC:0.7971, precision @ rank n:0.5893, execution time: 0.0123s

... Processing letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8465, precision @ rank n:0.275, execution time: 0.3398s
Cluster-based Local Outlier Factor ROC:0.7423, precisi



Feature Bagging ROC:0.866, precision @ rank n:0.4, execution time: 0.6818s
Histogram-base Outlier Detection (HBOS) ROC:0.5728, precision @ rank n:0.125, execution time: 0.0s
Isolation Forest ROC:0.5836, precision @ rank n:0.05, execution time: 0.361s
K Nearest Neighbors (KNN) ROC:0.845, precision @ rank n:0.3, execution time: 0.125s
Local Outlier Factor (LOF) ROC:0.8409, precision @ rank n:0.325, execution time: 0.0894s
Minimum Covariance Determinant (MCD) ROC:0.7499, precision @ rank n:0.075, execution time: 0.9975s
One-class SVM (OCSVM) ROC:0.5744, precision @ rank n:0.1, execution time: 0.0798s
Principal Component Analysis (PCA) ROC:0.48, precision @ rank n:0.05, execution time: 0.007s

... Processing lympho.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9382, precision @ rank n:0.4, execution time: 0.0269s
Cluster-based Local Outlier Factor ROC:0.9709, precision @ rank n:0.6, execution time: 0.0419s




Feature Bagging ROC:0.9673, precision @ rank n:0.6, execution time: 0.0409s
Histogram-base Outlier Detection (HBOS) ROC:0.9964, precision @ rank n:0.8, execution time: 0.009s
Isolation Forest ROC:0.9855, precision @ rank n:0.6, execution time: 0.3241s
K Nearest Neighbors (KNN) ROC:0.9636, precision @ rank n:0.6, execution time: 0.007s
Local Outlier Factor (LOF) ROC:0.9636, precision @ rank n:0.6, execution time: 0.002s
Minimum Covariance Determinant (MCD) ROC:0.9164, precision @ rank n:0.6, execution time: 0.0359s
One-class SVM (OCSVM) ROC:0.9636, precision @ rank n:0.6, execution time: 0.002s
Principal Component Analysis (PCA) ROC:0.9818, precision @ rank n:0.8, execution time: 0.002s

... Processing mnist.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 6.6252s




Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 0.5639s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 42.8234s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.0499s
Isolation Forest ROC:0.7813, precision @ rank n:0.3116, execution time: 1.7257s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 6.3426s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 5.3962s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 1.9748s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 4.2831s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.1249s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 1.8751s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.1732s




Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 12.04s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.0534s
Isolation Forest ROC:0.9992, precision @ rank n:0.9, execution time: 1.0252s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 1.6345s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 1.6597s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:0.9667, execution time: 8.5137s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 1.0909s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.1406s

... Processing optdigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.4428, precision @ rank n:0.0161, execution time: 2.1458s




Cluster-based Local Outlier Factor ROC:0.7852, precision @ rank n:0.0, execution time: 0.2813s
Feature Bagging ROC:0.4641, precision @ rank n:0.0484, execution time: 12.4229s
Histogram-base Outlier Detection (HBOS) ROC:0.8822, precision @ rank n:0.2581, execution time: 0.0156s
Isolation Forest ROC:0.5442, precision @ rank n:0.0161, execution time: 0.8215s
K Nearest Neighbors (KNN) ROC:0.3824, precision @ rank n:0.0, execution time: 1.6727s
Local Outlier Factor (LOF) ROC:0.4584, precision @ rank n:0.0484, execution time: 1.3913s




Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 0.8213s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 1.2514s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.0399s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 1.3261s




Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.2164s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 4.1565s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.0156s
Isolation Forest ROC:0.9482, precision @ rank n:0.2615, execution time: 0.5724s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 0.5512s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 0.5146s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @ rank n:0.0615, execution time: 1.8184s
One-class SVM (OCSVM) ROC:0.93, precision @ rank n:0.2923, execution time: 0.869s
Principal Component Analysis (PCA) ROC:0.9332, precision @ rank n:0.3385, execution time: 0.0156s

... Processing pima.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6757, precision @ rank n:0.5106, execution time: 0.1474s




Cluster-based Local Outlier Factor ROC:0.684, precision @ rank n:0.4681, execution time: 0.071s
Feature Bagging ROC:0.6446, precision @ rank n:0.4468, execution time: 0.0757s
Histogram-base Outlier Detection (HBOS) ROC:0.7169, precision @ rank n:0.5213, execution time: 0.0s
Isolation Forest ROC:0.6777, precision @ rank n:0.4787, execution time: 0.3146s
K Nearest Neighbors (KNN) ROC:0.7252, precision @ rank n:0.5106, execution time: 0.0363s
Local Outlier Factor (LOF) ROC:0.6604, precision @ rank n:0.4787, execution time: 0.009s
Minimum Covariance Determinant (MCD) ROC:0.7047, precision @ rank n:0.4787, execution time: 0.0675s
One-class SVM (OCSVM) ROC:0.6423, precision @ rank n:0.4574, execution time: 0.0s
Principal Component Analysis (PCA) ROC:0.6639, precision @ rank n:0.5, execution time: 0.0s

... Processing satellite.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5681, precision @ rank n:0.3918, execution time: 1.6065s




Cluster-based Local Outlier Factor ROC:0.7234, precision @ rank n:0.5574, execution time: 0.2386s
Feature Bagging ROC:0.557, precision @ rank n:0.4051, execution time: 6.5753s
Histogram-base Outlier Detection (HBOS) ROC:0.7393, precision @ rank n:0.5466, execution time: 0.0156s
Isolation Forest ROC:0.7094, precision @ rank n:0.578, execution time: 0.6477s
K Nearest Neighbors (KNN) ROC:0.6781, precision @ rank n:0.4994, execution time: 1.024s
Local Outlier Factor (LOF) ROC:0.5551, precision @ rank n:0.4051, execution time: 0.8801s
Minimum Covariance Determinant (MCD) ROC:0.792, precision @ rank n:0.6747, execution time: 1.603s
One-class SVM (OCSVM) ROC:0.636, precision @ rank n:0.5224, execution time: 1.2795s
Principal Component Analysis (PCA) ROC:0.5783, precision @ rank n:0.4559, execution time: 0.0219s

... Processing satimage-2.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.86, precision @ rank n:0.2593, execution time: 1.4214s




Cluster-based Local Outlier Factor ROC:0.9987, precision @ rank n:0.8889, execution time: 0.2152s
Feature Bagging ROC:0.4971, precision @ rank n:0.0741, execution time: 5.29s
Histogram-base Outlier Detection (HBOS) ROC:0.9837, precision @ rank n:0.5926, execution time: 0.0156s
Isolation Forest ROC:0.9973, precision @ rank n:0.8889, execution time: 0.5703s
K Nearest Neighbors (KNN) ROC:0.9505, precision @ rank n:0.3704, execution time: 0.7486s
Local Outlier Factor (LOF) ROC:0.5006, precision @ rank n:0.0741, execution time: 0.7034s
Minimum Covariance Determinant (MCD) ROC:0.9946, precision @ rank n:0.5185, execution time: 1.7102s
One-class SVM (OCSVM) ROC:0.9976, precision @ rank n:0.9259, execution time: 1.002s
Principal Component Analysis (PCA) ROC:0.9841, precision @ rank n:0.8519, execution time: 0.0156s

... Processing shuttle.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6186, precision @ rank n:0.1918, execution time: 12.4773s




Cluster-based Local Outlier Factor ROC:0.6286, precision @ rank n:0.2336, execution time: 0.5017s
Feature Bagging ROC:0.5211, precision @ rank n:0.111, execution time: 45.384s
Histogram-base Outlier Detection (HBOS) ROC:0.9851, precision @ rank n:0.9857, execution time: 0.0156s
Isolation Forest ROC:0.9972, precision @ rank n:0.9337, execution time: 2.4191s
K Nearest Neighbors (KNN) ROC:0.645, precision @ rank n:0.2199, execution time: 7.9781s
Local Outlier Factor (LOF) ROC:0.5347, precision @ rank n:0.1406, execution time: 10.1356s






Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 10.3088s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 40.1747s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.0312s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.064s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.0509s




Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.0582s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.002s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 0.3008s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.01s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.003s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank n:0.0, execution time: 0.0429s
One-class SVM (OCSVM) ROC:0.4087, precision @ rank n:0.0, execution time: 0.002s
Principal Component Analysis (PCA) ROC:0.3397, precision @ rank n:0.0, execution time: 0.002s

... Processing vowels.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9521, precision @ rank n:0.4706, execution time: 0.2325s
Cluster-based Local Outlier Factor ROC:0.9278, precision @ rank n:0.4118, execution time: 0.0625s




Feature Bagging ROC:0.9385, precision @ rank n:0.3529, execution time: 0.2651s
Histogram-base Outlier Detection (HBOS) ROC:0.6758, precision @ rank n:0.1765, execution time: 0.0s
Isolation Forest ROC:0.7469, precision @ rank n:0.1176, execution time: 0.3646s
K Nearest Neighbors (KNN) ROC:0.9568, precision @ rank n:0.5294, execution time: 0.0748s
Local Outlier Factor (LOF) ROC:0.9345, precision @ rank n:0.4118, execution time: 0.0323s
Minimum Covariance Determinant (MCD) ROC:0.6779, precision @ rank n:0.0, execution time: 0.6737s
One-class SVM (OCSVM) ROC:0.7415, precision @ rank n:0.2941, execution time: 0.0279s
Principal Component Analysis (PCA) ROC:0.5787, precision @ rank n:0.1176, execution time: 0.0s

... Processing wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9232, precision @ rank n:0.3, execution time: 0.0938s




Cluster-based Local Outlier Factor ROC:0.9063, precision @ rank n:0.6, execution time: 0.0633s
Feature Bagging ROC:0.9415, precision @ rank n:0.5, execution time: 0.0727s
Histogram-base Outlier Detection (HBOS) ROC:0.9592, precision @ rank n:0.7, execution time: 0.008s
Isolation Forest ROC:0.9451, precision @ rank n:0.5, execution time: 0.3224s
K Nearest Neighbors (KNN) ROC:0.9437, precision @ rank n:0.5, execution time: 0.019s
Local Outlier Factor (LOF) ROC:0.9352, precision @ rank n:0.4, execution time: 0.0069s
Minimum Covariance Determinant (MCD) ROC:0.8986, precision @ rank n:0.4, execution time: 0.0678s
One-class SVM (OCSVM) ROC:0.9408, precision @ rank n:0.5, execution time: 0.005s
Principal Component Analysis (PCA) ROC:0.9324, precision @ rank n:0.6, execution time: 0.003s


# Visualising ROC

In [28]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8478,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9316,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8294,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5836,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7813,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9992,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5442,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9482,0.7602,0.481,0.8271,0.93,0.9332


# Visualizing Precision Score

In [29]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5357,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.4531,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.3116,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2615,0.0462,0.0462,0.0615,0.2923,0.3385


# Visualising the Time Taken

In [30]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,1.3402,1.3516,0.5328,1.4572,0.3828,0.0781,0.0625,0.5984,0.0546,0.0429
0,cardio,1831,21,9.6122,0.3478,0.121,0.6889,0.0156,0.3519,0.1301,0.0915,0.4555,0.0937,0.0156
0,glass,214,9,4.2056,0.0312,0.0642,0.0346,0.0,0.2791,0.0156,0.0,0.0469,0.0,0.0
0,ionosphere,351,33,35.8974,0.0781,0.0806,0.0617,0.0156,0.2959,0.0156,0.0156,0.0545,0.005,0.0123
0,letter,1600,32,6.25,0.3398,0.1005,0.6818,0.0,0.361,0.125,0.0894,0.9975,0.0798,0.007
0,lympho,148,18,4.0541,0.0269,0.0419,0.0409,0.009,0.3241,0.007,0.002,0.0359,0.002,0.002
0,mnist,7603,100,9.2069,6.6252,0.5639,42.8234,0.0499,1.7257,6.3426,5.3962,1.9748,4.2831,0.1249
0,musk,3062,166,3.1679,1.8751,0.1732,12.04,0.0534,1.0252,1.6345,1.6597,8.5137,1.0909,0.1406
0,optdigits,5216,64,2.8758,2.1458,0.2813,12.4229,0.0156,0.8215,1.6727,1.3913,0.8213,1.2514,0.0399
0,pendigits,6870,16,2.2707,1.3261,0.2164,4.1565,0.0156,0.5724,0.5512,0.5146,1.8184,0.869,0.0156


# Define the number of inliers and outliers

In [31]:
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

# Show the statistics of the data

In [32]:
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 357
Number of outliers: 21
Ground truth shape is (378,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
