## Import packages

In [1]:
import os, sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Import performance metrics
from pyod.utils.utility import standardizer, precision_n_scores
from sklearn.metrics import roc_auc_score

## Import algorithms

We will compare the ROC, precision and time of 10 different outlier detection algorithms:

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

columns = [
    'PCA',
    'MCD',
    'OCSVM',
    'LOF',
    'CBLOF',
    'KNN',
    'HBOS',
    'ABOD',
    'IForest',
    'FB'
]
df_about = pd.DataFrame(columns=['Dataset', '# Samples', '# Dimensions', 'Missing %'])
df_roc   = pd.DataFrame(columns=columns)  # ROC Performance
df_prn   = pd.DataFrame(columns=columns)  # Precision n scores
df_time  = pd.DataFrame(columns=columns)  # Time

## Train models

Files: https://drive.google.com/drive/folders/1oNbHnB_PrJC_s3GCbbQOCw_FDbUJpP1v

In [3]:
!ls data

arrhythmia.mat	letter.mat  optdigits.mat  satellite.mat   vowels.mat
cardio.mat	lympho.mat  pendigits.mat  satimage-2.mat  wbc.mat
glass.mat	mnist.mat   pima.mat	   shuttle.mat
ionosphere.mat	musk.mat    README.md	   vertebral.mat


In [4]:
# Check MATLAB format
loadmat('data/arrhythmia.mat')

{'__header__': b'MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Tue May 17 11:53:12 2016',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 75. ,   0. , 190. , ...,   2.9,  23.3,  49.4],
        [ 56. ,   1. , 165. , ...,   2.1,  20.4,  38.8],
        [ 54. ,   0. , 172. , ...,   3.4,  12.3,  49. ],
        ...,
        [ 36. ,   0. , 166. , ...,   1. , -44.2, -33.2],
        [ 32. ,   1. , 155. , ...,   2.4,  25. ,  46.6],
        [ 78. ,   1. , 160. , ...,   1.6,  21.3,  32.8]]),
 'y': array([[1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
 

### Testing with a single dataset

In [5]:
from time import time

import warnings
warnings.filterwarnings('ignore')

In [6]:
def stat_models(file):
    '''
    @param string file - Filepath of the dataset to use
    '''
    global df_about
    global df_roc
    global df_prn
    global df_time

    print("Processing " + file)

    # Import dataset
    mat = loadmat('data/' + file)

    X = mat['X']
    y = mat['y'].ravel()

    # Create a record to store performance
    # of different algorithms on this dataset
    n       = len(y)
    missing = np.count_nonzero(y) / n

    data_about = [file, n, X.ndim, round(missing * 100, ndigits=4)]
    data_time  = []
    data_roc   = []
    data_prn   = []

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

    # Standardize data
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Create different models to try
    classifiers = {
        'Principal Component Analysis (PCA)':
            PCA(contamination=missing, random_state=1),

        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=missing, random_state=1),

        'One-Class Support Vector Machine (OCSVM)':
            OCSVM(contamination=missing),

        'Local Outlier Factor (LOF)':
            LOF(contamination=missing),

        'Cluster-Based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=missing, check_estimator=False, random_state=1),

        'K-Nearest Neighbors (KNN)':
            KNN(contamination=missing),

        'Histogram-Base Outlier Detection (HBOS)':
            HBOS(contamination=missing),

        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=missing),

        'Isolation Forest (IForest)':
            IForest(contamination=missing, random_state=1),

        'Feature Bagging (FB)':
            FeatureBagging(contamination=missing, random_state=1)   
    }

    # Run classifiers on dataset
    for clf_name, clf in classifiers.items():
        print("> " + clf_name)

        # Run algorithm
        t0 = time()
        clf.fit(X_train_norm)
        y_predict = clf.decision_function(X_test_norm)
        t1 = time()

        # Add info to performance record
        t   = round(t1 - t0, ndigits=4)
        data_time.append(t)

        roc = round(roc_auc_score(y_test, y_predict), ndigits=4)
        data_roc.append(roc)

        prn = round(precision_n_scores(y_test, y_predict), ndigits=4)
        data_prn.append(prn)

        print('> ROC: {roc:.4f}, precision: {prn:.4f} [time {duration:.4f}s]\n'.format(
            roc=roc,
            prn=prn,
            duration=t
        ))

    i = len(df_roc)
    df_about.loc[i] = data_about
    df_roc.loc[i]   = data_roc
    df_prn.loc[i]   = data_prn
    df_time.loc[i]  = data_time

stat_models('arrhythmia.mat')

Processing arrhythmia.mat
> Principal Component Analysis (PCA)
> ROC: 0.7859, precision: 0.5667 [time 0.0607s]

> Minimum Covariance Determinant (MCD)
> ROC: 0.7764, precision: 0.4000 [time 0.4106s]

> One-Class Support Vector Machine (OCSVM)
> ROC: 0.7875, precision: 0.5333 [time 0.0434s]

> Local Outlier Factor (LOF)
> ROC: 0.7870, precision: 0.5000 [time 0.0638s]

> Cluster-Based Local Outlier Factor (CBLOF)
> ROC: 0.7797, precision: 0.4667 [time 1.1364s]

> K-Nearest Neighbors (KNN)
> ROC: 0.7921, precision: 0.5000 [time 0.0709s]

> Histogram-Base Outlier Detection (HBOS)
> ROC: 0.8086, precision: 0.6000 [time 1.0251s]

> Angle-based Outlier Detector (ABOD)
> ROC: 0.7523, precision: 0.4333 [time 0.7412s]

> Isolation Forest (IForest)
> ROC: 0.8155, precision: 0.5333 [time 0.2196s]

> Feature Bagging (FB)
> ROC: 0.7843, precision: 0.5000 [time 0.5145s]



In [7]:
df_about

Unnamed: 0,Dataset,# Samples,# Dimensions,Missing %
0,arrhythmia.mat,452,2,14.6018


In [8]:
df_roc

Unnamed: 0,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FB
0,0.7859,0.7764,0.7875,0.787,0.7797,0.7921,0.8086,0.7523,0.8155,0.7843


In [9]:
df_prn

Unnamed: 0,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FB
0,0.5667,0.4,0.5333,0.5,0.4667,0.5,0.6,0.4333,0.5333,0.5


In [10]:
df_time

Unnamed: 0,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FB
0,0.0607,0.4106,0.0434,0.0638,1.1364,0.0709,1.0251,0.7412,0.2196,0.5145


### Run on all datasets

In [12]:
files = [
    'arrhythmia.mat',
    'letter.mat',
    'optdigits.mat',
    'satellite.mat',
    'vowels.mat',
    'cardio.mat',
    'lympho.mat', 
    'pendigits.mat',
    'satimage-2.mat',
    'wbc.mat',
    'glass.mat',
    'mnist.mat',
    'pima.mat',
    'shuttle.mat',
    'ionosphere.mat',
    'musk.mat',
    'vertebral.mat'
]

for file in files[1:]:
    stat_models(file)
    print('--------------------------------------------------\n')

Processing letter.mat
> Principal Component Analysis (PCA)
> ROC: 0.5723, precision: 0.1212 [time 0.1334s]

> Minimum Covariance Determinant (MCD)
> ROC: 0.8089, precision: 0.1818 [time 0.8274s]

> One-Class Support Vector Machine (OCSVM)
> ROC: 0.6509, precision: 0.1515 [time 0.0828s]

> Local Outlier Factor (LOF)
> ROC: 0.8838, precision: 0.3939 [time 0.0762s]

> Cluster-Based Local Outlier Factor (CBLOF)
> ROC: 0.7869, precision: 0.2121 [time 0.0839s]

> K-Nearest Neighbors (KNN)
> ROC: 0.8874, precision: 0.3333 [time 0.1057s]

> Histogram-Base Outlier Detection (HBOS)
> ROC: 0.5741, precision: 0.0909 [time 0.0080s]

> Angle-based Outlier Detector (ABOD)
> ROC: 0.8886, precision: 0.3333 [time 0.2609s]

> Isolation Forest (IForest)
> ROC: 0.6470, precision: 0.0909 [time 0.2474s]

> Feature Bagging (FB)
> ROC: 0.8818, precision: 0.3636 [time 0.5763s]

--------------------------------------------------

Processing optdigits.mat
> Principal Component Analysis (PCA)
> ROC: 0.5021, precis

> ROC: 0.8036, precision: 0.0000 [time 0.1495s]

> Feature Bagging (FB)
> ROC: 0.8810, precision: 0.0000 [time 0.0257s]

--------------------------------------------------

Processing mnist.mat
> Principal Component Analysis (PCA)
> ROC: 0.8492, precision: 0.3320 [time 0.0585s]

> Minimum Covariance Determinant (MCD)
> ROC: 0.8315, precision: 0.1406 [time 1.4729s]

> One-Class Support Vector Machine (OCSVM)
> ROC: 0.8482, precision: 0.3359 [time 4.6428s]

> Local Outlier Factor (LOF)
> ROC: 0.7040, precision: 0.3125 [time 5.5231s]

> Cluster-Based Local Outlier Factor (CBLOF)
> ROC: 0.8441, precision: 0.3672 [time 0.7480s]

> K-Nearest Neighbors (KNN)
> ROC: 0.8461, precision: 0.3828 [time 5.7838s]

> Histogram-Base Outlier Detection (HBOS)
> ROC: 0.5726, precision: 0.0977 [time 0.0382s]

> Angle-based Outlier Detector (ABOD)
> ROC: 0.7841, precision: 0.3398 [time 6.1933s]

> Isolation Forest (IForest)
> ROC: 0.8076, precision: 0.3047 [time 1.0516s]

> Feature Bagging (FB)
> ROC: 0.713

In [13]:
df_about.to_csv("df_about.csv", index=False)
df_roc.to_csv("df_roc.csv", index=False)
df_prn.to_csv("df_prn.csv", index=False)
df_time.to_csv("df_time.csv", index=False)

## Analyze performance

In [14]:
df_about

Unnamed: 0,Dataset,# Samples,# Dimensions,Missing %
0,arrhythmia.mat,452,2,14.6018
1,letter.mat,1600,2,6.25
2,optdigits.mat,5216,2,2.8758
3,satellite.mat,6435,2,31.6395
4,vowels.mat,1456,2,3.4341
5,cardio.mat,1831,2,9.6122
6,lympho.mat,148,2,4.0541
7,pendigits.mat,6870,2,2.2707
8,satimage-2.mat,5803,2,1.2235
9,wbc.mat,378,2,5.5556


In [16]:
df_roc

Unnamed: 0,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IForest,FB
0,0.7859,0.7764,0.7875,0.787,0.7797,0.7921,0.8086,0.7523,0.8155,0.7843
1,0.5723,0.8089,0.6509,0.8838,0.7869,0.8874,0.5741,0.8886,0.647,0.8818
2,0.5021,0.3745,0.4946,0.4621,0.7537,0.3713,0.8509,0.4815,0.7707,0.4398
3,0.5972,0.806,0.661,0.5762,0.7913,0.6901,0.7452,0.5876,0.6899,0.573
4,0.6431,0.8498,0.7845,0.9315,0.8843,0.961,0.7265,0.9522,0.772,0.9377
5,0.944,0.7689,0.9281,0.5807,0.7759,0.728,0.8236,0.6028,0.9192,0.6195
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9397,1.0,1.0
7,0.94,0.8395,0.9333,0.4484,0.972,0.741,0.9336,0.6897,0.9439,0.448
8,0.9978,0.9962,1.0,0.2696,1.0,0.9532,0.997,0.7649,0.9998,0.2751
9,0.9246,0.9155,0.9282,0.9176,0.8838,0.9366,0.9683,0.9085,0.9254,0.9197
