In [98]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

In [99]:
pip install pyod



In [100]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

In [101]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [102]:
# File names of the benchmark datasets (MATLAB .mat files), one per line so
# entries are easy to add, remove, or comment out.
mat_file_list = [
    'arrhythmia.mat',
    'cardio.mat',
    'glass.mat',
    'ionosphere.mat',
    'letter.mat',
    'lympho.mat',
    'mnist.mat',
    'musk.mat',
    'optdigits.mat',
    'pendigits.mat',
    'pima.mat',
    'satellite.mat',
    'satimage-2.mat',
    'shuttle.mat',
    'vertebral.mat',
    'vowels.mat',
    'wbc.mat',
]

In [103]:
# Peek at the raw contents of one dataset file: loadmat returns a dict, and the
# printout shows the MATLAB header entries alongside the 'X' and 'y' arrays.
data = loadmat('cardio.mat')
print(data)

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '__version__': '1.0', '__globals__': [], 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), 'y': array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])}


In [140]:
from time import time

# One seeded generator shared by the train/test split and every detector that
# accepts a random_state, so a fresh top-to-bottom run repeats the same draws.
random_state=np.random.RandomState(42)

# Leading columns of every result row; one column per detector follows them.
meta_columns = ['Data', '#Samples', '#Dimensions', 'Outlier Perc']

# Result rows collected across all datasets. Previously each per-dataset row
# was rebuilt on the next iteration, so only the last dataset's numbers
# survived the loop.
roc_rows = []
prn_rows = []
time_rows = []
detector_names = []

for mat_file in mat_file_list:
  print("\n... processing", mat_file,'...')
  mat = loadmat(mat_file)

  # Feature matrix and label vector; non-zero labels mark outliers.
  X = mat['X']
  y = mat['y'].ravel()
  outliers_fraction = np.count_nonzero(y) / len(y)
  outliers_percentage = round(outliers_fraction *  100, ndigits=4 )

  # Each row starts with the dataset description: name without the 4-character
  # '.mat' suffix, sample count, feature count, outlier percentage.
  dataset_name = mat_file[:-4]
  roc_list = [dataset_name, X.shape[0], X.shape[1], outliers_percentage]
  prn_list = [dataset_name, X.shape[0], X.shape[1], outliers_percentage]
  time_list = [dataset_name, X.shape[0], X.shape[1], outliers_percentage]

  # Hold out 40% of the samples for scoring.
  X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.4,  random_state=random_state)

  # NOTE(review): pyod's standardizer presumably fits the scaling on the train
  # split and applies it to both splits -- confirm against the PyOD docs.
  X_train_norm ,X_test_norm = standardizer(X_train, X_test)

  # Detectors are rebuilt for every dataset because the contamination rate is
  # set to that dataset's true outlier fraction.
  classifiers = {'Angle-based Outliers Detector (ABOD) ' : ABOD(contamination=outliers_fraction),
                 'Cluster-based Local Outlier Factor' : CBLOF(contamination=outliers_fraction,  check_estimator=False, random_state=random_state),
                 "Feature Bagging": FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                 'Histogram-base Outlier Detection (HBOS) ' : HBOS(contamination=outliers_fraction),
                 'Isolation Forest':  IForest(contamination=outliers_fraction, random_state=random_state), 
                 'K Nearest Neighbours (KNN)': KNN(contamination=outliers_fraction), 
                 "Local Outlier Factor (LOF)": LOF(contamination=outliers_fraction),
                 "Minimum Convariance Determinant (MCD)": MCD(contamination=outliers_fraction, random_state=random_state), 
                 "One-class SVM(OCSVM)": OCSVM(contamination=outliers_fraction),
                 "Principal Component Analysis (PCA)" : PCA(contamination=outliers_fraction, random_state=random_state)}

  # Column labels for the summary tables; dict order matches the append order
  # below. Whitespace is stripped only for the labels, not the printed names.
  detector_names = [name.strip() for name in classifiers]

  for clf_name, clf in classifiers.items():
    # Time fitting on the train split plus scoring of the test split together.
    t0=time()
    clf.fit(X_train_norm)
    test_scores=clf.decision_function(X_test_norm)
    t1=time()
    duration=round(t1-t0, ndigits=4)
    time_list.append(duration)

    # The labels are used only here, to evaluate the scores; fit() never sees them.
    roc=round(roc_auc_score(y_test, test_scores), ndigits=4)
    prn=round(precision_n_scores(y_test, test_scores), ndigits=4)

    print('{clf_name} ROC : {roc}, precision @ rank n : {prn}, '
          'execution time : {duration}s'.format(clf_name=clf_name, roc=roc,prn=prn,duration=duration))

    roc_list.append(roc)
    prn_list.append(prn)

  # Keep this dataset's rows so they are not lost on the next iteration.
  roc_rows.append(roc_list)
  prn_rows.append(prn_list)
  time_rows.append(time_list)

# One table per metric: a row per dataset, a column per detector.
result_columns = meta_columns + detector_names
roc_df = pd.DataFrame(roc_rows, columns=result_columns)
prn_df = pd.DataFrame(prn_rows, columns=result_columns)
time_df = pd.DataFrame(time_rows, columns=result_columns)


... processing arrhythmia.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.7687, precision @ rank n : 0.3571, execution time : 0.1814s
Cluster-based Local Outlier Factor ROC : 0.7684, precision @ rank n : 0.4643, execution time : 0.1883s
Feature Bagging ROC : 0.7799, precision @ rank n : 0.5, execution time : 0.681s
Histogram-base Outlier Detection (HBOS)  ROC : 0.8511, precision @ rank n : 0.5714, execution time : 0.0723s
Isolation Forest ROC : 0.8527, precision @ rank n : 0.5714, execution time : 0.4871s
K Nearest Neighbours (KNN) ROC : 0.782, precision @ rank n : 0.5, execution time : 0.0979s
Local Outlier Factor (LOF) ROC : 0.7787, precision @ rank n : 0.4643, execution time : 0.0827s




Minimum Convariance Determinant (MCD) ROC : 0.8228, precision @ rank n : 0.4286, execution time : 1.2562s
One-class SVM(OCSVM) ROC : 0.7986, precision @ rank n : 0.5, execution time : 0.0621s
Principal Component Analysis (PCA) ROC : 0.8, precision @ rank n : 0.5, execution time : 0.082s

... processing cardio.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.5763, precision @ rank n : 0.1875, execution time : 0.4251s
Cluster-based Local Outlier Factor ROC : 0.8221, precision @ rank n : 0.4844, execution time : 0.2609s
Feature Bagging ROC : 0.4879, precision @ rank n : 0.1406, execution time : 0.8409s
Histogram-base Outlier Detection (HBOS)  ROC : 0.8453, precision @ rank n : 0.4688, execution time : 0.007s
Isolation Forest ROC : 0.9414, precision @ rank n : 0.5, execution time : 0.4348s
K Nearest Neighbours (KNN) ROC : 0.6959, precision @ rank n : 0.2812, execution time : 0.1503s
Local Outlier Factor (LOF) ROC : 0.4715, precision @ rank n : 0.125, execution time : 0.1062s




Minimum Convariance Determinant (MCD) ROC : 0.8778, precision @ rank n : 0.3906, execution time : 0.8823s
One-class SVM(OCSVM) ROC : 0.9507, precision @ rank n : 0.5938, execution time : 0.1363s
Principal Component Analysis (PCA) ROC : 0.9638, precision @ rank n : 0.6875, execution time : 0.006s

... processing glass.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.7104, precision @ rank n : 0.25, execution time : 0.0613s
Cluster-based Local Outlier Factor ROC : 0.8506, precision @ rank n : 0.25, execution time : 0.0652s
Feature Bagging ROC : 0.7043, precision @ rank n : 0.25, execution time : 0.0399s
Histogram-base Outlier Detection (HBOS)  ROC : 0.6524, precision @ rank n : 0.0, execution time : 0.005s
Isolation Forest ROC : 0.7195, precision @ rank n : 0.25, execution time : 0.3288s
K Nearest Neighbours (KNN) ROC : 0.7805, precision @ rank n : 0.25, execution time : 0.0091s
Local Outlier Factor (LOF) ROC : 0.7774, precision @ rank n : 0.25, execution time : 0.0041s
Minimum Conv



Angle-based Outliers Detector (ABOD)  ROC : 0.7813, precision @ rank n : 0.3562, execution time : 8.3074s
Cluster-based Local Outlier Factor ROC : 0.8447, precision @ rank n : 0.4007, execution time : 1.2227s
Feature Bagging ROC : 0.7259, precision @ rank n : 0.3664, execution time : 58.5885s
Histogram-base Outlier Detection (HBOS)  ROC : 0.5675, precision @ rank n : 0.1199, execution time : 0.0614s
Isolation Forest ROC : 0.7801, precision @ rank n : 0.2979, execution time : 1.584s
K Nearest Neighbours (KNN) ROC : 0.8409, precision @ rank n : 0.4144, execution time : 7.8034s
Local Outlier Factor (LOF) ROC : 0.7085, precision @ rank n : 0.339, execution time : 7.4489s




Minimum Convariance Determinant (MCD) ROC : 0.863, precision @ rank n : 0.3973, execution time : 4.2148s
One-class SVM(OCSVM) ROC : 0.8417, precision @ rank n : 0.3801, execution time : 5.9833s
Principal Component Analysis (PCA) ROC : 0.8396, precision @ rank n : 0.3767, execution time : 0.1388s

... processing musk.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.0809, precision @ rank n : 0.0333, execution time : 2.4817s
Cluster-based Local Outlier Factor ROC : 1.0, precision @ rank n : 1.0, execution time : 0.4458s
Feature Bagging ROC : 0.5228, precision @ rank n : 0.1667, execution time : 15.1285s
Histogram-base Outlier Detection (HBOS)  ROC : 0.9999, precision @ rank n : 0.9667, execution time : 0.0768s
Isolation Forest ROC : 0.9996, precision @ rank n : 0.9333, execution time : 1.0752s
K Nearest Neighbours (KNN) ROC : 0.7348, precision @ rank n : 0.2333, execution time : 2.0405s
Local Outlier Factor (LOF) ROC : 0.5323, precision @ rank n : 0.1333, execution time : 1.8808s
Mi



Minimum Convariance Determinant (MCD) ROC : 0.3486, precision @ rank n : 0.0, execution time : 2.0958s
One-class SVM(OCSVM) ROC : 0.4972, precision @ rank n : 0.0, execution time : 1.9279s
Principal Component Analysis (PCA) ROC : 0.504, precision @ rank n : 0.0, execution time : 0.047s

... processing pendigits.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.7008, precision @ rank n : 0.0308, execution time : 1.4589s
Cluster-based Local Outlier Factor ROC : 0.9609, precision @ rank n : 0.3077, execution time : 0.4313s
Feature Bagging ROC : 0.4687, precision @ rank n : 0.0462, execution time : 4.0726s
Histogram-base Outlier Detection (HBOS)  ROC : 0.9294, precision @ rank n : 0.2615, execution time : 0.01s
Isolation Forest ROC : 0.9422, precision @ rank n : 0.2769, execution time : 0.7933s
K Nearest Neighbours (KNN) ROC : 0.7602, precision @ rank n : 0.0462, execution time : 0.5661s
Local Outlier Factor (LOF) ROC : 0.481, precision @ rank n : 0.0462, execution time : 0.5111s
Minim



Minimum Convariance Determinant (MCD) ROC : 0.9903, precision @ rank n : 0.7534, execution time : 17.4198s
One-class SVM(OCSVM) ROC : 0.9922, precision @ rank n : 0.9553, execution time : 68.8122s
Principal Component Analysis (PCA) ROC : 0.9902, precision @ rank n : 0.9503, execution time : 0.0346s

... processing vertebral.mat ...
Angle-based Outliers Detector (ABOD)  ROC : 0.2797, precision @ rank n : 0.0, execution time : 0.0636s
Cluster-based Local Outlier Factor ROC : 0.3908, precision @ rank n : 0.0, execution time : 0.0532s
Feature Bagging ROC : 0.3027, precision @ rank n : 0.0, execution time : 0.0462s
Histogram-base Outlier Detection (HBOS)  ROC : 0.2695, precision @ rank n : 0.0, execution time : 0.0031s
Isolation Forest ROC : 0.3576, precision @ rank n : 0.0, execution time : 0.3211s
K Nearest Neighbours (KNN) ROC : 0.318, precision @ rank n : 0.0, execution time : 0.0102s
Local Outlier Factor (LOF) ROC : 0.318, precision @ rank n : 0.0, execution time : 0.0049s
Minimum Conv