In [1]:
# anaconda prompt
# pip install pyod or conda install -c conda-forge pyod
# colab 
# !pip install pyod

# Import Python Packages

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Import Pyod Packages & the Methods

In [3]:
# Methods for anomaly detection
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



# Import Metrics Packages

In [4]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores #measure to identify perfomance of methods
from sklearn.metrics import roc_auc_score

# Define data file and read X and y

In [7]:
mat_file_list = ['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat','lympho.mat','mnist.mat',
                 'musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat',
                 'vertebral.mat','vowels.mat','wbc.mat']
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

# Loading mat file

In [8]:
data=loadmat('cardio.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [9]:
len(data)

5

In [10]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [11]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

# Input Feature Shape in Mat file format

In [12]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

# Output Feature Shape in Mat file format

In [13]:
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [14]:
df_columns=['Data','#Samples','#Dimension','Outlier Perc',
        'ABOD','LOF','CBLOF','FB','HBOS','IForest',
       'KNN','MCD','OCSVM','PCA']

# ROC(Region of characterstics) Performance evolution table

In [16]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA


# precision_n_scores Perfrmance Evolution Table

In [17]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA


# Time dataframe

In [18]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA


# Exploaring all mat files

In [19]:
from time import time
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n",mat_file)
    mat=loadmat(os.path.join(mat_file))
    X=mat['X']
    y=np.ravel(mat['y']) #method to convert 2d into 1d
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    
    X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.4,random_state=random_state)
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    classifiers= {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                 'Cluster-based Local Outlier Factor':CBLOF(contamination=outliers_fraction, check_estimator= False, random_state=random_state),
                 'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                 'K Nearest Neighbors(KNN)': KNN(contamination=outliers_fraction),
                 'Local Outlier Factor(LOF)': LOF(contamination=outliers_fraction),
                 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state)}
    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1- t0, ndigits=4)
        time_list.append(duration)
        
        roc= round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn},''execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn ,duration=duration))
        
        roc_list.append(roc)
        prn_list.append(prn)
        
    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)
        
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
        
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


 arrhythmia.mat
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571,execution time: 2.8714s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643,execution time: 3.1879s
Feature Bagging ROC:0.7799, precision @ rank n:0.5,execution time: 0.5886s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714,execution time: 1.9524s
Isolation Forest ROC:0.8527, precision @ rank n:0.5714,execution time: 0.6632s
K Nearest Neighbors(KNN) ROC:0.782, precision @ rank n:0.5,execution time: 0.1069s
Local Outlier Factor(LOF) ROC:0.7787, precision @ rank n:0.4643,execution time: 0.079s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286,execution time: 1.9241s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5,execution time: 0.038s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5,execution time: 0.1149s

 cardio.mat
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875,execution time: 0.6406s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844,execution time: 0.1789s
Feature Bagging ROC:0.4879, precision @ rank n:0.1406,execution time: 1.0274s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688,execution time: 0.014s
Isolation Forest ROC:0.9414, precision @ rank n:0.5,execution time: 0.5487s
K Nearest Neighbors(KNN) ROC:0.6959, precision @ rank n:0.2812,execution time: 0.1979s
Local Outlier Factor(LOF) ROC:0.4715, precision @ rank n:0.125,execution time: 0.1129s




Minimum Covariance Determinant (MCD) ROC:0.8781, precision @ rank n:0.3906,execution time: 1.0924s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938,execution time: 0.0969s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875,execution time: 0.067s

 glass.mat
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25,execution time: 0.0859s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25,execution time: 0.078s
Feature Bagging ROC:0.7043, precision @ rank n:0.25,execution time: 0.05s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0,execution time: 0.004s
Isolation Forest ROC:0.7195, precision @ rank n:0.25,execution time: 0.4367s
K Nearest Neighbors(KNN) ROC:0.7805, precision @ rank n:0.25,execution time: 0.017s
Local Outlier Factor(LOF) ROC:0.7774, precision @ rank n:0.25,execution time: 0.006s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank n:0.0,execution time: 0.085s
One-cl



Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562,execution time: 9.7s
Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007,execution time: 1.2422s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664,execution time: 58.649s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199,execution time: 0.1134s
Isolation Forest ROC:0.7801, precision @ rank n:0.2979,execution time: 4.5502s
K Nearest Neighbors(KNN) ROC:0.8409, precision @ rank n:0.4144,execution time: 9.8851s
Local Outlier Factor(LOF) ROC:0.7085, precision @ rank n:0.339,execution time: 6.7528s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973,execution time: 4.859s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801,execution time: 4.3575s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767,execution time: 0.2241s

 musk.mat
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333,execution time: 2.8596s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0,execution time: 0.4299s
Feature Bagging ROC:0.5228, precision @ rank n:0.1667,execution time: 15.1093s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667,execution time: 0.0815s
Isolation Forest ROC:0.9996, precision @ rank n:0.9333,execution time: 2.0863s
K Nearest Neighbors(KNN) ROC:0.7348, precision @ rank n:0.2333,execution time: 2.4275s
Local Outlier Factor(LOF) ROC:0.5323, precision @ rank n:0.1333,execution time: 1.6527s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:0.9667,execution time:



Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0,execution time: 1.9141s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0,execution time: 1.4954s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0,execution time: 0.0869s

 pendigits.mat
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308,execution time: 2.7s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077,execution time: 0.3428s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462,execution time: 5.6969s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615,execution time: 0.0156s
Isolation Forest ROC:0.9422, precision @ rank n:0.2769,execution time: 0.8876s
K Nearest Neighbors(KNN) ROC:0.7602, precision @ rank n:0.0462,execution time: 0.8453s
Local Outlier Factor(LOF) ROC:0.481, precision @ rank n:0.0462,execution time: 0.684s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @ rank n:0.0615,execution time:



Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534,execution time: 14.6989s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553,execution time: 73.0463s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503,execution time: 0.047s

 vertebral.mat
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0,execution time: 0.0859s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0,execution time: 0.074s
Feature Bagging ROC:0.3027, precision @ rank n:0.0,execution time: 0.051s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0,execution time: 0.003s
Isolation Forest ROC:0.3576, precision @ rank n:0.0,execution time: 0.3826s
K Nearest Neighbors(KNN) ROC:0.318, precision @ rank n:0.0,execution time: 0.02s
Local Outlier Factor(LOF) ROC:0.318, precision @ rank n:0.0,execution time: 0.003s
Minimum Covariance Determinant (MCD) ROC:0.3321, precision @ rank n:0.0,execution time: 0.085s
One-clas

In [20]:
roc_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8781,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [21]:
time_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,2.8714,3.1879,0.5886,1.9524,0.6632,0.1069,0.079,1.9241,0.038,0.1149
0,cardio,1831,21,9.6122,0.6406,0.1789,1.0274,0.014,0.5487,0.1979,0.1129,1.0924,0.0969,0.067
0,glass,214,9,4.2056,0.0859,0.078,0.05,0.004,0.4367,0.017,0.006,0.085,0.002,0.016
0,ionosphere,351,33,35.8974,0.1229,0.0829,0.0929,0.012,0.4437,0.034,0.01,0.1069,0.006,0.019
0,letter,1600,32,6.25,0.6796,0.1709,0.8934,0.013,0.6126,0.1869,0.1209,1.5061,0.0819,0.006
0,lympho,148,18,4.0541,0.046,0.053,0.038,0.007,0.4077,0.01,0.006,0.048,0.001,0.002
0,mnist,7603,100,9.2069,9.7,1.2422,58.649,0.1134,4.5502,9.8851,6.7528,4.859,4.3575,0.2241
0,musk,3062,166,3.1679,2.8596,0.4299,15.1093,0.0815,2.0863,2.4275,1.6527,19.9487,1.1604,0.1778
0,optdigits,5216,64,2.8758,3.561,0.5648,16.249,0.051,1.4587,2.455,1.8556,1.9141,1.4954,0.0869
0,pendigits,6870,16,2.2707,2.7,0.3428,5.6969,0.0156,0.8876,0.8453,0.684,2.691,1.1589,0.01


In [22]:
prn_df

Unnamed: 0,Data,#Samples,#Dimension,Outlier Perc,ABOD,LOF,CBLOF,FB,HBOS,IForest,KNN,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385
