In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

## Import PyOD Packages

In [3]:
from pyod.models.pca import  PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



In [4]:
#Metrics packages
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [5]:
# Benchmark datasets (MATLAB .mat files), listed in the order they are processed.
mat_files = [
    "arrhythmia.mat",
    "cardio.mat",
    "glass.mat",
    "ionosphere.mat",
    "letter.mat",
    "lympho.mat",
    "mnist.mat",
    "musk.mat",
    "optdigits.mat",
    "pendigits.mat",
    "pima.mat",
    "satellite.mat",
    "satimage-2.mat",
    "shuttle.mat",
    "vertebral.mat",
    "vowels.mat",
    "wbc.mat",
]

In [6]:
# Seeded generator handed to the train/test split and to the stochastic
# detectors so that results are repeatable.
random_state = np.random.RandomState(seed=42)

In [7]:
# Column layout shared by all result tables: four dataset-description
# columns followed by one column per detector.
df_columns = [
    'Data', '#Samples', '#Dimensions', 'Outlier Perc',
    'ABOD', 'CBLOF', 'FB', 'HBOS', 'IForest',
    'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA',
]

# Three independent, initially empty tables: ROC AUC, precision @ rank n,
# and execution time.
roc_df, prn_df, time_df = (pd.DataFrame(columns=df_columns) for _ in range(3))


In [8]:
# Load a single dataset up front so the structure of these .mat files can be
# inspected in the following cells before running the full benchmark.
data = loadmat('data/cardio.mat')

In [9]:
# Show the loaded dict: MATLAB header metadata plus the 'X' and 'y' arrays.
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [10]:
# Number of top-level entries in the loaded dict.
len(data)

5

In [11]:
# Entry names: three '__...__' metadata fields plus the 'X' and 'y' arrays.
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [12]:
# Shape and Type of Independent/Input variable
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

In [13]:
# Shape and Type of Dependent/Output variable
type(data['y']),data['y'].shape

(numpy.ndarray, (1831, 1))

In [14]:
# (Re)create the empty ROC AUC results table and display its column layout.
# This discards any rows collected by an earlier run of the benchmark cell.
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [15]:
# (Re)create the empty precision @ rank n results table and display its
# column layout. This discards any rows collected by an earlier run of the
# benchmark cell.
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [16]:
# (Re)create the empty execution-time results table and display its column
# layout. This discards any rows collected by an earlier run of the
# benchmark cell.
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


# Exploring all mat files: benchmarking every detector on each dataset

In [19]:
from time import time

# Benchmark every PyOD detector on every dataset in `mat_files`.
#
# For each dataset the cell:
#   1. loads X / y and derives the outlier fraction from the labels,
#   2. makes a 60/40 train/test split and standardizes both parts,
#   3. fits each detector on the training part and scores the test part,
#   4. records ROC AUC, precision @ rank n and the elapsed fit+score time.
#
# Results end up in `roc_df`, `prn_df` and `time_df` (one row per dataset,
# columns as in `df_columns`). The tables are rebuilt from scratch on every
# run, so re-executing this cell does not append duplicate rows.

# Re-seed here so the cell produces the same splits and models on every run,
# regardless of how much generator state earlier cells consumed.
random_state = np.random.RandomState(42)

# One record per dataset. The DataFrames are created once after the loop
# instead of being grown with pd.concat on every iteration, which copied the
# whole table each time and left every row with index 0.
roc_rows, prn_rows, time_rows = [], [], []

for mat_file in mat_files:
    print("\n  Processing", mat_file, ". . .\n")
    mat = loadmat(os.path.join('data', mat_file))

    X = mat['X']
    # Labels load as a 2-D single-column array; flatten to 1-D for sklearn.
    y = mat['y'].ravel()

    # Non-zero labels mark outliers.
    outlier_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outlier_fraction * 100, ndigits=4)

    # Dataset description shared by the three result rows. splitext drops the
    # extension whatever its length (the original sliced off 4 characters).
    dataset_name = os.path.splitext(mat_file)[0]
    dataset_info = [dataset_name, X.shape[0], X.shape[1], outliers_percentage]
    roc_list = list(dataset_info)
    prn_list = list(dataset_info)
    time_list = list(dataset_info)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=random_state)

    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Insertion order must match the detector columns of `df_columns`
    # (ABOD, CBLOF, FB, HBOS, IForest, KNN, LOF, MCD, OCSVM, PCA), because
    # the scores are appended to the rows positionally.
    # Every detector is given the dataset's actual outlier fraction as its
    # `contamination` setting.
    classifiers = {
        'Angle Based Outlier Detection(ABOD)': ABOD(contamination=outlier_fraction),
        'Cluster-Based Local Outlier Factor(CBLOF)': CBLOF(contamination=outlier_fraction,
                                                           check_estimator=False,
                                                           random_state=random_state),
        'FeatureBagging': FeatureBagging(contamination=outlier_fraction,
                                         random_state=random_state),
        'Histogram Based Outlier Detection': HBOS(contamination=outlier_fraction),
        'Isolation Forest': IForest(contamination=outlier_fraction,
                                    random_state=random_state),
        'K-Nearest Neighbour': KNN(contamination=outlier_fraction),
        'Local Outleir factor': LOF(contamination=outlier_fraction),
        'Minimum Covariance Determinant': MCD(contamination=outlier_fraction,
                                              random_state=random_state),
        'One-Class SVM': OCSVM(contamination=outlier_fraction),
        'Principle Component Analysis': PCA(contamination=outlier_fraction,
                                            random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        # The timed span covers fitting on the training part plus scoring
        # the test part.
        t0 = time()
        clf.fit(X_train_norm)
        test_score = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_score), ndigits=4)
        prn = round(precision_n_scores(y_test, test_score), ndigits=4)

        print('{clf_name} ROC:{roc}, Precision @ rank n:{prn},''Execution Time:{duration}s'.format
              (clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    time_rows.append(time_list)
    roc_rows.append(roc_list)
    prn_rows.append(prn_list)

# Build each result table in a single step; rows get a 0..n-1 index.
time_df = pd.DataFrame(time_rows, columns=df_columns)
roc_df = pd.DataFrame(roc_rows, columns=df_columns)
prn_df = pd.DataFrame(prn_rows, columns=df_columns)

/n  Processing arrhythmia.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.7687, Precision @ rank n:0.3571,Execution Time:0.1167s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.7684, Precision @ rank n:0.4643,Execution Time:1.2645s
FeatureBagging ROC:0.7799, Precision @ rank n:0.5,Execution Time:0.4847s
Histogram Based Outlier Detection ROC:0.8511, Precision @ rank n:0.5714,Execution Time:1.2055s
Isolation Forest ROC:0.8527, Precision @ rank n:0.5714,Execution Time:0.2962s
K-Nearest Neighbour ROC:0.782, Precision @ rank n:0.5,Execution Time:0.0708s
Local Outleir factor ROC:0.7787, Precision @ rank n:0.4643,Execution Time:0.0599s




Minimum Covariance Determinant ROC:0.8228, Precision @ rank n:0.4286,Execution Time:0.4248s
One-Class SVM ROC:0.7986, Precision @ rank n:0.5,Execution Time:0.0489s
Principle Component Analysis ROC:0.7997, Precision @ rank n:0.5,Execution Time:0.0599s
/n  Processing cardio.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.5763, Precision @ rank n:0.1875,Execution Time:0.3092s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.8221, Precision @ rank n:0.4844,Execution Time:0.1097s
FeatureBagging ROC:0.4879, Precision @ rank n:0.1406,Execution Time:0.6243s
Histogram Based Outlier Detection ROC:0.8453, Precision @ rank n:0.4688,Execution Time:0.006s
Isolation Forest ROC:0.9414, Precision @ rank n:0.5,Execution Time:0.2912s
K-Nearest Neighbour ROC:0.6959, Precision @ rank n:0.2812,Execution Time:0.1147s
Local Outleir factor ROC:0.4715, Precision @ rank n:0.125,Execution Time:0.0768s




Minimum Covariance Determinant ROC:0.8778, Precision @ rank n:0.3906,Execution Time:0.4308s
One-Class SVM ROC:0.9507, Precision @ rank n:0.5938,Execution Time:0.0569s
Principle Component Analysis ROC:0.9638, Precision @ rank n:0.6875,Execution Time:0.003s
/n  Processing glass.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.7104, Precision @ rank n:0.25,Execution Time:0.0269s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.8506, Precision @ rank n:0.25,Execution Time:0.0319s
FeatureBagging ROC:0.7043, Precision @ rank n:0.25,Execution Time:0.0269s
Histogram Based Outlier Detection ROC:0.6524, Precision @ rank n:0.0,Execution Time:0.003s
Isolation Forest ROC:0.7195, Precision @ rank n:0.25,Execution Time:0.2384s
K-Nearest Neighbour ROC:0.7805, Precision @ rank n:0.25,Execution Time:0.007s
Local Outleir factor ROC:0.7774, Precision @ rank n:0.25,Execution Time:0.003s
Minimum Covariance Determinant ROC:0.7165, Precision @ rank n:0.0,Execution Time:0.025s
One-Class SVM ROC:0.6189, Pre



Angle Based Outlier Detection(ABOD) ROC:0.7813, Precision @ rank n:0.3562,Execution Time:5.8135s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.8447, Precision @ rank n:0.4007,Execution Time:0.8228s
FeatureBagging ROC:0.7259, Precision @ rank n:0.3664,Execution Time:40.6938s
Histogram Based Outlier Detection ROC:0.5675, Precision @ rank n:0.1199,Execution Time:0.0389s
Isolation Forest ROC:0.7801, Precision @ rank n:0.2979,Execution Time:1.3932s
K-Nearest Neighbour ROC:0.8409, Precision @ rank n:0.4144,Execution Time:5.4006s
Local Outleir factor ROC:0.7085, Precision @ rank n:0.339,Execution Time:4.7792s




Minimum Covariance Determinant ROC:0.863, Precision @ rank n:0.3973,Execution Time:1.7413s
One-Class SVM ROC:0.8417, Precision @ rank n:0.3801,Execution Time:3.752s
Principle Component Analysis ROC:0.8396, Precision @ rank n:0.3767,Execution Time:0.1107s
/n  Processing musk.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.0809, Precision @ rank n:0.0333,Execution Time:1.7603s
Cluster-Based Local Outlier Factor(CBLOF) ROC:1.0, Precision @ rank n:1.0,Execution Time:0.2912s
FeatureBagging ROC:0.5228, Precision @ rank n:0.1667,Execution Time:10.2725s
Histogram Based Outlier Detection ROC:0.9999, Precision @ rank n:0.9667,Execution Time:0.0439s
Isolation Forest ROC:0.9996, Precision @ rank n:0.9333,Execution Time:0.7789s
K-Nearest Neighbour ROC:0.7348, Precision @ rank n:0.2333,Execution Time:1.3294s
Local Outleir factor ROC:0.5323, Precision @ rank n:0.1333,Execution Time:1.2766s
Minimum Covariance Determinant ROC:1.0, Precision @ rank n:0.9667,Execution Time:6.6681s
One-Class SVM ROC



Minimum Covariance Determinant ROC:0.3486, Precision @ rank n:0.0,Execution Time:0.6745s
One-Class SVM ROC:0.4972, Precision @ rank n:0.0,Execution Time:1.1051s
Principle Component Analysis ROC:0.504, Precision @ rank n:0.0,Execution Time:0.0299s
/n  Processing pendigits.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.7008, Precision @ rank n:0.0308,Execution Time:1.0732s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.9609, Precision @ rank n:0.3077,Execution Time:0.2234s
FeatureBagging ROC:0.4687, Precision @ rank n:0.0462,Execution Time:3.6403s
Histogram Based Outlier Detection ROC:0.9294, Precision @ rank n:0.2615,Execution Time:0.007s
Isolation Forest ROC:0.9422, Precision @ rank n:0.2769,Execution Time:0.4987s
K-Nearest Neighbour ROC:0.7602, Precision @ rank n:0.0462,Execution Time:0.4698s
Local Outleir factor ROC:0.481, Precision @ rank n:0.0462,Execution Time:0.4418s
Minimum Covariance Determinant ROC:0.8271, Precision @ rank n:0.0615,Execution Time:1.482s
One-Class SVM R



Minimum Covariance Determinant ROC:0.9903, Precision @ rank n:0.7534,Execution Time:8.0943s
One-Class SVM ROC:0.9922, Precision @ rank n:0.9553,Execution Time:34.428s
Principle Component Analysis ROC:0.9902, Precision @ rank n:0.9503,Execution Time:0.023s
/n  Processing vertebral.mat . . ./n
Angle Based Outlier Detection(ABOD) ROC:0.2797, Precision @ rank n:0.0,Execution Time:0.0479s
Cluster-Based Local Outlier Factor(CBLOF) ROC:0.3908, Precision @ rank n:0.0,Execution Time:0.0429s
FeatureBagging ROC:0.3027, Precision @ rank n:0.0,Execution Time:0.0289s
Histogram Based Outlier Detection ROC:0.2695, Precision @ rank n:0.0,Execution Time:0.001s
Isolation Forest ROC:0.3576, Precision @ rank n:0.0,Execution Time:0.2374s
K-Nearest Neighbour ROC:0.318, Precision @ rank n:0.0,Execution Time:0.007s
Local Outleir factor ROC:0.318, Precision @ rank n:0.0,Execution Time:0.002s
Minimum Covariance Determinant ROC:0.3308, Precision @ rank n:0.0,Execution Time:0.0299s
One-Class SVM ROC:0.4087, Precis

In [21]:
roc_df

Unnamed: 0,Data,#Samples,#Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332
