#                                ANOMALY DETECTION 

# IMPORT PACKAGES

In [1]:
import os
import sys
from time import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# METRICS PACKAGES

In [2]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# PYOD PACKAGES

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



#  Data File List

In [4]:
# Benchmark datasets (.mat files); the benchmark loop iterates them in this order.
mat_file_list = [
    'arrhythmia.mat',
    'cardio.mat',
    'glass.mat',
    'ionosphere.mat',
    'letter.mat',
    'lympho.mat',
    'mnist.mat',
    'musk.mat',
    'optdigits.mat',
    'pendigits.mat',
    'pima.mat',
    'satellite.mat',
    'satimage-2.mat',
    'shuttle.mat',
    'vertebral.mat',
    'vowels.mat',
    'wbc.mat',
]


# Load matFile and its features

In [5]:
# Load a single dataset and inspect the container type and shape of its
# 'X' and 'y' entries. In a notebook cell only the value of the final
# expression (the 'y' summary) is displayed.
data = loadmat('arrhythmia.mat')
features = data['X']
labels = data['y']
type(features), features.shape
type(labels), labels.shape

(numpy.ndarray, (452, 1))

# TEN OUTLIER DETECTION TOOLS TO BE COMPARED

In [18]:
# Column layout shared by the three result tables: four dataset descriptors
# followed by one column per detector.
df_columns = [
    'Data', '#Samples', '# Dimensions', 'Outlier %',
    'ABOD', 'CBLOF', 'FB', 'HBOS', 'IF', 'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA',
]

# Three independent, initially empty tables (ROC, precision @ rank n, time).
roc_df, prn_df, time_df = (
    pd.DataFrame(columns=df_columns) for _ in range(3)
)


# Methods for Anomaly Detection

In [None]:
# Benchmark every detector on every dataset in mat_file_list and append one
# row per dataset to roc_df, prn_df and time_df.
#
# Relies on names defined in earlier cells: mat_file_list, df_columns and the
# three result tables roc_df / prn_df / time_df.

# One shared generator is passed to the split and to the detectors that
# accept a random_state, so its state carries over from one use to the next.
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n...Processing",mat_file,'.....')
    mat = loadmat(mat_file)
    X = mat['X']
    y = mat['y'].ravel()  # flatten the label array to 1-D

    # Non-zero labels are counted as outliers.
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Every result row starts with the same four dataset descriptors
    # (file name without the 4-character '.mat' suffix, sample count,
    # feature count, outlier percentage); detector values are appended below.
    row_header = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    roc_list = list(row_header)
    prn_list = list(row_header)
    time_list = list(row_header)

    # Model training and testing (60:40 split).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=random_state)

    # Standardize both splits before fitting.
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Detectors under comparison. The insertion order MUST match the order of
    # the detector columns in df_columns (ABOD ... MCD, OCSVM, PCA), because
    # results are appended positionally to the row lists.
    classifiers = {
        'Angle-based Outlier Detector': ABOD(
            contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'FeatureBagging': FeatureBagging(
            contamination=outliers_fraction, random_state=random_state),
        'Histogram-base Outlier Detection': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(
            contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbour': KNN(
            contamination=outliers_fraction),
        'Local Outlier Factor': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-Class SVM': OCSVM(
            contamination=outliers_fraction),
        'Principal Component Analysis': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for cl_name, cl in classifiers.items():
        # The timed section covers fitting on the training split plus
        # scoring of the test split.
        t0 = time()
        cl.fit(X_train_norm)
        test_scores = cl.decision_function(X_test_norm)
        t1 = time()
        duration = round((t1 - t0), ndigits=4)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{cl_name} ROC:{roc}, precision @ rank n:{prn},' 'execution time: {duration}'.format(
            cl_name=cl_name, roc=roc, prn=prn, duration=duration))

        time_list.append(duration)
        roc_list.append(roc)
        prn_list.append(prn)

    # Build the rows only after ALL detectors have run, so each dataset
    # contributes exactly one complete row per table. Passing
    # columns=df_columns labels the values and raises if the row length does
    # not match the column list.
    time_df = pd.concat(
        [time_df, pd.DataFrame([time_list], columns=df_columns)],
        axis=0, ignore_index=True)
    roc_df = pd.concat(
        [roc_df, pd.DataFrame([roc_list], columns=df_columns)],
        axis=0, ignore_index=True)
    prn_df = pd.concat(
        [prn_df, pd.DataFrame([prn_list], columns=df_columns)],
        axis=0, ignore_index=True)


...Processing arrhythmia.mat .....
Angle-based Outlier Detector ROC:0.7687, precision @ rank n:0.3571,execution time: 0.401




Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643,execution time: 0.3616




FeatureBagging ROC:0.7799, precision @ rank n:0.5,execution time: 1.4622




Histogram-base Outlier Detection ROC:0.8511, precision @ rank n:0.5714,execution time: 0.1795




Isolation Forest ROC:0.8527, precision @ rank n:0.5714,execution time: 1.1159




K Nearest Neighbour ROC:0.782, precision @ rank n:0.5,execution time: 0.2474




Local Outlier Factor ROC:0.7787, precision @ rank n:0.4643,execution time: 0.2155
One-Class SVM ROC:0.7986, precision @ rank n:0.5,execution time: 0.1173




Minimum Covariance Determinant ROC:0.8228, precision @ rank n:0.4286,execution time: 1.4713
Principal Component Analysis ROC:0.7997, precision @ rank n:0.5,execution time: 0.1556





...Processing cardio.mat .....




Angle-based Outlier Detector ROC:0.5763, precision @ rank n:0.1875,execution time: 1.0953




Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844,execution time: 0.76




FeatureBagging ROC:0.4879, precision @ rank n:0.1406,execution time: 1.9689
Histogram-base Outlier Detection ROC:0.8453, precision @ rank n:0.4688,execution time: 0.018




Isolation Forest ROC:0.9414, precision @ rank n:0.5,execution time: 1.1277




K Nearest Neighbour ROC:0.6959, precision @ rank n:0.2812,execution time: 0.3885




Local Outlier Factor ROC:0.4715, precision @ rank n:0.125,execution time: 0.2662
One-Class SVM ROC:0.9507, precision @ rank n:0.5938,execution time: 0.1805




Minimum Covariance Determinant ROC:0.8778, precision @ rank n:0.3906,execution time: 1.4248
Principal Component Analysis ROC:0.9638, precision @ rank n:0.6875,execution time: 0.012

...Processing glass.mat .....




Angle-based Outlier Detector ROC:0.7104, precision @ rank n:0.25,execution time: 0.1307
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25,execution time: 0.1068




FeatureBagging ROC:0.7043, precision @ rank n:0.25,execution time: 0.0968
Histogram-base Outlier Detection ROC:0.6524, precision @ rank n:0.0,execution time: 0.007




Isolation Forest ROC:0.7195, precision @ rank n:0.25,execution time: 0.7314
K Nearest Neighbour ROC:0.7805, precision @ rank n:0.25,execution time: 0.0229
Local Outlier Factor ROC:0.7774, precision @ rank n:0.25,execution time: 0.007
One-Class SVM ROC:0.6189, precision @ rank n:0.25,execution time: 0.004




Minimum Covariance Determinant ROC:0.7165, precision @ rank n:0.0,execution time: 0.0947
Principal Component Analysis ROC:0.622, precision @ rank n:0.25,execution time: 0.0049

...Processing ionosphere.mat .....




Angle-based Outlier Detector ROC:0.9004, precision @ rank n:0.8214,execution time: 0.2124
Cluster-based Local Outlier Factor ROC:0.8952, precision @ rank n:0.8036,execution time: 0.1217




FeatureBagging ROC:0.8933, precision @ rank n:0.75,execution time: 0.2164
Histogram-base Outlier Detection ROC:0.5195, precision @ rank n:0.3393,execution time: 0.0209




Isolation Forest ROC:0.8309, precision @ rank n:0.6607,execution time: 0.8044
K Nearest Neighbour ROC:0.9134, precision @ rank n:0.8393,execution time: 0.0429
Local Outlier Factor ROC:0.8989, precision @ rank n:0.75,execution time: 0.0169
One-Class SVM ROC:0.8372, precision @ rank n:0.7143,execution time: 0.012




Minimum Covariance Determinant ROC:0.9399, precision @ rank n:0.8571,execution time: 0.1865
Principal Component Analysis ROC:0.7971, precision @ rank n:0.5893,execution time: 0.008

...Processing letter.mat .....




Angle-based Outlier Detector ROC:0.8465, precision @ rank n:0.275,execution time: 0.9963




Cluster-based Local Outlier Factor ROC:0.7423, precision @ rank n:0.175,execution time: 0.376




FeatureBagging ROC:0.866, precision @ rank n:0.4,execution time: 1.9437
Histogram-base Outlier Detection ROC:0.5728, precision @ rank n:0.125,execution time: 0.0409




Isolation Forest ROC:0.5778, precision @ rank n:0.05,execution time: 1.0305




K Nearest Neighbour ROC:0.845, precision @ rank n:0.3,execution time: 0.3701




Local Outlier Factor ROC:0.8409, precision @ rank n:0.325,execution time: 0.2296




One-Class SVM ROC:0.5744, precision @ rank n:0.1,execution time: 0.1775




Minimum Covariance Determinant ROC:0.7499, precision @ rank n:0.075,execution time: 2.7689
Principal Component Analysis ROC:0.48, precision @ rank n:0.05,execution time: 0.013

...Processing lympho.mat .....
Angle-based Outlier Detector ROC:0.9382, precision @ rank n:0.4,execution time: 0.0908




Cluster-based Local Outlier Factor ROC:0.9709, precision @ rank n:0.6,execution time: 0.1127




FeatureBagging ROC:0.9673, precision @ rank n:0.6,execution time: 0.0818
Histogram-base Outlier Detection ROC:0.9964, precision @ rank n:0.8,execution time: 0.012




Isolation Forest ROC:0.9855, precision @ rank n:0.6,execution time: 0.8051
K Nearest Neighbour ROC:0.9636, precision @ rank n:0.6,execution time: 0.0169
Local Outlier Factor ROC:0.9636, precision @ rank n:0.6,execution time: 0.006
One-Class SVM ROC:0.9636, precision @ rank n:0.6,execution time: 0.003




Minimum Covariance Determinant ROC:0.9164, precision @ rank n:0.6,execution time: 0.09
Principal Component Analysis ROC:0.9818, precision @ rank n:0.8,execution time: 0.0049

...Processing mnist.mat .....
Angle-based Outlier Detector ROC:0.7813, precision @ rank n:0.3562,execution time: 19.5024




Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007,execution time: 2.5949




FeatureBagging ROC:0.7259, precision @ rank n:0.3664,execution time: 122.0182
Histogram-base Outlier Detection ROC:0.5675, precision @ rank n:0.1199,execution time: 0.1326




Isolation Forest ROC:0.7801, precision @ rank n:0.2979,execution time: 4.9292




K Nearest Neighbour ROC:0.8409, precision @ rank n:0.4144,execution time: 15.9507




Local Outlier Factor ROC:0.7085, precision @ rank n:0.339,execution time: 15.3541




One-Class SVM ROC:0.8417, precision @ rank n:0.3801,execution time: 11.3191




Minimum Covariance Determinant ROC:0.863, precision @ rank n:0.3973,execution time: 6.9126




Principal Component Analysis ROC:0.8396, precision @ rank n:0.3767,execution time: 0.377

...Processing musk.mat .....




Angle-based Outlier Detector ROC:0.0809, precision @ rank n:0.0333,execution time: 5.6725




Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0,execution time: 0.9764




FeatureBagging ROC:0.5228, precision @ rank n:0.1667,execution time: 34.5229
Histogram-base Outlier Detection ROC:0.9999, precision @ rank n:0.9667,execution time: 0.1507


