# IMPORT PACKAGES

In [1]:
import os
import sys
from time import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# METRICS PACKAGES

In [2]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# PYOD PACKAGES

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



#  Data File List

In [4]:
mat_file_list = ['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat','lympho.mat','mnist.mat','musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat','vertebral.mat','vowels.mat','wbc.mat']


# Load matFile and its features

In [5]:
data = loadmat('arrhythmia.mat')
type(data['X']), data['X'].shape
type(data['y']), data['y'].shape

(numpy.ndarray, (452, 1))

# NINE OUTLIERS DETECTION TOOLS TO BE COMPARED

In [18]:
df_columns = ['Data', '#Samples','# Dimensions', 'Outlier %', 'ABOD', 'CBLOF', 'FB', 'HBOS', 'IF', 'KNN', 'LOF', 'MCD', 'OCSVM', 'PCA']
roc_df = pd.DataFrame(columns = df_columns)
prn_df = pd.DataFrame(columns = df_columns)
time_df = pd.DataFrame(columns = df_columns)


# Methods for Anomaly Detection

In [19]:
random_state = np.random.RandomState(42)
for mat_file in mat_file_list:
    print("\n...Processing",mat_file,'.....')
    mat=loadmat(mat_file)
    X=mat['X']  # key 
    y=mat['y'].ravel() # converting 2d values into 1d
    
    outliers_fraction=np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
    # Construct Container For Saving Result
    
    roc_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    prn_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    time_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    
# model training and testing (60:40)
    X_train, X_test, y_train, y_test= train_test_split(X, y , test_size=0.4, random_state=random_state)
    
# Standardizing data for processing
    X_train_norm, X_test_norm= standardizer(X_train,X_test)
    
#calling method for anomaly
    classifiers={'Angle-based Outlier Detector': ABOD(contamination=outliers_fraction),
               'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                'FeatureBagging':FeatureBagging(contamination=outliers_fraction,random_state=random_state),
               'Histogram-base Outlier Detection': HBOS(contamination=outliers_fraction),
               'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
               'K Nearest Neighbour': KNN(contamination=outliers_fraction),
               'Local Outlier Factor': LOF(contamination=outliers_fraction),
                'One-Class SVM':OCSVM(contamination=outliers_fraction),
               'Minimum Covariance Determinant':MCD(contamination=outliers_fraction,random_state=random_state),
               'Principal Component Analysis': PCA(contamination=outliers_fraction,random_state=random_state)}

    for cl_name,cl in classifiers.items():
        t0=time()
        cl.fit(X_train_norm)
        test_scores=cl.decision_function(X_test_norm)
        t1=time()
        duration=round((t1-t0), ndigits=4)
        time_list.append(duration)
    
        roc=round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn=round(precision_n_scores(y_test,test_scores), ndigits=4)
    
        print('{cl_name} ROC:{roc}, precision @ rank n:{prn},' 'execution time: {duration}'.format(cl_name=cl_name, roc=roc, prn=prn,
                                                                                          duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)
    
        temp_df=pd.DataFrame(time_list).transpose()
        temp_df.columns= df_columns
        time_df=pd.concat([time_df,temp_df],axis=0)
    
        temp_df=pd.DataFrame(roc_list).transpose()
        temp_df.columns= df_columns
        roc_df=pd.concat([roc_df,temp_df],axis=0)
    
        temp_df=pd.DataFrame(prn_list).transpose()
        temp_df.columns= df_columns
        prn_df=pd.concat([prn_df,temp_df],axis=0)


...Processing arrhythmia.mat .....
Angle-based Outlier Detector ROC:0.7687, precision @ rank n:0.3571,execution time: 0.3531


ValueError: Length mismatch: Expected axis has 5 elements, new values have 14 elements

In [17]:
roc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,FB,HBOS,IF,KNN,LOF,MCD,OCSVM,Outlier %,PCA,Samples
0,arrhythmia,452,274,14.6018,0.7687,,,,,,...,,,,,,,,,,
0,arrhythmia,452,274,14.6018,0.7687,0.7684,,,,,...,,,,,,,,,,
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,,,,...,,,,,,,,,,
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,,,...,,,,,,,,,,
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,wbc,378,30,5.5556,0.9232,0.9063,0.9415,0.9592,0.9451,0.9437,...,,,,,,,,,,
0,wbc,378,30,5.5556,0.9232,0.9063,0.9415,0.9592,0.9451,0.9437,...,,,,,,,,,,
0,wbc,378,30,5.5556,0.9232,0.9063,0.9415,0.9592,0.9451,0.9437,...,,,,,,,,,,
0,wbc,378,30,5.5556,0.9232,0.9063,0.9415,0.9592,0.9451,0.9437,...,,,,,,,,,,
