# Anomaly Detection Project

## import python packages

In [26]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat



## Import PYOD packages & Methods

In [27]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging


## Import Metrics Packages

In [28]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

## Define Data File And Read X and Y

In [29]:
mat_file_list =["arrhythmia.mat","cardio.mat","glass.mat","ionosphere.mat",
                "letter.mat","lympho.mat","mnist.mat", "musk.mat","optdigits.mat",
                "pendigits.mat","pima.mat","satellite.mat","satimage-2.mat","shuttle.mat",
                "vertebral.mat",
                "vowels.mat","wbc.mat"]

In [30]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [43]:
data=loadmat("Anamoly_detec_data/cardio.mat")

In [44]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [45]:
len(data)

5

In [46]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [47]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

## Input (independent) feature shape in Mat file 

In [48]:
data["X"].shape ,  type(data["X"])

((1831, 21), numpy.ndarray)

## Ouput (dependent) feature shape in Mat file

In [49]:
data["y"].shape , type(data["y"])

((1831, 1), numpy.ndarray)

## Define The Outliers Tools To Be Compared

In [50]:
df_columns = ['Data','#Samples','# Dimensions','Outlier Perc',
              'ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF',
              'MCD','OCSVM','PCA']

## ROC Performance Evaluation Table

In [51]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


## precision_n_scores Performance Evaluation Table

In [52]:
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


## Time Dataframe

In [53]:
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


## Exploring All MAt files

In [54]:
from time import time
random_state=np.random.RandomState(42
                                  )
for mat_file in mat_file_list:
    print("\n ... Processing ",mat_file ,"....")
    mat=loadmat(os.path.join("Anamoly_detec_data",mat_file))
    
    X=mat["X"]
    y=mat["y"].ravel()
    
    outliers_fraction=np.count_nonzero(y) /len(y)
    outliers_percentage=round(outlier_fraction*100 , ndigits=4)
     
        # construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.4,
                                                        random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor': CBLOF(
            contamination=outliers_fraction, check_estimator=False,
            random_state=random_state),
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=outliers_fraction),
        'Isolation Forest': IForest(contamination=outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(
            clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


 ... Processing  arrhythmia.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 0.2191s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 0.1623s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 0.8549s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 0.0999s
Isolation Forest ROC:0.8527, precision @ rank n:0.5714, execution time: 0.5869s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.1199s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.1111s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 0.9279s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.07s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.08s

 ... Processing  cardio.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 0.55s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.2538s
Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 1.3231s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.01s
Isolation Forest ROC:0.9414, precision @ rank n:0.5, execution time: 0.5334s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.1999s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.1394s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 0.7127s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.1051s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.005s

 ... Processing  glass.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.0501s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.07s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.0486s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.005s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 0.4276s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.015s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.005s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank n



Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 12.0404s
Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 2.1705s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 75.9185s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.07s
Isolation Forest ROC:0.7801, precision @ rank n:0.2979, execution time: 2.7162s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 11.2011s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 9.6554s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 3.9802s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 6.671s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.2368s

 ... Processing  musk.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 3.3626s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.5123s
Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 19.0397s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.075s
Isolation Forest ROC:0.9996, precision @ rank n:0.9333, execution time: 1.586s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 2.8119s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 2.3473s
Minimum Covariance Determinant (MCD) ROC:1.0, precision 



Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 1.6585s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 2.1122s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.07s

 ... Processing  pendigits.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 1.9963s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.3887s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 6.7032s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.02s
Isolation Forest ROC:0.9422, precision @ rank n:0.2769, execution time: 0.9838s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 0.8202s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 0.7766s
Minimum Covariance Determinant (MCD) ROC:0.8271, precisio





Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 16.1666s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 69.9143s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.04s

 ... Processing  vertebral.mat ....
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.0785s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.0594s
Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.05s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.0s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 0.3796s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.01s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.005s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank n:0.0, 

In [55]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,14.6018,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,14.6018,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,14.6018,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,14.6018,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,14.6018,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,14.6018,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,14.6018,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,14.6018,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,14.6018,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [56]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,14.6018,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,14.6018,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,14.6018,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,14.6018,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,14.6018,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,14.6018,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,14.6018,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,14.6018,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,14.6018,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385


In [57]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.2191,0.1623,0.8549,0.0999,0.5869,0.1199,0.1111,0.9279,0.07,0.08
0,cardio,1831,21,14.6018,0.55,0.2538,1.3231,0.01,0.5334,0.1999,0.1394,0.7127,0.1051,0.005
0,glass,214,9,14.6018,0.0501,0.07,0.0486,0.005,0.4276,0.015,0.005,0.045,0.0,0.005
0,ionosphere,351,33,14.6018,0.0999,0.0963,0.0949,0.01,0.411,0.02,0.01,0.095,0.01,0.0
0,letter,1600,32,14.6018,0.4958,0.1699,1.2627,0.01,0.5991,0.2675,0.1199,1.7089,0.1189,0.01
0,lympho,148,18,14.6018,0.035,0.0912,0.0539,0.009,0.4527,0.009,0.005,0.0568,0.002,0.003
0,mnist,7603,100,14.6018,12.0404,2.1705,75.9185,0.07,2.7162,11.2011,9.6554,3.9802,6.671,0.2368
0,musk,3062,166,14.6018,3.3626,0.5123,19.0397,0.075,1.586,2.8119,2.3473,15.1208,1.6823,0.2675
0,optdigits,5216,64,14.6018,3.3807,0.6996,20.3772,0.0489,1.256,2.7014,2.2063,1.6585,2.1122,0.07
0,pendigits,6870,16,14.6018,1.9963,0.3887,6.7032,0.02,0.9838,0.8202,0.7766,3.022,1.2071,0.01


In [58]:
# Define the number of inliers and outliers
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

In [59]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 357
Number of outliers: 21
Ground truth shape is (378,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]


In [60]:

# Show all detectors

for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Angle-based Outlier Detector (ABOD)
Model 2 Cluster-based Local Outlier Factor
Model 3 Feature Bagging
Model 4 Histogram-base Outlier Detection (HBOS)
Model 5 Isolation Forest
Model 6 K Nearest Neighbors (KNN)
Model 7 Local Outlier Factor (LOF)
Model 8 Minimum Covariance Determinant (MCD)
Model 9 One-class SVM (OCSVM)
Model 10 Principal Component Analysis (PCA)
