__Anomaly Detection Project - PyOD__ 

# Import Libraries

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Import Pyod and the methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

from warnings import filterwarnings
filterwarnings('ignore')


# Import Metrics Package

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [4]:
mat_file_list=['arrhythmia.mat',
               'cardio.mat',
               'glass.mat',
               'ionosphere.mat',
               'letter.mat',
               'lympho.mat',
               'mnist.mat',
               'musk.mat',
               'optdigits.mat',
               'pendigits.mat',
               'pima.mat',
               'satellite.mat',
               'satimage-2.mat',
               'shuttle.mat',
               'vertebral.mat',
               'vowels.mat',
               'wbc.mat']


In [5]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [6]:
len(mat_file_list)

17

__Note:__

- Files with a . mat extension contain MATLAB formatted data, 

- and data can be loaded from or written to these files using the functions load and save , respectively.

- Python data can be saved to a MAT-file, with the function savemat . 

- Data has to be structured in the same way as for loadmat , 

- i.e. it should be composed of simple data types, like dict , list , str , int , and float . 

- The parameter data shall be a dict with the variables.



# Loading mat file

In [7]:
from scipy.io import loadmat

In [8]:
data=loadmat('data/cardio.mat')

In [9]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [10]:
len(data)

5

In [11]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [12]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

Input (Dependent) Feature Shape in Mat File Format

In [13]:
type(data['X']),data['X'].shape

(numpy.ndarray, (1831, 21))

Dependent / Target / Output Feature Shape

In [14]:
type(data['y']), data['y'].shape

(numpy.ndarray, (1831, 1))

In [15]:
df_columns=['Data', '#Sample', '#Dimensions', 'Outlier Perc',
            'ABOD', 'CBLOF', 'HBOS', 'IFOREST',  'KNN', 
            'LOF',  'MCD', 'OCSVM',  'PCA', 'FEATUREBAGGING']

In [16]:
df_columns

['Data',
 '#Sample',
 '#Dimensions',
 'Outlier Perc',
 'ABOD',
 'CBLOF',
 'HBOS',
 'IFOREST',
 'KNN',
 'LOF',
 'MCD',
 'OCSVM',
 'PCA',
 'FEATUREBAGGING']

# Precision, Time and Roc evolution tables creation

__Creating empty dataframes:__

- ROC Dataframe to record all Roc values performed on each dataset

- Precison Dataframe to record all Precison values performed on each dataset

- Execution Time Dataframe to record the time taken to perform algorithm on each dataset, So as to find the algorithm which takes minimum amount of time and gives best accuracy

## ROC Performance Evaluation Table

ROC Dataframe

In [17]:
roc_df=pd.DataFrame(columns=df_columns)


In [18]:
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING


## Precision_n_scores - Performance Eveluation Table

Precision  Dataframe

In [19]:
prn_df=pd.DataFrame(columns=df_columns)


In [20]:
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING


## Time Evaluation Table

Time dataFrame

In [21]:
time_df=pd.DataFrame(columns=df_columns)


In [22]:
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING


_____________

In [23]:
data_1 = loadmat("data/vowels.mat") 
data_1

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-26 08:42:13 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.58046914, -0.90253404,  0.61789919, ...,  1.60463715,
         -0.6230598 , -0.38312549],
        [ 0.78437493, -1.07736635,  0.6157809 , ...,  1.26023551,
         -0.42333934, -0.2877912 ],
        [ 0.79129238, -1.08624216,  0.66977272, ...,  1.08179729,
         -0.26720104, -0.17220348],
        ...,
        [ 0.9470763 ,  0.35810832,  0.27472497, ..., -1.08832841,
          0.3271257 ,  1.69283401],
        [ 1.58485142,  0.69359118, -0.37568588, ..., -3.07682047,
         -0.24109405,  1.94433536],
        [ 2.32735022,  0.38281412,  0.77590669, ..., -0.48257003,
         -0.59043614, -0.72199018]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [24]:
data_2 = loadmat('data/letter.mat')
data_2

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Tue Jan 26 15:35:22 2016',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 6, 10,  5, ..., 10,  2,  7],
        [ 0,  6,  0, ...,  8,  1,  7],
        [ 4,  7,  5, ...,  8,  2,  8],
        ...,
        [ 4,  9,  4, ...,  8,  3,  8],
        [ 6, 10,  6, ...,  9,  8,  8],
        [ 3,  1,  3, ...,  9,  1,  7]], dtype=uint8),
 'y': array([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [1]], dtype=uint8)}

__Findings__

- .mat files are in form of Dictionary

- In the file, header , version , globals are predefined classes

- X and y are the variables to be used, and they are predefined.

- X and y are 2D - numpy arrays

_________________

# Exploring All Mat files

In [25]:
# Importing time module-
from time import time

# Creating Random State-
random_state = np.random.RandomState(42)

# Processing .mat files One by One-
for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('data', mat_file))

    
    # Defining 'X' & 'y'
    X = mat['X']
    y = mat['y'].ravel()  # ravel() is used to convert 2D to 1D array
    
    # Counting Outliers fraction - 
    # (Counting the number of non-zero values in the array y with np.count_nonzero() 
    # and divide by length of y to get outliers fraction)
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    # Calculation of Outliers Percentage
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Application of all the Algorithms and Storing their results in dictionery format:
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                   
        'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                    random_state=random_state),
       
        'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
        
        'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
       
        'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                                    
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
            
        'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
            
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   
        'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
           
    }

     # Calculating the Time taken for each algorithm
    for clf_name, clf in classifiers.items():
        
        # Initialize the start time
        t0 = time()  
        
        # Fit the train data
        clf.fit(X_train_norm)
        
        # prediction of test scores using X_test
        test_scores = clf.decision_function(X_test_norm)
        
        # Final time taken
        t1 = time()
        
        # calculating Total duration taken with round() and t1 - t0 [ i.e. __final time - start time__] 
        duration = round(t1 - t0, ndigits=4)
        
        # Append the duration in time list with .append()
        time_list.append(duration)

        # Calculation of roc value and precision value of the algorithm
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        
        # Printing ROC, Precision and Execution time
        print('{clf_name} ROC:{roc}, '
              'precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        # Append roc value to roc_list with .append()
        roc_list.append(roc)
        
        # Append precision value to prn_list with .append()
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 2.1541s
Cluster-based Local Outlier Factor ROC:0.7789, precision @ rank n:0.4643, execution time: 2.2581s
Feature Bagging ROC:0.7796, precision @ rank n:0.4643, execution time: 0.783s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 2.6021s
Isolation Forest ROC:0.8637, precision @ rank n:0.6071, execution time: 0.851s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.115s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.098s
Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.5151s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.057s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.095s

... Processing cardio.mat ...
Angle-based Outlier Detector 

Angle-based Outlier Detector (ABOD) ROC:0.667, precision @ rank n:0.0526, execution time: 3.1732s
Cluster-based Local Outlier Factor ROC:0.8082, precision @ rank n:0.1579, execution time: 0.4s
Feature Bagging ROC:0.4889, precision @ rank n:0.0526, execution time: 5.9013s
Histogram-base Outlier Detection (HBOS) ROC:0.9348, precision @ rank n:0.2632, execution time: 0.016s
Isolation Forest ROC:0.939, precision @ rank n:0.3333, execution time: 1.0141s
K Nearest Neighbors (KNN) ROC:0.7371, precision @ rank n:0.0702, execution time: 1.0391s
Local Outlier Factor (LOF) ROC:0.4965, precision @ rank n:0.0702, execution time: 0.9371s
Minimum Covariance Determinant (MCD) ROC:0.8204, precision @ rank n:0.0877, execution time: 3.0842s
One-class SVM (OCSVM) ROC:0.9235, precision @ rank n:0.3158, execution time: 1.7071s
Principal Component Analysis (PCA) ROC:0.9309, precision @ rank n:0.3158, execution time: 0.015s

... Processing pima.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7163, precision


# ROC Dataframe

In [26]:
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.7687,0.7789,0.7796,0.8511,0.8637,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5892,0.8845,0.6385,0.8373,0.951,0.734,0.588,0.8524,0.9478,0.9616
0,glass,214,9,4.2056,0.6951,0.811,0.7073,0.7073,0.7134,0.8384,0.7043,0.8293,0.6585,0.686
0,ionosphere,351,33,35.8974,0.9181,0.9176,0.9303,0.6052,0.8516,0.932,0.9227,0.9669,0.8257,0.7941
0,letter,1600,32,6.25,0.8783,0.7783,0.8947,0.6063,0.6279,0.8573,0.8765,0.8061,0.5927,0.5216
0,lympho,148,18,4.0541,0.9831,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.7628,0.8389,0.7157,0.5766,0.7915,0.8498,0.7195,0.8713,0.854,0.8534
0,musk,3062,166,3.1679,0.2161,1.0,0.473,0.9999,1.0,0.8009,0.4629,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4894,0.7901,0.5062,0.8774,0.686,0.406,0.5277,0.3822,0.5171,0.526
0,pendigits,6870,16,2.2707,0.667,0.8082,0.4889,0.9348,0.939,0.7371,0.4965,0.8204,0.9235,0.9309


# Precision Dataframe

In [27]:
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.4643,0.5714,0.6071,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1918,0.4932,0.1781,0.4521,0.6027,0.3562,0.1507,0.411,0.5342,0.6849
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8431,0.8039,0.8039,0.3922,0.6078,0.8824,0.7843,0.8627,0.6863,0.5686
0,letter,1600,32,6.25,0.4375,0.1875,0.4062,0.0938,0.0625,0.3125,0.3438,0.1875,0.125,0.125
0,lympho,148,18,4.0541,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,mnist,7603,100,9.2069,0.3367,0.3912,0.3741,0.1361,0.2687,0.432,0.3673,0.2653,0.3946,0.3878
0,musk,3062,166,3.1679,0.1,1.0,0.125,0.975,1.0,0.175,0.125,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.0152,0.0,0.0303,0.2121,0.0303,0.0,0.0303,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0526,0.1579,0.0526,0.2632,0.3333,0.0702,0.0702,0.0877,0.3158,0.3158


# Time Dataframe

In [28]:
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,ABOD,CBLOF,HBOS,IFOREST,KNN,LOF,MCD,OCSVM,PCA,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,2.1541,2.2581,0.783,2.6021,0.851,0.115,0.098,1.5151,0.057,0.095
0,cardio,1831,21,9.6122,0.8771,0.21,1.3801,0.011,0.56,0.286,0.154,0.8821,0.133,0.006
0,glass,214,9,4.2056,0.138,0.063,0.051,0.004,0.367,0.015,0.004,0.069,0.002,0.002
0,ionosphere,351,33,35.8974,0.182,0.071,0.098,0.015,0.407,0.029,0.01,0.107,0.007,0.004
0,letter,1600,32,6.25,0.765,0.262,1.1291,0.018,0.59,0.265,0.135,1.7951,0.126,0.009
0,lympho,148,18,4.0541,0.063,0.107,0.046,0.008,0.369,0.011,0.004,0.071,0.002,0.003
0,mnist,7603,100,9.2069,12.1287,1.7731,70.771,0.087,2.8232,10.0006,9.2205,4.8683,6.7454,0.214
0,musk,3062,166,3.1679,3.4112,0.592,17.945,0.107,1.8761,2.5301,2.3321,21.3712,1.6381,0.209
0,optdigits,5216,64,2.8758,4.3572,0.849,19.2551,0.052,1.4841,3.1652,2.5081,2.1501,2.2141,0.071
0,pendigits,6870,16,2.2707,3.1732,0.4,5.9013,0.016,1.0141,1.0391,0.9371,3.0842,1.7071,0.015


___________________