# Anamoly detection pyod using matfile 

## import python packages

In [1]:
import warnings
warnings.simplefilter('ignore')
import os 
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

from sklearn.model_selection import train_test_split
from scipy.io import loadmat




## import pyod models 

In [2]:
# linear models
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM

# proximity models
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS

# probability
from pyod.models.abod import ABOD

# outlier ensembles
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging


## import metrics packages

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from pyod.utils.example import visualize
from pyod.utils.data import evaluate_print
from pyod.utils.data import generate_data 

from sklearn.metrics import roc_auc_score

## define data files, read x & y

In [4]:
matfile_list = ['arrhythmia.mat',
                 'cardio.mat',
                 'glass.mat',
                 'ionosphere.mat',
                 'letter.mat',
                 'lympho.mat',
                 'mnist.mat',
                 'musk.mat',
                 'optdigits.mat',
                 'pendigits.mat',
                 'pima.mat',
                 'satellite.mat',
                 'satimage-2.mat',
                 'shuttle.mat',
                 'vertebral.mat',
                 'vowels.mat',
                 'wbc.mat']

In [5]:
matfile_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

## load mat file

In [6]:
data = loadmat('E:\AI-ML PYTHON/data/cardio.mat')


In [7]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [8]:
X = data['X']

In [9]:
X.shape[0]

1831

In [10]:
X.shape[1]

21

In [11]:
X.shape

(1831, 21)

In [12]:
y = data['y']
y.shape

(1831, 1)

## Input(Independent) Feature Shape in Mat file format¶


In [13]:
print(data['X'].shape, type(data['X']))

(1831, 21) <class 'numpy.ndarray'>


## Dependent/ Target /Output Feature shape¶


In [14]:
print(data['y'].shape, type(data['y']))

(1831, 1) <class 'numpy.ndarray'>


In [15]:
len(data)

5

In [16]:
data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [17]:
data.values()

dict_values([b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC', '1.0', [], array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]]), array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])])

In [18]:
data.items()

dict_items([('__header__', b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC'), ('__version__', '1.0'), ('__globals__', []), ('X', array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
        -0.28978574, -0.49329397],
       [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
        -0.25638541, -0.49329397],
       [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
        -0.25638541,  1.14001753],
       ...,
       [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
         0.24461959, -0.49329397],
       [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
         0.14441859, -0.49329397],
       [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
         3.58465295, -0.49329397]])), ('y', array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]]))])

## now create empty dataframe columns

In [19]:
df_columns = ['Data', 'Samples', 'Dimensions', 'Outlier Perc',
              'ABOD',  'FB', 'HBOS', 'IForest', 'KNN', 'LOF', 'MCD',
              'OCSVM', 'PCA']

In [20]:
len(df_columns)

13

## roc performance evaluation table

In [21]:
roc_df = pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


## precision n scores performance evaluation table

In [22]:
prn_df = pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


## Time Dataframe

In [23]:
time_df = pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


##  Loading and building all matfiles

In [24]:
from time import time
random_state = np.random.RandomState(32)

for matfile in matfile_list:
    print('\n...Processing', matfile, '...')
    mat = loadmat(os.path.join('E:\AI-ML PYTHON\data',matfile)) 
    
    X = mat['X']
    y = mat['y'].ravel()
    
    Outliers_fraction = np.count_nonzero(y)/ len(y)
    Outliers_percentage = round(Outliers_fraction * 100, ndigits = 4)
    
    # construct the containers for saving results
    roc_list = [matfile[:-4], X.shape[0], X.shape[1], Outliers_percentage] # to remove .mat char from the name of the strings in 
    prn_list = [matfile[:-4], X.shape[0], X.shape[1], Outliers_percentage]  # the matfile list like cardio.mat ta takeoff .mat
    time_list = [matfile[:-4], X.shape[0], X.shape[1], Outliers_percentage]
    
    # split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.4)
    
    # normalize the independent values
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    
    # building models in dict form

    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
        contamination=Outliers_fraction),
        'Feature Bagging': FeatureBagging(contamination=Outliers_fraction,
                                          random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)': HBOS(
            contamination=Outliers_fraction),
        'Isolation Forest': IForest(contamination=Outliers_fraction,
                                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=Outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(
            contamination=Outliers_fraction),
        'Minimum Covariance Determinant (MCD)': MCD(
            contamination=Outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=Outliers_fraction),
        'Principal Component Analysis (PCA)': PCA(
            contamination=Outliers_fraction, random_state=random_state),}
    
        
    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm,y_train)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)
    
        print('{clf_name} ROC:{roc}, precision n scores:{prn}, execution time:{duration}'.format(clf_name=clf_name, roc=roc, prn=prn, 
                                                              duration = duration))
        roc_list.append(roc)
        prn_list.append(prn)
    
    
    temp_df = pd.DataFrame(time_list)
    print(temp_df)
    temp_df = temp_df.transpose()
    print(temp_df)
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)
    print(time_df)
    
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)
    
        
        
        
                   
    


...Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7488, precision n scores:0.3478, execution time:4.4022
Feature Bagging ROC:0.7545, precision n scores:0.3913, execution time:1.1218
Histogram-base Outlier Detection (HBOS) ROC:0.8343, precision n scores:0.4783, execution time:5.1013
Isolation Forest ROC:0.7997, precision n scores:0.4783, execution time:1.4655
K Nearest Neighbors (KNN) ROC:0.7562, precision n scores:0.3478, execution time:0.2494
Local Outlier Factor (LOF) ROC:0.7545, precision n scores:0.3478, execution time:0.125
Minimum Covariance Determinant (MCD) ROC:0.7419, precision n scores:0.3043, execution time:2.8655
One-class SVM (OCSVM) ROC:0.7595, precision n scores:0.3478, execution time:0.0781
Principal Component Analysis (PCA) ROC:0.7611, precision n scores:0.3913, execution time:0.1441
             0
0   arrhythmia
1          452
2          274
3      14.6018
4       4.4022
5       1.1218
6       5.1013
7       1.4655
8       0.2494
9        0.1

In [25]:
print('Time complexity')
time_df


Time complexity


Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,4.4022,1.1218,5.1013,1.4655,0.2494,0.125,2.8655,0.0781,0.1441
0,cardio,1831,21,9.6122,1.3371,1.7012,0.0312,1.277,0.5004,0.2031,1.689,0.1718,0.0152
0,glass,214,9,4.2056,0.1406,0.1015,0.0156,0.9114,0.0312,0.0,0.125,0.0166,0.0036
0,ionosphere,351,33,35.8974,0.2465,0.1718,0.0273,0.9802,0.0625,0.0156,0.2148,0.0156,0.0156
0,letter,1600,32,6.25,1.1804,1.5255,0.0488,1.2931,0.4486,0.1718,3.5797,0.1562,0.0156
0,lympho,148,18,4.0541,0.0937,0.1099,0.016,0.8997,0.0312,0.0156,0.1406,0.0042,0.0158
0,mnist,7603,100,9.2069,16.9568,107.431,0.1933,5.6392,14.2231,12.6205,9.0041,10.1508,0.5116
0,musk,3062,166,3.1679,6.8473,28.6823,0.2226,3.9191,4.2853,3.9523,49.7836,2.3135,0.3271
0,optdigits,5216,64,2.8758,7.932,37.7099,0.1308,3.7043,5.1504,5.0644,5.909,3.7522,0.2197
0,pendigits,6870,16,2.2707,7.19,8.5481,0.0312,2.6713,2.398,1.2302,6.9898,2.3267,0.0322


In [26]:
print('ROC Performance')
roc_df


ROC Performance


Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.7488,0.7545,0.8343,0.7997,0.7562,0.7545,0.7419,0.7595,0.7611
0,cardio,1831,21,9.6122,0.6198,0.6076,0.832,0.9344,0.7514,0.5974,0.8686,0.9326,0.9445
0,glass,214,9,4.2056,0.8554,0.8394,0.8072,0.7831,0.8675,0.8353,0.8273,0.8153,0.755
0,ionosphere,351,33,35.8974,0.8993,0.8892,0.5459,0.8149,0.9273,0.8925,0.9662,0.7914,0.7563
0,letter,1600,32,6.25,0.9058,0.8905,0.6181,0.6498,0.8718,0.8831,0.81,0.622,0.5539
0,lympho,148,18,4.0541,0.924,0.9825,1.0,1.0,0.9708,0.9883,0.8012,0.9883,0.9942
0,mnist,7603,100,9.2069,0.7933,0.7206,0.5851,0.7794,0.8618,0.7238,0.871,0.861,0.8613
0,musk,3062,166,3.1679,0.1384,0.6868,1.0,1.0,0.8134,0.6834,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.5232,0.5745,0.878,0.683,0.3703,0.5793,0.3796,0.5024,0.5087
0,pendigits,6870,16,2.2707,0.7069,0.5593,0.9334,0.973,0.783,0.5314,0.8462,0.9426,0.9471


In [27]:
print('Precision @ n Performance')
prn_df


Precision @ n Performance


Unnamed: 0,Data,Samples,Dimensions,Outlier Perc,ABOD,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,arrhythmia,452,274,14.6018,0.3478,0.3913,0.4783,0.4783,0.3478,0.3478,0.3043,0.3478,0.3913
0,cardio,1831,21,9.6122,0.3077,0.1538,0.4487,0.4872,0.3846,0.141,0.4103,0.4744,0.6026
0,glass,214,9,4.2056,0.0,0.0,0.0,0.0,0.0,0.3333,0.0,0.0,0.0
0,ionosphere,351,33,35.8974,0.84,0.7,0.4,0.62,0.86,0.72,0.88,0.68,0.54
0,letter,1600,32,6.25,0.3333,0.4848,0.0909,0.1212,0.2727,0.4545,0.1515,0.1515,0.0909
0,lympho,148,18,4.0541,0.6667,0.6667,1.0,1.0,0.6667,0.6667,0.6667,0.6667,0.6667
0,mnist,7603,100,9.2069,0.4007,0.3367,0.1347,0.2795,0.468,0.3401,0.4276,0.4444,0.4209
0,musk,3062,166,3.1679,0.0286,0.4286,1.0,0.9714,0.3714,0.3143,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.069,0.0517,0.2586,0.0,0.0,0.0517,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0746,0.0597,0.3284,0.4328,0.0896,0.0597,0.0,0.3433,0.3433


In [32]:
evaluate_print("ABOD", y_test, test_scores)

ABOD ROC:0.868, precision @ rank n:0.2857
