In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import math

# Dataset name; it selects the CSV to load here and is reused further down
# when naming the saved autoencoder model.
dataset = 'pageblocks'
raw_file = "./data/{}.csv".format(dataset)

# The CSV is read without a header row, so columns are addressed by position.
df = pd.read_csv(raw_file, header=None)

# Every column except the last one is counted as a feature (the last column
# is used as the label by the cells below).
column_count = df.shape[1]
num_features = column_count - 1

feature_message = "Number of Features: {}".format(num_features)
print(feature_message)

Number of Features: 10


In [2]:
def partition_dataset(df, num_normal=20, num_anomalies=20, random_state=None):
    """Split a labelled frame into a training set and a small validation set.

    The last column of ``df`` is treated as the label: ``1`` marks a normal
    row and ``-1`` marks an anomaly. ``num_normal`` normal rows and
    ``num_anomalies`` anomalous rows are sampled without replacement to form
    the validation set; every remaining row goes to the training set.

    Unlike an in-place drop, ``df`` itself is left untouched, so the cell that
    calls this function can be re-run without shrinking the source data.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data; the label must be in the last column.
    num_normal : int, default 20
        Number of rows labelled ``1`` to move into the validation set.
    num_anomalies : int, default 20
        Number of rows labelled ``-1`` to move into the validation set.
    random_state : int or numpy random generator/state, optional
        Forwarded to ``DataFrame.sample`` for both draws. Leave as ``None``
        for a fresh random split on every call, or pass a seed to make the
        split reproducible.

    Returns
    -------
    (df_training, df_validation) : tuple of pandas.DataFrame
        ``df_validation`` holds the sampled normal rows followed by the
        sampled anomalies; ``df_training`` is ``df`` without those rows.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when more rows are requested than a
        class contains.
    """
    labels = df.iloc[:, -1]
    df_n = df[labels == 1].sample(num_normal, random_state=random_state)
    df_a = df[labels == -1].sample(num_anomalies, random_state=random_state)

    df_validation = pd.concat([df_n, df_a])

    # Rows are removed by index label on a new frame rather than in place,
    # so the caller's DataFrame is never mutated.
    df_training = df.drop(df_validation.index)

    return df_training, df_validation

df_training, df_validation = partition_dataset(df)

# Class make-up of the training split, read from the label (last) column:
# 1 marks a normal row, -1 marks an anomaly.
training_labels = df_training.iloc[:, -1]
total = len(df_training)
num_normal = int((training_labels == 1).sum())
num_anomalies = int((training_labels == -1).sum())

# Share of anomalies in the training split, expressed as a percentage.
contamination_ratio = (num_anomalies / total) * 100

# Feature matrices: every column except the trailing label column.
x_training = df_training.iloc[:, :-1].values
x_validation = df_validation.iloc[:, :-1].values

# Label vectors: the trailing column only.
y_training = training_labels.values
y_validation = df_validation.iloc[:, -1].values

print("Total: {}".format(total))
print("Normal Count: {}".format(num_normal))
print("Anomaly Count: {}".format(num_anomalies))
print("Contamination Ratio: {}".format(contamination_ratio))

Total: 5348
Normal Count: 4863
Anomaly Count: 485
Contamination Ratio: 9.06881077038145


In [3]:
from pyod.models.ecod import ECOD

# Fit the ECOD detector on the (unlabelled) training features.
clf = ECOD()
clf.fit(x_training)

# The detector's 1 (outlier) / 0 (inlier) output is translated to the
# dataset's own encoding: -1 for anomalies, 1 for normal rows.
predictions = np.where(clf.predict(x_validation) == 1, -1, 1)

# Pin the label order explicitly so the matrix layout does not depend on
# which labels happen to appear. With the anomaly class (-1) as the positive
# class, the rows (truth) / columns (prediction) read [[tp, fn], [fp, tn]].
cm = confusion_matrix(y_validation, predictions, labels=[-1, 1])
tp, fn, fp, tn = (int(count) for count in cm.ravel())

# Plain Python ints keep the four-way product from overflowing a fixed-width
# integer. A zero denominator (a class that is never present or never
# predicted) is scored as 0 instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

              precision    recall  f1-score   support

          -1       0.83      0.25      0.38        20
           1       0.56      0.95      0.70        20

    accuracy                           0.60        40
   macro avg       0.70      0.60      0.54        40
weighted avg       0.70      0.60      0.54        40

MCC: 0.280056016805602
F1: 0.38461538461538464


In [4]:
from pyod.models.copod import COPOD

# Fit the COPOD detector on the (unlabelled) training features.
clf = COPOD()
clf.fit(x_training)

# The detector's 1 (outlier) / 0 (inlier) output is translated to the
# dataset's own encoding: -1 for anomalies, 1 for normal rows.
predictions = np.where(clf.predict(x_validation) == 1, -1, 1)

# Pin the label order explicitly so the matrix layout does not depend on
# which labels happen to appear. With the anomaly class (-1) as the positive
# class, the rows (truth) / columns (prediction) read [[tp, fn], [fp, tn]].
cm = confusion_matrix(y_validation, predictions, labels=[-1, 1])
tp, fn, fp, tn = (int(count) for count in cm.ravel())

# Plain Python ints keep the four-way product from overflowing a fixed-width
# integer. A zero denominator (a class that is never present or never
# predicted) is scored as 0 instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))

              precision    recall  f1-score   support

          -1       0.83      0.25      0.38        20
           1       0.56      0.95      0.70        20

    accuracy                           0.60        40
   macro avg       0.70      0.60      0.54        40
weighted avg       0.70      0.60      0.54        40

MCC: 0.280056016805602
F1: 0.38461538461538464


In [None]:
import sys
import os

# Root of the sibling `pyneural` checkout, resolved against the current
# working directory (os.path.abspath('') expands to it). The imports below
# already rely on this layout, so the model path is derived from the same
# root instead of a machine-specific absolute path.
pyneural_root = os.path.abspath(os.path.join(os.path.abspath(''), '../pyneural'))

# Make the project's packages importable; the membership check keeps
# sys.path from growing every time this cell is re-run.
for subdir in ('lib', 'modules'):
    package_dir = os.path.join(pyneural_root, subdir)
    if package_dir not in sys.path:
        sys.path.append(package_dir)

from autoencoder import Autoencoder
from train_autoencoder import TrainAutoencoder

# NOTE(review): the model used to be written to a hard-coded
# /home/ralampay/workspace/pyneural/models/ directory; it now goes to
# <pyneural_root>/models/ -- confirm this is the intended location.
params = {
    'layers':        [num_features, num_features - 3],
    'batch_size':    5,
    'model_file':    os.path.join(pyneural_root, 'models', 'ae-{}.pth'.format(dataset)),
    'training_data': x_training,
    'epochs':        100,
    'learning_rate': 0.001
}

cmd = TrainAutoencoder(params)
cmd.execute()

Training model...
Storing data to tensor...
Epoch: 0


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1070/1070 [00:01<00:00, 658.36it/s, loss=0.0101]


Ave Loss: 0.042496682813164786
Epoch: 1


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1070/1070 [00:01<00:00, 709.28it/s, loss=0.00644]


Ave Loss: 0.008213793458830495
Epoch: 2


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1070/1070 [00:01<00:00, 690.13it/s, loss=0.0043]


Ave Loss: 0.005428713019099521
Epoch: 3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1070/1070 [00:01<00:00, 658.99it/s, loss=0.00288]


Ave Loss: 0.004207820525538681
Epoch: 4


 54%|███████████████████████████████████████████████████████████████████████████████████▌                                                                       | 577/1070 [00:00<00:00, 634.19it/s, loss=0.000763]

In [None]:
from auto_threshold_re import AutoThresholdRe
import torch

# Reuse the model held by the training command from the previous cell and
# derive a threshold from the training features.
model = cmd.model
x_tensor = torch.tensor(x_training).float()
clf = AutoThresholdRe(x_tensor, model)
clf.execute()

print("Optimal Threshold: {}".format(clf.optimal_threshold))

# NOTE(review): assumes AutoThresholdRe.predict returns labels in the same
# -1 (anomaly) / 1 (normal) encoding as y_validation -- confirm.
predictions = clf.predict(torch.tensor(x_validation).float())

# Pin the label order explicitly so the matrix layout does not depend on
# which labels happen to appear. With the anomaly class (-1) as the positive
# class, the rows (truth) / columns (prediction) read [[tp, fn], [fp, tn]].
cm = confusion_matrix(y_validation, predictions, labels=[-1, 1])
tp, fn, fp, tn = (int(count) for count in cm.ravel())

# Plain Python ints keep the four-way product from overflowing a fixed-width
# integer. A zero denominator (a class that is never present or never
# predicted) is scored as 0 instead of raising ZeroDivisionError.
mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = ((tn * tp) - (fn * fp)) / mcc_denominator if mcc_denominator else 0.0

f1_denominator = tp + (0.5 * (fp + fn))
f1 = tp / f1_denominator if f1_denominator else 0.0

print(classification_report(y_validation, predictions))
print("MCC: {}".format(mcc))
print("F1: {}".format(f1))