# Initialization

In [1]:
import imp
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
#from data_parser import DataParser
from pull_data import Pull
import os
from sklearn.metrics import confusion_matrix
from prettytable import PrettyTable
from statistics import mean

#from scikit_IsolatedForest import IsolatedForest
from sklearn.ensemble import IsolationForest
#from scikit_LOFNovelty import LOFNovelty
from sklearn.neighbors import LocalOutlierFactor
#from scikit_OneClassSVM import OCSVM
from sklearn.svm import OneClassSVM

In [2]:
IKEA_APP = "dev-annotated-datasets/ikea-app"
IKEA_HOMEKIT = "dev-annotated-datasets/ikea-homekit"
IP_CAM = "dev-annotated-datasets/ipcam"
NORMAL_USER = "dev-annotated-datasets/normal-user"
VOICE_ASSISTANT = "dev-annotated-datasets/voice-assistant"

# Function Definitions

In [3]:
class Metrics:
    def __init__(self,label):
        self.label = label
        self.accuracy = []
        self.precision = []
        self.recall = []
        self.f1 = []
        self.cnt = 0
    def update(self,y,pred):
        try:
            tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
        except Exception as e:
            # TN in all cases
            tn = 0
            fp = 0
            fn = 0
            tp = confusion_matrix(y, pred).ravel()[0]
        
        total = tp+tn+fp+fn
        accuracy = (tp+tn)/total
        if self.label == "Valid":
            precision = tp/(tp+fp)
            recall = tp/(tp+fn)
            f1 = 2*(precision*recall)/(precision+recall)
        else:
            try:
                precision = tn/(tn+fn) # Negative precision
                recall = tn/(tn+fp) # Negative recall
                f1 = 2*(precision*recall)/(precision+recall) # Negative f1
            except Exception as e:
                print(e)
                precision = 0
                f1 = 0
                recall = 0
        
        self.accuracy.append(accuracy)
        self.precision.append(precision)
        self.recall.append(recall)
        self.f1.append(f1)
        self.cnt += 1
        
    def print(self):
        table = PrettyTable()
        table.field_names = [self.label+" Data","Accuracy", "Precision", "Recall", "F1 score"]
        for i in range(len(self.accuracy)):
            table.add_row([i,round(self.accuracy[i],3),round(self.precision[i],3),round(self.recall[i],3),round(self.f1[i],3)])
        
        table.add_row(["Avg",round(mean(self.accuracy),3),round(mean(self.precision),3),round(mean(self.recall),3),round(mean(self.f1),3)])
        print(table)

In [4]:
def print_metrics(y, pred, thr_pred=0.5, label=""):
    #mse = metrics.mean_squared_error(y, pred) # MSE of (y - pred) is the same as Brier score
#    brier = metrics.brier_score_loss(y, pred)
    #logloss = metrics.log_loss(y, pred)
    print("### Metric",label,"###")
    try:
        tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
    except Exception as e:
        # TN in all cases
        tn = 0
        fp = 0
        fn = 0
        tp = confusion_matrix(y, pred).ravel()[0]
    #print(tn, fp, fn, tp)
    
    total = tp+tn+fp+fn
    acc = (tp+tn)/total
    if label == "Valid":
        prec = tp/(tp+fp)
        rec = tp/(tp+fn)
        f1 = 2*(prec*rec)/(prec+rec)
    
        print("TP: {:7d} {:6.2f}%".format(tp, tp*100/total))
        print("FN: {:7d} {:6.2f}%".format(fn, fn*100/total))
        print("FP: {:7d} {:6.2f}%".format(fp, fp*100/total))
        print("TN: {:7d} {:6.2f}%".format(tn, tn*100/total))
        print("Accuracy:   {:6.2f}%".format(acc*100))
        print("Precision:  {:6.4f}".format(prec))
        print("Recall:     {:6.4f}".format(rec))
        print("F1 score:   {:6.4f}".format(f1))
    
    else:
        try:
            prec_n = tn/(tn+fn)
            rec_n = tn/(tn+fp)
            f1_n = 2*(prec_n*rec_n)/(prec_n+rec_n)
        except Exception as e:
            print(e)
            prec_n = 0
            f1_n = 0
            rec_n = 0
        
        print("TP: {:7d} {:6.2f}%".format(tp, tp*100/total))
        print("FN: {:7d} {:6.2f}%".format(fn, fn*100/total))
        print("FP: {:7d} {:6.2f}%".format(fp, fp*100/total))
        print("TN: {:7d} {:6.2f}%".format(tn, tn*100/total))
        print("Accuracy:   {:6.2f}%".format(acc*100))
        print("Precision Anomaly:  {:6.4f}".format(prec_n))
        print("Recall Anomaly:     {:6.4f}".format(rec_n))
        print("F1 score Anomaly:   {:6.4f}".format(f1_n))
   

In [5]:
def runModel(models):
    for key, model in models.items():
        print("### Model Name:",key," ###")
        m_valid = Metrics(label="Valid")
        m_anomaly = Metrics(label="Anomaly")
        kf = KFold(5, True)
        t_data = np.array(t.data)
        a_data = np.array(a.data)
        iteration_cnt = 0
        for train_index, test_index in kf.split(t_data):
            iteration_cnt += 1
            #Train
            model.fit(t_data[train_index])
            #Evaluate 
            y_pred_valid = model.predict(t_data[test_index])
            y_pred_outliers = model.predict(a.data)
            # Add results to the metrics object
            m_valid.update([1]*len(y_pred_valid),y_pred_valid)
            m_anomaly.update([-1]*len(y_pred_outliers),y_pred_outliers)
            #print_metrics([1]*len(y_pred_valid),y_pred_valid,label="Valid")
            #print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="Anomaly")
        m_valid.print()
        m_anomaly.print()

# Pull Datasets

In [6]:
t = Pull(IKEA_APP+"/train/",1)
a = Pull(IKEA_APP+"/anomaly/",1)
v = Pull(IKEA_APP+"/valid/",1)
print("Valid:",len(t.data)," Anomaly:",len(a.data)," Valid:",len(v.data))
print("Number of features:",t.features_cnt)

Valid: 388  Anomaly: 16  Valid: 0
Number of features: 110


# MODELS

In [7]:
MODELS = {}
MODELS["IsolatedForest"] = {}
MODELS["LOF"] = {}
MODELS["OneClassSVM"] = {}
rng = np.random.RandomState(12345)
MODELS["IsolatedForest"]["IF1"] = IsolationForest(n_estimators = 100, max_samples="auto",max_features=1,bootstrap=False ,random_state=rng, behaviour='new', contamination='auto')
MODELS["LOF"]["LOF1"] = LocalOutlierFactor(n_neighbors = 20, metric = "chebyshev", novelty=True, contamination='auto')
MODELS["OneClassSVM"]["OSVM1"] = OneClassSVM(kernel='sigmoid',gamma="auto",coef0=0.0, nu=0.1)



In [8]:
runModel(MODELS["IsolatedForest"])
runModel(MODELS["LOF"])
runModel(MODELS["OneClassSVM"])

### Model Name: IF1  ###
+------------+----------+-----------+--------+----------+
| Valid Data | Accuracy | Precision | Recall | F1 score |
+------------+----------+-----------+--------+----------+
|     0      |  0.974   |    1.0    | 0.974  |  0.987   |
|     1      |  0.974   |    1.0    | 0.974  |  0.987   |
|     2      |  0.923   |    1.0    | 0.923  |   0.96   |
|     3      |  0.961   |    1.0    | 0.961  |   0.98   |
|     4      |   1.0    |    1.0    |  1.0   |   1.0    |
|    Avg     |  0.967   |    1.0    | 0.967  |  0.983   |
+------------+----------+-----------+--------+----------+
+--------------+----------+-----------+--------+----------+
| Anomaly Data | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|      0       |   0.75   |    1.0    |  0.75  |  0.857   |
|      1       |  0.312   |    1.0    | 0.312  |  0.476   |
|      2       |  0.312   |    1.0    | 0.312  |  0.476   |
|      3       |  0.062   |    1.0 



# Single Models

## Isolated Forest

In [9]:
#Create Model
rng = np.random.RandomState(12345)
clf = IsolationForest(n_estimators = 100, max_samples="auto",max_features=1,bootstrap=False ,random_state=rng, behaviour='new', contamination='auto')

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)


#m_valid = Metrics(label="Valid")
#m_anomaly = Metrics(label="Anomaly")
kf = KFold(3, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="Valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="Anomaly")
    
    #m_valid.update([1]*len(y_pred_valid),y_pred_valid)
    #m_anomaly.update([-1]*len(y_pred_outliers),y_pred_outliers)
#m_valid.print()
#m_anomaly.print()
    

===== Iteration: 1 =====
### Metric Valid ###
TP:     125  96.15%
FN:       5   3.85%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    96.15%
Precision:  1.0000
Recall:     0.9615
F1 score:   0.9804
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:      11  68.75%
TN:       5  31.25%
Accuracy:    31.25%
Precision Anomaly:  1.0000
Recall Anomaly:     0.3125
F1 score Anomaly:   0.4762
===== Iteration: 2 =====
### Metric Valid ###
TP:     110  85.27%
FN:      19  14.73%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    85.27%
Precision:  1.0000
Recall:     0.8527
F1 score:   0.9205
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:      10  62.50%
TN:       6  37.50%
Accuracy:    37.50%
Precision Anomaly:  1.0000
Recall Anomaly:     0.3750
F1 score Anomaly:   0.5455
===== Iteration: 3 =====
### Metric Valid ###
TP:     125  96.90%
FN:       4   3.10%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    96.90%
Precision:  1.0000
Recall:     0.9690
F1

### LOF Novelty

In [10]:
#Create Model
clf = LocalOutlierFactor(n_neighbors = 20, metric = "chebyshev", novelty=True, contamination='auto')

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)

kf = KFold(3, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="Valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="Anomaly")
    



===== Iteration: 1 =====
### Metric Valid ###
TP:      95  73.08%
FN:      35  26.92%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    73.08%
Precision:  1.0000
Recall:     0.7308
F1 score:   0.8444
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       4  25.00%
TN:      12  75.00%
Accuracy:    75.00%
Precision Anomaly:  1.0000
Recall Anomaly:     0.7500
F1 score Anomaly:   0.8571
===== Iteration: 2 =====
### Metric Valid ###
TP:      94  72.87%
FN:      35  27.13%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    72.87%
Precision:  1.0000
Recall:     0.7287
F1 score:   0.8430
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       4  25.00%
TN:      12  75.00%
Accuracy:    75.00%
Precision Anomaly:  1.0000
Recall Anomaly:     0.7500
F1 score Anomaly:   0.8571
===== Iteration: 3 =====
### Metric Valid ###
TP:      81  62.79%
FN:      48  37.21%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    62.79%
Precision:  1.0000
Recall:     0.6279
F1

### OneClassSVM

In [11]:
#Create Model
clf = OneClassSVM(kernel='sigmoid',gamma="auto",coef0=0.0, nu=0.1)

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( Signed distance is positive for an inlier and negative for an outlier.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)

kf = KFold(3, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="Valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="Anomaly")
    


===== Iteration: 1 =====
### Metric Valid ###
TP:       0   0.00%
FN:     130 100.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:     0.00%
Precision:     nan
Recall:     0.0000
F1 score:      nan
### Metric Anomaly ###
division by zero
TP:      16 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision Anomaly:  0.0000
Recall Anomaly:     0.0000
F1 score Anomaly:   0.0000
===== Iteration: 2 =====
### Metric Valid ###
TP:       0   0.00%
FN:     129 100.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:     0.00%
Precision:     nan
Recall:     0.0000
F1 score:      nan
### Metric Anomaly ###
division by zero
TP:      16 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision Anomaly:  0.0000
Recall Anomaly:     0.0000
F1 score Anomaly:   0.0000
===== Iteration: 3 =====
### Metric Valid ###
TP:       0   0.00%
FN:     129 100.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:     0.00%
Precis

