# Initialization

In [1]:
import imp
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
#from data_parser import DataParser
from pull_data import Pull
import os
from sklearn.metrics import confusion_matrix

#from scikit_IsolatedForest import IsolatedForest
from sklearn.ensemble import IsolationForest
#from scikit_LOFNovelty import LOFNovelty
from sklearn.neighbors import LocalOutlierFactor
#from scikit_OneClassSVM import OCSVM
from sklearn.svm import OneClassSVM

In [2]:
IKEA_APP = "dev-annotated-datasets/ikea-app"
IKEA_HOMEKIT = "dev-annotated-datasets/ikea-homekit"
IP_CAM = "dev-annotated-datasets/ipcam"
NORMAL_USER = "dev-annotated-datasets/normal-user"
VOICE_ASSISTANT = "dev-annotated-datasets/voice-assistant"

In [3]:
def print_metrics(y, pred, thr_pred=0.5, label=""):
    #mse = metrics.mean_squared_error(y, pred) # MSE of (y - pred) is the same as Brier score
#    brier = metrics.brier_score_loss(y, pred)
    #logloss = metrics.log_loss(y, pred)
    print("### Metric",label,"###")
    try:
        tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
    except Exception as e:
        # TN in all cases
        tn = 0
        fp = 0
        fn = 0
        tp = confusion_matrix(y, pred).ravel()[0]
    #print(tn, fp, fn, tp)
    
    total = tp+tn+fp+fn
    acc = (tp+tn)/total
    if label == "valid":
        prec = tp/(tp+fp)
        rec = tp/(tp+fn)
        f1 = 2*(prec*rec)/(prec+rec)
    
        print("TP: {:7d} {:6.2f}%".format(tp, tp*100/total))
        print("FN: {:7d} {:6.2f}%".format(fn, fn*100/total))
        print("FP: {:7d} {:6.2f}%".format(fp, fp*100/total))
        print("TN: {:7d} {:6.2f}%".format(tn, tn*100/total))
        print("Accuracy:   {:6.2f}%".format(acc*100))
        print("Precision:  {:6.4f}".format(prec))
        print("Recall:     {:6.4f}".format(rec))
    
        print("F1 score:   {:6.4f}".format(f1))
    
    else:
        prec_n = tn/(tn+fn)
        rec_n = tn/(tn+fp)
        f1_n = 2*(prec_n*rec_n)/(prec_n+rec_n)
        print("TP: {:7d} {:6.2f}%".format(tp, tp*100/total))
        print("FN: {:7d} {:6.2f}%".format(fn, fn*100/total))
        print("FP: {:7d} {:6.2f}%".format(fp, fp*100/total))
        print("TN: {:7d} {:6.2f}%".format(tn, tn*100/total))
        print("Accuracy:   {:6.2f}%".format(acc*100))
        print("Precision Anomaly:  {:6.4f}".format(prec_n))
        print("Recall Anomaly:     {:6.4f}".format(rec_n))
        print("F1 score Anomaly:   {:6.4f}".format(f1_n))
   

In [4]:
t = Pull(IKEA_APP+"/train/",1)
a = Pull(IKEA_APP+"/anomaly/",1)
v = Pull(IKEA_APP+"/valid/",1)

388 9 0


## Isolated Forest

In [31]:
#Create Model
rng = np.random.RandomState(12345)
clf = IsolationForest(n_estimators = 100, max_samples="auto",max_features=1,bootstrap=False ,random_state=rng, behaviour='new', contamination='auto')

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)

kf = KFold(5, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
    

===== Iteration: 1 =====
### Metric valid ###
TP:      76  97.44%
FN:       2   2.56%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    97.44%
Precision:  1.0000
Recall:     0.9744
F1 score:   0.9870
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       8  88.89%
TN:       1  11.11%
Accuracy:    11.11%
Precision Anomaly:  1.0000
Recall Anomaly:     0.1111
F1 score Anomaly:   0.2000
===== Iteration: 2 =====
### Metric valid ###
TP:      77  98.72%
FN:       1   1.28%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    98.72%
Precision:  1.0000
Recall:     0.9872
F1 score:   0.9935
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       6  66.67%
TN:       3  33.33%
Accuracy:    33.33%
Precision Anomaly:  1.0000
Recall Anomaly:     0.3333
F1 score Anomaly:   0.5000
===== Iteration: 3 =====
### Metric valid ###
TP:      78 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision:  1.0000
Recall:     1.0000
F1

# LOF Novelty

In [28]:
#Create Model
clf = LocalOutlierFactor(n_neighbors = 20, metric = "chebyshev", novelty=True, contamination='auto')

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)

kf = KFold(5, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
    



===== Iteration: 1 =====
### Metric valid ###
TP:      61  78.21%
FN:      17  21.79%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    78.21%
Precision:  1.0000
Recall:     0.7821
F1 score:   0.8777
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       8  88.89%
TN:       1  11.11%
Accuracy:    11.11%
Precision Anomaly:  1.0000
Recall Anomaly:     0.1111
F1 score Anomaly:   0.2000
===== Iteration: 2 =====
### Metric valid ###
TP:      62  79.49%
FN:      16  20.51%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    79.49%
Precision:  1.0000
Recall:     0.7949
F1 score:   0.8857
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       8  88.89%
TN:       1  11.11%
Accuracy:    11.11%
Precision Anomaly:  1.0000
Recall Anomaly:     0.1111
F1 score Anomaly:   0.2000
===== Iteration: 3 =====
### Metric valid ###
TP:      63  80.77%
FN:      15  19.23%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    80.77%
Precision:  1.0000
Recall:     0.8077
F1

## OneClassSVM

In [30]:
#Create Model
clf = OneClassSVM(kernel='sigmoid',gamma="auto",coef0=0.0, nu=0.1)

#Train 
#clf.fit(t.data)

#Evaluate 
#y_pred_valid = clf.predict(v.data)
#y_pred_outliers = clf.predict(a.data)

# Measurement
#print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
#print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
# Detection quality score ( Signed distance is positive for an inlier and negative for an outlier.) 
#score_v = clf.decision_function(v.data)
#score_a = clf.decision_function(a.data)
#print(score_v)
#print(score_a)

kf = KFold(5, True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train
    clf.fit(t_data[train_index])
    #Evaluate 
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a.data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="anomaly")
    


===== Iteration: 1 =====
### Metric valid ###
TP:      78 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision:  1.0000
Recall:     1.0000
F1 score:   1.0000
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       9 100.00%
TN:       0   0.00%
Accuracy:     0.00%
Precision Anomaly:     nan
Recall Anomaly:     0.0000
F1 score Anomaly:      nan
===== Iteration: 2 =====
### Metric valid ###
TP:      78 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision:  1.0000
Recall:     1.0000
F1 score:   1.0000
### Metric anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       9 100.00%
TN:       0   0.00%
Accuracy:     0.00%
Precision Anomaly:     nan
Recall Anomaly:     0.0000
F1 score Anomaly:      nan
===== Iteration: 3 =====
### Metric valid ###
TP:      78 100.00%
FN:       0   0.00%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:   100.00%
Precision:  1.0000
Recall:     1.0000
F1



ZeroDivisionError: division by zero