# Initialization

In [29]:
import imp
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
#from data_parser import DataParser
from pull_data import Pull
import os
from sklearn.metrics import confusion_matrix
from prettytable import PrettyTable
from statistics import mean

#from scikit_IsolatedForest import IsolatedForest
from sklearn.ensemble import IsolationForest
#from scikit_LOFNovelty import LOFNovelty
from sklearn.neighbors import LocalOutlierFactor
#from scikit_OneClassSVM import OCSVM
from sklearn.svm import OneClassSVM

In [2]:
# Relative paths to the annotated dataset directories, one per traffic source.
# The notebook appends "/train/", "/anomaly/" and "/valid/" to the selected
# root when pulling data.
IKEA_APP = "dev-annotated-datasets/ikea-app"
IKEA_HOMEKIT = "dev-annotated-datasets/ikea-homekit"
IP_CAM = "dev-annotated-datasets/ipcam"
NORMAL_USER = "dev-annotated-datasets/normal-user"
VOICE_ASSISTANT = "dev-annotated-datasets/voice-assistant"

# Function Definitions

In [3]:
class Metrics:
    """Accumulates per-fold accuracy, precision, recall and F1 for one data set.

    When ``label`` is ``"Valid"`` precision/recall/F1 are computed for the
    positive class (TP based).  For any other label they are computed for the
    negative class (TN based), i.e. they describe how well anomalies are
    detected.
    """

    def __init__(self,label):
        self.label = label
        # One entry per call to update().
        self.accuracy = []
        self.precision = []
        self.recall = []
        self.f1 = []
        self.cnt = 0

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator/denominator as a float, or 0.0 for a zero denominator."""
        return float(numerator) / float(denominator) if denominator else 0.0

    @staticmethod
    def _counts(y, pred):
        """Return (tn, fp, fn, tp) as plain ints for a binary problem.

        confusion_matrix() collapses to 1x1 when ``y`` and ``pred`` contain a
        single identical label.  In that case every sample is correct and is
        counted as TP when that label is positive (> 0) and as TN otherwise,
        which matches both the -1/+1 and the 0/1 label conventions.

        Raises:
            ValueError: if the confusion matrix is neither 2x2 nor 1x1.
        """
        matrix = confusion_matrix(y, pred)
        if matrix.shape == (2, 2):
            tn, fp, fn, tp = (int(value) for value in matrix.ravel())
            return tn, fp, fn, tp
        if matrix.shape == (1, 1):
            hits = int(matrix[0, 0])
            if next(iter(y)) > 0:
                return 0, 0, 0, hits
            return hits, 0, 0, 0
        raise ValueError(
            "Metrics expects binary labels, got a confusion matrix of shape {}".format(matrix.shape)
        )

    def update(self,y,pred):
        """Score one fold and append the results to the per-fold lists.

        Args:
            y: true labels of the fold.
            pred: predicted labels, same length as ``y``.

        Ratios with a zero denominator are recorded as 0.0 instead of
        producing NaN or raising.
        """
        tn, fp, fn, tp = self._counts(y, pred)

        total = tp+tn+fp+fn
        accuracy = self._ratio(tp+tn, total)
        if self.label == "Valid":
            precision = self._ratio(tp, tp+fp)
            recall = self._ratio(tp, tp+fn)
        else:
            precision = self._ratio(tn, tn+fn) # Negative precision
            recall = self._ratio(tn, tn+fp) # Negative recall
        f1 = self._ratio(2*precision*recall, precision+recall)

        self.accuracy.append(accuracy)
        self.precision.append(precision)
        self.recall.append(recall)
        self.f1.append(f1)
        self.cnt += 1

    def print(self):
        """Print a table with one row per recorded fold plus an average row.

        The average row is omitted when no fold has been recorded yet, because
        statistics.mean() raises on empty input.
        """
        table = PrettyTable()
        table.field_names = [self.label+" Data","Accuracy", "Precision", "Recall", "F1 score"]
        folds = zip(self.accuracy, self.precision, self.recall, self.f1)
        for i, scores in enumerate(folds):
            table.add_row([i] + [round(score, 3) for score in scores])

        if self.accuracy:
            table.add_row(["Avg",round(mean(self.accuracy),3),round(mean(self.precision),3),round(mean(self.recall),3),round(mean(self.f1),3)])
        print(table)

In [4]:
def _confusion_counts(y, pred):
    """Return (tn, fp, fn, tp) as plain ints for binary labels.

    confusion_matrix() collapses to 1x1 when ``y`` and ``pred`` contain a
    single identical label; every sample is then correct and is counted as TP
    when that label is positive (> 0) and as TN otherwise.
    """
    matrix = confusion_matrix(y, pred)
    if matrix.shape == (2, 2):
        tn, fp, fn, tp = (int(value) for value in matrix.ravel())
        return tn, fp, fn, tp
    if matrix.shape == (1, 1):
        hits = int(matrix[0, 0])
        if next(iter(y)) > 0:
            return 0, 0, 0, hits
        return hits, 0, 0, 0
    raise ValueError(
        "print_metrics expects binary labels, got a confusion matrix of shape {}".format(matrix.shape)
    )


def _safe_ratio(numerator, denominator):
    """Return numerator/denominator as a float, or 0.0 for a zero denominator."""
    return float(numerator) / float(denominator) if denominator else 0.0


def print_metrics(y, pred, thr_pred=0.5, label=""):
    """Print the confusion counts and derived scores for one evaluation.

    Args:
        y: true labels.
        pred: predicted labels, same length as ``y``.
        thr_pred: unused; kept so existing callers keep working.
        label: ``"Valid"`` reports precision/recall/F1 of the positive class;
            any other value reports them for the negative (anomaly) class.

    Ratios with a zero denominator are printed as 0 instead of NaN.
    """
    print("### Metric",label,"###")
    tn, fp, fn, tp = _confusion_counts(y, pred)

    total = tp+tn+fp+fn
    acc = _safe_ratio(tp+tn, total)
    if label == "Valid":
        prec = _safe_ratio(tp, tp+fp)
        rec = _safe_ratio(tp, tp+fn)
    else:
        prec = _safe_ratio(tn, tn+fn)
        rec = _safe_ratio(tn, tn+fp)
    f1 = _safe_ratio(2*prec*rec, prec+rec)

    for name, count in (("TP", tp), ("FN", fn), ("FP", fp), ("TN", tn)):
        print("{}: {:7d} {:6.2f}%".format(name, count, _safe_ratio(count*100, total)))
    print("Accuracy:   {:6.2f}%".format(acc*100))
    if label == "Valid":
        print("Precision:  {:6.4f}".format(prec))
        print("Recall:     {:6.4f}".format(rec))
        print("F1 score:   {:6.4f}".format(f1))
    else:
        print("Precision Anomaly:  {:6.4f}".format(prec))
        print("Recall Anomaly:     {:6.4f}".format(rec))
        print("F1 score Anomaly:   {:6.4f}".format(f1))
   

In [25]:
def runModel(models, train_data=None, anomaly_data=None, n_splits=5, random_state=None):
    """Cross-validate every model in ``models`` and print its metric tables.

    Each model is fitted on the training folds only, then scored on the
    held-out fold (expected label 1) and on the whole anomaly set (expected
    label -1).

    Args:
        models: mapping of display name -> estimator exposing fit()/predict().
        train_data: normal samples; defaults to the notebook global ``t.data``.
        anomaly_data: anomalous samples; defaults to the notebook global ``a.data``.
        n_splits: number of cross-validation folds.
        random_state: seed for the fold shuffling; pass an int for
            reproducible splits.
    """
    t_data = np.array(t.data if train_data is None else train_data)
    a_data = np.array(a.data if anomaly_data is None else anomaly_data)
    for key, model in models.items():
        print("### Model Name:",key," ###")
        m_valid = Metrics(label="Valid")
        m_anomaly = Metrics(label="Anomaly")
        # n_splits/shuffle must be passed by keyword: recent scikit-learn
        # releases no longer accept them positionally.
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        for train_index, test_index in kf.split(t_data):
            #Train
            model.fit(t_data[train_index])
            #Evaluate
            y_pred_valid = model.predict(t_data[test_index])
            y_pred_outliers = model.predict(a_data)
            # Add results to the metrics object
            m_valid.update([1]*len(y_pred_valid),y_pred_valid)
            m_anomaly.update([-1]*len(y_pred_outliers),y_pred_outliers)
        m_valid.print()
        m_anomaly.print()

# Pull Datasets

In [65]:
# Load the three splits of the selected dataset.
t = Pull(IKEA_APP +"/train/",1)
a = Pull(IKEA_APP +"/anomaly/",1)
v = Pull(IKEA_APP +"/valid/",1)
# The first count is the training split (t); it was previously mislabelled "Valid".
print("Train:",len(t.data)," Anomaly:",len(a.data)," Valid:",len(v.data))
print("Number of features:",t.features_cnt)

Error: failued to parse file dev-annotated-datasets/ikea-app/train/.DS_Store
Valid: 388  Anomaly: 16  Valid: 0
Number of features: 107


# MODELS

In [57]:
# Registry of candidate estimators, grouped by algorithm family and keyed by a
# short display name that runModel() prints.
MODELS = {}
MODELS["IsolatedForest"] = {}
MODELS["LOF"] = {}
MODELS["OneClassSVM"] = {}
rng = np.random.RandomState(12345)
# The `behaviour` argument is no longer passed: scikit-learn deprecated it in
# 0.22 and removed it in 0.24 (the former 'new' behaviour is now the only one),
# so passing it raises TypeError on current releases.
MODELS["IsolatedForest"]["IF1"] = IsolationForest(n_estimators = 250, max_samples='auto',max_features=5,bootstrap=True ,random_state=rng, contamination='auto')
#MODELS["IsolatedForest"]["IF2"] = IsolationForest(n_estimators = 20, max_samples='auto',max_features=5,bootstrap=True ,random_state=rng)
#MODELS["LOF"]["LOF1"] = LocalOutlierFactor(n_neighbors = 10, metric = "chebyshev", novelty=True, contamination=0.1)
# novelty=True is required so that predict() can be called on unseen samples.
MODELS["LOF"]["LOF2"] = LocalOutlierFactor(n_neighbors = 10, metric = "chebyshev", novelty=True, contamination='auto')
MODELS["LOF"]["LOF3"] = LocalOutlierFactor(n_neighbors = 10, metric = "canberra", novelty=True, contamination='auto')
#MODELS["LOF"]["LOF4"] = LocalOutlierFactor(n_neighbors = 10, metric = "canberra", novelty=True, contamination='auto')
MODELS["OneClassSVM"]["OSVM1"] = OneClassSVM(kernel='poly',gamma="auto",coef0=1, nu=0.2)



In [66]:
# Evaluate one model family per run; swap which line is uncommented to
# evaluate a different family.
#runModel(MODELS["IsolatedForest"])
runModel(MODELS["LOF"])
#runModel(MODELS["OneClassSVM"])

### Model Name: LOF2  ###
+------------+----------+-----------+--------+----------+
| Valid Data | Accuracy | Precision | Recall | F1 score |
+------------+----------+-----------+--------+----------+
|     0      |  0.859   |    1.0    | 0.859  |  0.924   |
|     1      |  0.833   |    1.0    | 0.833  |  0.909   |
|     2      |  0.833   |    1.0    | 0.833  |  0.909   |
|     3      |   0.87   |    1.0    |  0.87  |  0.931   |
|     4      |  0.896   |    1.0    | 0.896  |  0.945   |
|    Avg     |  0.858   |    1.0    | 0.858  |  0.924   |
+------------+----------+-----------+--------+----------+
+--------------+----------+-----------+--------+----------+
| Anomaly Data | Accuracy | Precision | Recall | F1 score |
+--------------+----------+-----------+--------+----------+
|      0       |  0.688   |    1.0    | 0.688  |  0.815   |
|      1       |  0.688   |    1.0    | 0.688  |  0.815   |
|      2       |  0.688   |    1.0    | 0.688  |  0.815   |
|      3       |  0.875   |    1.0

# Single Models

## Isolation Forest

### LOF Novelty

In [46]:
#Create Model
clf = LocalOutlierFactor(n_neighbors = 10, metric = "minkowski", novelty=True, contamination='auto')

# n_splits/shuffle must be passed by keyword: recent scikit-learn releases no
# longer accept them positionally.
kf = KFold(n_splits=3, shuffle=True)
t_data = np.array(t.data)
a_data = np.array(a.data)
iteration_cnt = 0
for train_index, test_index in kf.split(t_data):
    iteration_cnt += 1
    #Train on the training folds only
    clf.fit(t_data[train_index])
    #Evaluate on the held-out fold (expected label 1) and on all anomalies (expected label -1)
    y_pred_valid = clf.predict(t_data[test_index])
    y_pred_outliers = clf.predict(a_data)
    print("===== Iteration:",iteration_cnt,"=====")
    print_metrics([1]*len(y_pred_valid),y_pred_valid,label="Valid")
    print_metrics([-1]*len(y_pred_outliers),y_pred_outliers,label="Anomaly")
    score_v = clf.decision_function(t_data[test_index])
    score_a = clf.decision_function(a_data)
    # Highest decision score among the held-out normal samples.
    print(max(score_v))
   
    



===== Iteration: 1 =====
### Metric Valid ###
TP:      87  78.38%
FN:      24  21.62%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    78.38%
Precision:  1.0000
Recall:     0.7838
F1 score:   0.8788
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       4  25.00%
TN:      12  75.00%
Accuracy:    75.00%
Precision Anomaly:  1.0000
Recall Anomaly:     0.7500
F1 score Anomaly:   0.8571
0.5842882001362815
===== Iteration: 2 =====
### Metric Valid ###
TP:      96  86.49%
FN:      15  13.51%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    86.49%
Precision:  1.0000
Recall:     0.8649
F1 score:   0.9275
### Metric Anomaly ###
TP:       0   0.00%
FN:       0   0.00%
FP:       6  37.50%
TN:      10  62.50%
Accuracy:    62.50%
Precision Anomaly:  1.0000
Recall Anomaly:     0.6250
F1 score Anomaly:   0.7692
0.565677147539551
===== Iteration: 3 =====
### Metric Valid ###
TP:      98  89.09%
FN:      12  10.91%
FP:       0   0.00%
TN:       0   0.00%
Accuracy:    89.09%
Pre

### OneClassSVM