# Importing Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from sklearn.cluster import DBSCAN
from collections import Counter
from scipy.spatial import ConvexHull, convex_hull_plot_2d
warnings.filterwarnings("ignore")

# Reading Cleaned Datasets

In [2]:
# Load the cleaned duration tables, one CSV per (house, occupant) pair.
# The directory is named once so relocating the data is a single edit.
CLEANED_DATA_DIR = "../../data/cleaned"

cleaned_duration_dataframe_house_A_occ_1 = pd.read_csv(CLEANED_DATA_DIR + "/Cleaned-Duration-Dataframe_House-A_Occ-1.csv")
cleaned_duration_dataframe_house_A_occ_2 = pd.read_csv(CLEANED_DATA_DIR + "/Cleaned-Duration-Dataframe_House-A_Occ-2.csv")
cleaned_duration_dataframe_house_B_occ_1 = pd.read_csv(CLEANED_DATA_DIR + "/Cleaned-Duration-Dataframe_House-B_Occ-1.csv")
cleaned_duration_dataframe_house_B_occ_2 = pd.read_csv(CLEANED_DATA_DIR + "/Cleaned-Duration-Dataframe_House-B_Occ-2.csv")

# Only the last expression of a cell is rendered, so the intermediate
# bare-name statements were no-ops; preview the final frame only.
cleaned_duration_dataframe_house_B_occ_2

Unnamed: 0,House,Day,Occupant-2 Activity ID,Occupant-2 Zone ID,Start Time,Duration
0,B,1,11,1,0,620
1,B,1,15,4,620,8
2,B,1,18,2,629,42
3,B,1,4,3,672,16
4,B,1,18,2,689,10
...,...,...,...,...,...,...
329,B,30,8,3,1272,12
330,B,30,12,2,1285,50
331,B,30,15,4,1336,1
332,B,30,12,2,1338,69


# Clustering

In [3]:
def dbscan(X, eps, min_samples):
    """Fit a DBSCAN model on ``X`` and return the fitted estimator.

    Parameters
    ----------
    X : feature matrix passed straight to ``DBSCAN.fit``.
    eps : forwarded to ``DBSCAN(eps=...)``.
    min_samples : forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    The fitted ``DBSCAN`` object; callers in this notebook read its
    ``labels_`` attribute and treat the label -1 as an anomaly.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return DBSCAN(eps=eps, min_samples=min_samples).fit(X)

# Anomaly Detection Model Testing for House-A and Occupant-1

In [4]:
# Evaluate the DBSCAN-based anomaly detection model for House-A, Occupant-1.
#
# For every training horizon (days <= 10, 15, 20, 25) each visit, described by
# (Start Time, Duration), is stacked under the training visits of the same zone
# and clustered with DBSCAN. A visit labelled -1 is counted as an anomaly,
# any other label as a positive; "accuracy" is the percentage of positives.
source_dataframe = cleaned_duration_dataframe_house_A_occ_1
column = "Occupant-1 Zone ID"
feature_columns = ['Start Time', 'Duration']
eps, min_samples = 20, 3

record = []
for day in range(10, 30, 5):
    print(day)

    # Split on the Day value itself instead of on row position, so the split
    # does not depend on the frame being sorted by Day with a 0..n-1 index.
    is_train = source_dataframe['Day'] <= day
    train_dataframe = source_dataframe[is_train]
    test_dataframe = source_dataframe[~is_train]

    # Build each zone's training feature matrix once per horizon. This replaces
    # the row-by-row DataFrame.append loop (removed in pandas 2.0) that was
    # re-run for every probed visit.
    zone_features = {
        zone: visits[feature_columns].to_numpy(dtype=float)
        for zone, visits in train_dataframe.groupby(column)
    }
    # A zone absent from the training window has no history; the probed visit
    # is then clustered on its own instead of raising a KeyError.
    unseen_zone = np.empty((0, len(feature_columns)))

    split_counts = []
    for split_dataframe in (train_dataframe, test_dataframe):
        zones = split_dataframe[column].to_numpy()
        points = split_dataframe[feature_columns].to_numpy(dtype=float)
        positives = anomalies = 0
        for zone, point in zip(zones, points):
            # The probed visit is stacked last, so its label is labels_[-1].
            # NOTE(review): for the training split the visit is already part of
            # its zone's matrix, so it appears twice here (as in the original).
            features = np.vstack((zone_features.get(zone, unseen_zone), point))
            if dbscan(features, eps, min_samples).labels_[-1] == -1:
                anomalies += 1
            else:
                positives += 1
        split_counts.append((positives, anomalies))

    (train_pos, train_anomaly), (test_pos, test_anomaly) = split_counts
    train_accuracy = (train_pos/(train_pos + train_anomaly))*100
    test_accuracy = (test_pos/(test_pos + test_anomaly))*100
    record.append([train_pos, train_anomaly, train_accuracy, test_pos, test_anomaly, test_accuracy])
    print(record)

10
[[218, 112, 66.06060606060606, 319, 333, 48.92638036809816]]
15
[[218, 112, 66.06060606060606, 319, 333, 48.92638036809816], [402, 107, 78.97838899803537, 300, 173, 63.424947145877375]]
20
[[218, 112, 66.06060606060606, 319, 333, 48.92638036809816], [402, 107, 78.97838899803537, 300, 173, 63.424947145877375], [551, 114, 82.85714285714286, 220, 97, 69.4006309148265]]
25
[[218, 112, 66.06060606060606, 319, 333, 48.92638036809816], [402, 107, 78.97838899803537, 300, 173, 63.424947145877375], [551, 114, 82.85714285714286, 220, 97, 69.4006309148265], [715, 109, 86.77184466019418, 114, 44, 72.15189873417721]]


In [5]:
# Tabulate the per-horizon counts collected in `record`, tag each row with its
# training horizon, save the table and display it.
# The horizons listed here must match range(10, 30, 5) in the evaluation loop.
adm_accuracy_dataframe_house_A_occ_1 = pd.DataFrame(
    record,
    columns=[
        'Train Positive', 'Train Anomaly', 'Train Accuracy',
        'Test Positive', 'Test Anomaly', 'Test Accuracy',
    ],
).assign(**{"Training Days": [10, 15, 20, 25]})
adm_accuracy_dataframe_house_A_occ_1.to_csv(
    '../../data/shatter/ADM_Accuracy_House-A_Occ-1.csv',
    index=False,
)
adm_accuracy_dataframe_house_A_occ_1

Unnamed: 0,Train Positive,Train Anomaly,Train Accuracy,Test Positive,Test Anomaly,Test Accuracy,Training Days
0,218,112,66.060606,319,333,48.92638,10
1,402,107,78.978389,300,173,63.424947,15
2,551,114,82.857143,220,97,69.400631,20
3,715,109,86.771845,114,44,72.151899,25


# Anomaly Detection Model Testing for House-A and Occupant-2

In [6]:
# Evaluate the DBSCAN-based anomaly detection model for House-A, Occupant-2.
#
# For every training horizon (days <= 10, 15, 20, 25) each visit, described by
# (Start Time, Duration), is stacked under the training visits of the same zone
# and clustered with DBSCAN. A visit labelled -1 is counted as an anomaly,
# any other label as a positive; "accuracy" is the percentage of positives.
source_dataframe = cleaned_duration_dataframe_house_A_occ_2
column = "Occupant-2 Zone ID"
feature_columns = ['Start Time', 'Duration']
eps, min_samples = 20, 3

record = []
for day in range(10, 30, 5):
    print(day)

    # Split on the Day value itself instead of on row position, so the split
    # does not depend on the frame being sorted by Day with a 0..n-1 index.
    is_train = source_dataframe['Day'] <= day
    train_dataframe = source_dataframe[is_train]
    test_dataframe = source_dataframe[~is_train]

    # Build each zone's training feature matrix once per horizon. This replaces
    # the row-by-row DataFrame.append loop (removed in pandas 2.0) that was
    # re-run for every probed visit.
    zone_features = {
        zone: visits[feature_columns].to_numpy(dtype=float)
        for zone, visits in train_dataframe.groupby(column)
    }
    # A zone absent from the training window has no history; the probed visit
    # is then clustered on its own instead of raising a KeyError.
    unseen_zone = np.empty((0, len(feature_columns)))

    split_counts = []
    for split_dataframe in (train_dataframe, test_dataframe):
        zones = split_dataframe[column].to_numpy()
        points = split_dataframe[feature_columns].to_numpy(dtype=float)
        positives = anomalies = 0
        for zone, point in zip(zones, points):
            # The probed visit is stacked last, so its label is labels_[-1].
            # NOTE(review): for the training split the visit is already part of
            # its zone's matrix, so it appears twice here (as in the original).
            features = np.vstack((zone_features.get(zone, unseen_zone), point))
            if dbscan(features, eps, min_samples).labels_[-1] == -1:
                anomalies += 1
            else:
                positives += 1
        split_counts.append((positives, anomalies))

    (train_pos, train_anomaly), (test_pos, test_anomaly) = split_counts
    train_accuracy = (train_pos/(train_pos + train_anomaly))*100
    test_accuracy = (test_pos/(test_pos + test_anomaly))*100
    record.append([train_pos, train_anomaly, train_accuracy, test_pos, test_anomaly, test_accuracy])
    print(record)

10
[[85, 116, 42.28855721393035, 110, 281, 28.13299232736573]]
15
[[85, 116, 42.28855721393035, 110, 281, 28.13299232736573], [162, 125, 56.44599303135889, 121, 184, 39.67213114754099]]
20
[[85, 116, 42.28855721393035, 110, 281, 28.13299232736573], [162, 125, 56.44599303135889, 121, 184, 39.67213114754099], [252, 135, 65.11627906976744, 98, 107, 47.80487804878049]]
25
[[85, 116, 42.28855721393035, 110, 281, 28.13299232736573], [162, 125, 56.44599303135889, 121, 184, 39.67213114754099], [252, 135, 65.11627906976744, 98, 107, 47.80487804878049], [381, 144, 72.57142857142857, 41, 26, 61.19402985074627]]


In [7]:
# Tabulate the per-horizon counts collected in `record`, tag each row with its
# training horizon, save the table and display it.
# The horizons listed here must match range(10, 30, 5) in the evaluation loop.
adm_accuracy_dataframe_house_A_occ_2 = pd.DataFrame(
    record,
    columns=[
        'Train Positive', 'Train Anomaly', 'Train Accuracy',
        'Test Positive', 'Test Anomaly', 'Test Accuracy',
    ],
).assign(**{"Training Days": [10, 15, 20, 25]})
adm_accuracy_dataframe_house_A_occ_2.to_csv(
    '../../data/shatter/ADM_Accuracy_House-A_Occ-2.csv',
    index=False,
)
adm_accuracy_dataframe_house_A_occ_2

Unnamed: 0,Train Positive,Train Anomaly,Train Accuracy,Test Positive,Test Anomaly,Test Accuracy,Training Days
0,85,116,42.288557,110,281,28.132992,10
1,162,125,56.445993,121,184,39.672131,15
2,252,135,65.116279,98,107,47.804878,20
3,381,144,72.571429,41,26,61.19403,25


# Anomaly Detection Model Testing for House-B and Occupant-1

In [8]:
# Evaluate the DBSCAN-based anomaly detection model for House-B, Occupant-1.
#
# For every training horizon (days <= 10, 15, 20, 25) each visit, described by
# (Start Time, Duration), is stacked under the training visits of the same zone
# and clustered with DBSCAN. A visit labelled -1 is counted as an anomaly,
# any other label as a positive; "accuracy" is the percentage of positives.
source_dataframe = cleaned_duration_dataframe_house_B_occ_1
column = "Occupant-1 Zone ID"
feature_columns = ['Start Time', 'Duration']
eps, min_samples = 20, 3

record = []
for day in range(10, 30, 5):
    print(day)

    # Split on the Day value itself instead of on row position, so the split
    # does not depend on the frame being sorted by Day with a 0..n-1 index.
    is_train = source_dataframe['Day'] <= day
    train_dataframe = source_dataframe[is_train]
    test_dataframe = source_dataframe[~is_train]

    # Build each zone's training feature matrix once per horizon. This replaces
    # the row-by-row DataFrame.append loop (removed in pandas 2.0) that was
    # re-run for every probed visit.
    zone_features = {
        zone: visits[feature_columns].to_numpy(dtype=float)
        for zone, visits in train_dataframe.groupby(column)
    }
    # A zone absent from the training window has no history; the probed visit
    # is then clustered on its own instead of raising a KeyError.
    unseen_zone = np.empty((0, len(feature_columns)))

    split_counts = []
    for split_dataframe in (train_dataframe, test_dataframe):
        zones = split_dataframe[column].to_numpy()
        points = split_dataframe[feature_columns].to_numpy(dtype=float)
        positives = anomalies = 0
        for zone, point in zip(zones, points):
            # The probed visit is stacked last, so its label is labels_[-1].
            # NOTE(review): for the training split the visit is already part of
            # its zone's matrix, so it appears twice here (as in the original).
            features = np.vstack((zone_features.get(zone, unseen_zone), point))
            if dbscan(features, eps, min_samples).labels_[-1] == -1:
                anomalies += 1
            else:
                positives += 1
        split_counts.append((positives, anomalies))

    (train_pos, train_anomaly), (test_pos, test_anomaly) = split_counts
    train_accuracy = (train_pos/(train_pos + train_anomaly))*100
    test_accuracy = (test_pos/(test_pos + test_anomaly))*100
    record.append([train_pos, train_anomaly, train_accuracy, test_pos, test_anomaly, test_accuracy])
    print(record)

10
[[91, 72, 55.828220858895705, 106, 255, 29.362880886426595]]
15
[[91, 72, 55.828220858895705, 106, 255, 29.362880886426595], [135, 97, 58.189655172413794, 105, 187, 35.95890410958904]]
20
[[91, 72, 55.828220858895705, 106, 255, 29.362880886426595], [135, 97, 58.189655172413794, 105, 187, 35.95890410958904], [225, 106, 67.97583081570997, 75, 118, 38.860103626943]]
25
[[91, 72, 55.828220858895705, 106, 255, 29.362880886426595], [135, 97, 58.189655172413794, 105, 187, 35.95890410958904], [225, 106, 67.97583081570997, 75, 118, 38.860103626943], [299, 139, 68.2648401826484, 45, 41, 52.32558139534884]]


In [9]:
# Tabulate the per-horizon counts collected in `record`, tag each row with its
# training horizon, save the table and display it.
# The horizons listed here must match range(10, 30, 5) in the evaluation loop.
adm_accuracy_dataframe_house_B_occ_1 = pd.DataFrame(
    record,
    columns=[
        'Train Positive', 'Train Anomaly', 'Train Accuracy',
        'Test Positive', 'Test Anomaly', 'Test Accuracy',
    ],
).assign(**{"Training Days": [10, 15, 20, 25]})
adm_accuracy_dataframe_house_B_occ_1.to_csv(
    '../../data/shatter/ADM_Accuracy_House-B_Occ-1.csv',
    index=False,
)
adm_accuracy_dataframe_house_B_occ_1

Unnamed: 0,Train Positive,Train Anomaly,Train Accuracy,Test Positive,Test Anomaly,Test Accuracy,Training Days
0,91,72,55.828221,106,255,29.362881,10
1,135,97,58.189655,105,187,35.958904,15
2,225,106,67.975831,75,118,38.860104,20
3,299,139,68.26484,45,41,52.325581,25


# Anomaly Detection Model Testing for House-B and Occupant-2

In [10]:
# Evaluate the DBSCAN-based anomaly detection model for House-B, Occupant-2.
#
# For every training horizon (days <= 10, 15, 20, 25) each visit, described by
# (Start Time, Duration), is stacked under the training visits of the same zone
# and clustered with DBSCAN. A visit labelled -1 is counted as an anomaly,
# any other label as a positive; "accuracy" is the percentage of positives.
source_dataframe = cleaned_duration_dataframe_house_B_occ_2
column = "Occupant-2 Zone ID"
feature_columns = ['Start Time', 'Duration']
eps, min_samples = 20, 3

record = []
for day in range(10, 30, 5):
    print(day)

    # Split on the Day value itself instead of on row position, so the split
    # does not depend on the frame being sorted by Day with a 0..n-1 index.
    is_train = source_dataframe['Day'] <= day
    train_dataframe = source_dataframe[is_train]
    test_dataframe = source_dataframe[~is_train]

    # Build each zone's training feature matrix once per horizon. This replaces
    # the row-by-row DataFrame.append loop (removed in pandas 2.0) that was
    # re-run for every probed visit.
    zone_features = {
        zone: visits[feature_columns].to_numpy(dtype=float)
        for zone, visits in train_dataframe.groupby(column)
    }
    # A zone absent from the training window has no history; the probed visit
    # is then clustered on its own instead of raising a KeyError.
    unseen_zone = np.empty((0, len(feature_columns)))

    split_counts = []
    for split_dataframe in (train_dataframe, test_dataframe):
        zones = split_dataframe[column].to_numpy()
        points = split_dataframe[feature_columns].to_numpy(dtype=float)
        positives = anomalies = 0
        for zone, point in zip(zones, points):
            # The probed visit is stacked last, so its label is labels_[-1].
            # NOTE(review): for the training split the visit is already part of
            # its zone's matrix, so it appears twice here (as in the original).
            features = np.vstack((zone_features.get(zone, unseen_zone), point))
            if dbscan(features, eps, min_samples).labels_[-1] == -1:
                anomalies += 1
            else:
                positives += 1
        split_counts.append((positives, anomalies))

    (train_pos, train_anomaly), (test_pos, test_anomaly) = split_counts
    train_accuracy = (train_pos/(train_pos + train_anomaly))*100
    test_accuracy = (test_pos/(test_pos + test_anomaly))*100
    record.append([train_pos, train_anomaly, train_accuracy, test_pos, test_anomaly, test_accuracy])
    print(record)

10
[[65, 66, 49.61832061068702, 52, 151, 25.615763546798032]]
15
[[65, 66, 49.61832061068702, 52, 151, 25.615763546798032], [98, 97, 50.256410256410255, 44, 95, 31.654676258992804]]
20
[[65, 66, 49.61832061068702, 52, 151, 25.615763546798032], [98, 97, 50.256410256410255, 44, 95, 31.654676258992804], [123, 122, 50.204081632653065, 33, 56, 37.07865168539326]]
25
[[65, 66, 49.61832061068702, 52, 151, 25.615763546798032], [98, 97, 50.256410256410255, 44, 95, 31.654676258992804], [123, 122, 50.204081632653065, 33, 56, 37.07865168539326], [143, 131, 52.18978102189781, 27, 33, 45.0]]


In [11]:
# Tabulate the per-horizon counts collected in `record`, tag each row with its
# training horizon, save the table and display it.
# The horizons listed here must match range(10, 30, 5) in the evaluation loop.
adm_accuracy_dataframe_house_B_occ_2 = pd.DataFrame(
    record,
    columns=[
        'Train Positive', 'Train Anomaly', 'Train Accuracy',
        'Test Positive', 'Test Anomaly', 'Test Accuracy',
    ],
).assign(**{"Training Days": [10, 15, 20, 25]})
adm_accuracy_dataframe_house_B_occ_2.to_csv(
    '../../data/shatter/ADM_Accuracy_House-B_Occ-2.csv',
    index=False,
)
adm_accuracy_dataframe_house_B_occ_2

Unnamed: 0,Train Positive,Train Anomaly,Train Accuracy,Test Positive,Test Anomaly,Test Accuracy,Training Days
0,65,66,49.618321,52,151,25.615764,10
1,98,97,50.25641,44,95,31.654676,15
2,123,122,50.204082,33,56,37.078652,20
3,143,131,52.189781,27,33,45.0,25
