# Clustering - Fixed Window - Approach 2
- Precisely crop the anomaly from the detections by syncing the subtrace before and after the anomaly w.r.t ref_samples
- To keep the length of the feature vector the same, we pad the features with trailing zeros to get a length of 500 (max length of a detection)
- The feature extraction is dependent on the corresponding normal-behaviour subtrace
- We tested this approach across all applications


In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode

# ---------------------------------------------------------------
# Trace configuration
# ---------------------------------------------------------------
CODE = 'mamba2'                    ### application (code): 'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'   ### behaviour folder of the faulty runs
BEHAVIOUR_NORMAL = 'normal'        ### behaviour folder of the normal runs
THREAD = 'single'                  ### single, multi
VER = 4                            ### format of data collection
WINDOW = 500                       ### window size for subsequence

base_dir = '../../trace_data'      ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = '/'.join([base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_NORMAL])
faultybase_path = '/'.join([base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_FAULTY])

print(normalbase_path, faultybase_path, sep='\n')

# ---------------------------------------------------------------
# Diagnosis configuration
# ---------------------------------------------------------------
IS_VAR_WINDOW = False              ### True, False; whether to use a variable window size


def _children(folder):
    '''Return every entry of `folder`, joined onto the folder path.'''
    return [os.path.join(folder, entry) for entry in os.listdir(folder)]


def _without_ds_store(paths):
    '''Return `paths` without the entries that contain '.DS_Store'.'''
    return [p for p in paths if '.DS_Store' not in p]


### folders holding the reference samples (normal runs) and the detections (faulty runs)
ref_samples_basepath = os.path.join(normalbase_path, f'diag_refsamples{WINDOW}')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')
test_labels_basepath = os.path.join(faultybase_path, 'labels')

print('ref_samples_path:\n', ref_samples_basepath)
print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

### collect the file paths; variable-window reference samples are not listed (not available)
ref_samples_path = _children(ref_samples_basepath)
train_varlist_path = [
    os.path.join(normalbase_path, entry)
    for entry in os.listdir(normalbase_path)
    if 'varlist' in entry
]

paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

test_subseq_path = _children(diag_subseq_basepath)
test_labels_path = _children(subseq_label_basepath)
eval_labels_path = _children(test_labels_basepath)

### drop '.DS_Store' entries from every list; detections additionally must contain '.json'
train_varlist_path = _without_ds_store(train_varlist_path)
varlist_path = _without_ds_store(varlist_path)
paths_label = _without_ds_store(paths_label)
ref_samples_path = _without_ds_store(ref_samples_path)
test_subseq_path = [p for p in _without_ds_store(test_subseq_path) if '.json' in p]
test_labels_path = _without_ds_store(test_labels_path)
eval_labels_path = _without_ds_store(eval_labels_path)

varlist_path.sort()

### only fixed-window reference samples can be used as training data
if IS_VAR_WINDOW:
    raise ValueError('Ref samples for variable window missing')
train_data_path = ref_samples_path

test_data_path = test_subseq_path

print(len(train_data_path))
print(len(test_data_path))
print('test_labels:\n', test_labels_path)
print('eval_labels:\n', eval_labels_path)



In [None]:
'''
TODO:

0. Take detection trace as the input
1. Identify the start of the detection that is correct: part that matches with the ref_samples
2. Skip the part that is correct and halt at the first incorrect event (anomaly)
3. This indicates the start of the first anomaly; thus add this and the next consecutive point to a new blank list (anomaly_instance), and halt at the next point
4. Identify if there is a correct part of the trace after that point by comparing with the ref_samples
    a. if there is no matching ref_sample, then shift to the next point and add it to the list (anomaly_instance). Repeat this until the end of the trace
    b. if there is a matching ref_sample, skip the matching part and halt at the first incorrect event (anomaly). Add this point to a new blank list (next anomaly_instance). Repeat it until the end of the trace
5. The collection of all the anomaly_instance lists gives the instances of the anomaly detected



Feature extraction and Clustering:
- use the separated instances to extract features
- cluster the features (start with kmeans)
- try the same feature extractors as Approach 1
    - TSFEL
    - SegLearn
    - CNN+LSTM
    - Autoencoder
    - our method

 

'''

## Detect and separate if multiple instances

In [None]:
# def get_detection_labels(test_labels_path, test_data_path):

#     test_class = {}
#     ### load the labels
#     test_class_labels = read_json(test_labels_path[0])
#     print('test_class_labels:', len(test_class_labels))
#     print('test_class_labels:', test_class_labels)

#     ### prepare the feature vectors for classification
#     for test_data in test_data_path:
#         file_name = test_data.split('/')[-1].split('.')[0]
#         # print(CODE, test_data)
#         class_list = test_class_labels[file_name]
#         test_class[test_data] = class_list
        
#     return test_class

# def split_instances(ref_samples, test_events, test_intervals, test_timestamps):
#     '''
#     check which part of detections are in sync with the corresponding ref_samples
#     first check was max matching events at the start, use this to select correspoinding ref_sample
#     for the selected ref_sample, if there are parts of the trace that are in sync with detection use them to split the trace in mutiple instance

#     ref_samples: list of reference samples of length 50
#     '''
#     WINDOW = 50
#     SLIDING_WINDOW = 20

#     assert np.array(ref_samples).shape[2] == 50, 'ref_samples should be of length 50'

#     test_data_len = len(test_events)
#     # print('test_events:', test_events)
#     # print('test_data_len:', test_data_len)
#     ### shortlist the reference samples which has first 5 elements same as the test_trace
#     selected_ref_events = []
#     selected_ref_intervals = []

#     feature_vector_event = np.zeros((test_data_len,))
#     feature_vector_interval = np.zeros((test_data_len,))
#     feature_vector_timestamps = np.zeros((test_data_len,))
#     # print(feature_vector_event.shape)
    
#     if test_data_len < SLIDING_WINDOW:
#         _test_events = test_events
#         _test_intervals = test_intervals
#         _test_timestamps = test_timestamps

#         _test_len = len(_test_events)
#         print('test_events:', len(_test_events))
#         print('COPY the logic from the next block')
#         # for ref_sample in ref_samples:
#         #     # print(ref_sample)
#         #     _ref_events = ref_sample[0]
#         #     _ref_intervals = ref_sample[1]

#         #     ###'ref_sample should be of len 50'
#         #     assert(len(_ref_events) == WINDOW)

#         #     _ref_events = _ref_events[:_test_len]
#         #     _ref_intervals = _ref_intervals[:_test_len]

#         #     print('ref_event:', len(_ref_events))

#             # break
#     else:
#         for i in range(0, test_data_len, SLIDING_WINDOW):
#             window_start = i
#             window_end = i+WINDOW
#             print('window:', window_start, window_end)
#             _test_events = test_events[window_start:window_end]
#             _test_intervals = test_intervals[window_start:window_end]
#             _test_timestamps = test_timestamps[window_start:window_end]

#             _test_len = len(_test_events)
#             print('test_events:', len(_test_events))

#             shortlisted_ref_events = []
#             shortlisted_ref_intervals = []
#             zero_count = []
#             sample_selected = False
#             for ref_sample in ref_samples:
#                 # print(ref_sample)
#                 _ref_events = ref_sample[0]
#                 _ref_intervals = ref_sample[1]

#                 ###'ref_sample should be of len 50'
#                 # assert(len(_ref_events) == WINDOW)

#                 _ref_events = _ref_events[:_test_len]
#                 _ref_intervals = _ref_intervals[:_test_len]

#                 # print('ref_event:', len(_ref_events))
#                 if _ref_events[:2] == _test_events[:2]:
#                     diff_events = np.array(_ref_events) - np.array(_test_events)
#                     diff_intervals = np.abs(np.array(_ref_intervals) - np.array(_test_intervals))
#                     # print('diff_events:', diff_events)

#                     if all(diff_events == 0):
#                         print('All events are same')
#                         selected_ref_events.append(_ref_events)
#                         selected_ref_intervals.append(_ref_intervals)
#                         feature_vector_event[window_start:window_end] = diff_events
#                         feature_vector_interval[window_start:window_end] = diff_intervals
#                         feature_vector_timestamps[window_start:window_end] = _test_timestamps
#                         sample_selected = True
#                         break   ### part of the logic, do not remove
#                     else:
#                         count = 0
#                         # print(sf[0], sf[1])
#                         for esf, esi in zip(diff_events, diff_intervals):
#                             ### check if events and intervals are same
#                             # if esf == 0 and esi < 5:
#                             if esf == 0:
#                                 count += 1
#                             else:
#                                 break   ### part of the logic, do not remove

#                         if _ref_events not in shortlisted_ref_events:
#                             zero_count.append(count)
#                             shortlisted_ref_events.append(_ref_events)
#                             shortlisted_ref_intervals.append(_ref_intervals)

#             if not sample_selected:
#                 if len(zero_count) != 0:
#                     max_zero_count = max(zero_count)
#                     zero_count = np.array(zero_count)
#                     max_zero_count_ind = np.where(zero_count==max_zero_count)[0][0]
#                     print('zero_count:', zero_count)
#                     print('max_zero_count:', max_zero_count)

#                     # print('max_zero_count_ind:', max_zero_count_ind)
#                     _ref_events = shortlisted_ref_events[max_zero_count_ind]
#                     _ref_intervals = shortlisted_ref_intervals[max_zero_count_ind]
#                     selected_ref_events.append(_ref_events)
#                     selected_ref_intervals.append(_ref_intervals)

#                     diff_events = np.array(_ref_events) - np.array(_test_events)
#                     diff_intervals = np.abs(np.array(_ref_intervals) - np.array(_test_intervals))
#                     # print('selected_ref_events:', selected_ref_events[:max_zero_count+1])
#                     feature_vector_event[window_start:window_end] = diff_events
#                     feature_vector_interval[window_start:window_end] = diff_intervals
#                     feature_vector_timestamps[window_start:window_end] = _test_timestamps
#                 else:
#                     ### no matching ref_sample found
#                     pass ### do nothing, part of the logic

#             # break
        
#     return feature_vector_event, feature_vector_interval, feature_vector_timestamps



def strip_correct_part(ref_samples, test_events, test_intervals):
    '''
    Find the reference sample that agrees with the longest leading part of a test trace.

    A reference sample is a candidate only if its first two events are equal to
    the first two events of `test_events`. For every candidate the number of
    consecutive leading events that are equal to the test trace is counted, and
    the candidate with the highest count is selected (the first one on ties).

    Only events take part in the comparison; the interval tolerance check is
    disabled, so `test_intervals` is accepted but not used for matching.

    The comparison runs over the common prefix of the reference and the test
    events. A reference sample that is shorter than the test trace therefore
    limits the count to its own length instead of raising a shape error.

    ref_samples: iterable of samples, indexed as sample[0] = events and sample[1] = intervals
    test_events: list of events of the trace under test
    test_intervals: list of intervals of the trace under test (not used for matching)

    returns: (max_zero_count, selected_ref_events, selected_ref_intervals)
        max_zero_count: number of leading events equal to the selected reference,
            or None when nothing matched
        selected_ref_events, selected_ref_intervals: the selected reference sample,
            cut to at most len(test_events) entries; None when there is no candidate
    '''
    test_data_len = len(test_events)
    print('test_data_len:', test_data_len)

    ### shortlist the reference samples whose first 2 events equal those of the test trace
    shortlisted_ref_events = []
    shortlisted_ref_intervals = []
    zero_count = []
    for ref_sample in ref_samples:
        if ref_sample[0][:2] != test_events[:2]:
            continue

        ref_events = ref_sample[0][:test_data_len]
        ref_intervals = ref_sample[1][:test_data_len]

        ### count the leading events that agree; zip() stops at the shorter
        ### sequence, so differing lengths are compared over the common prefix
        count = 0
        for ref_event, test_event in zip(ref_events, test_events):
            if ref_event != test_event:
                break   ### part of the logic, do not remove
            count += 1

        ### deduplicate the ref samples
        if ref_events not in shortlisted_ref_events:
            zero_count.append(count)
            shortlisted_ref_events.append(ref_events)
            shortlisted_ref_intervals.append(ref_intervals)

    if not zero_count:
        ### no reference sample starts like the test trace
        print('func: No match found')
        return None, None, None

    ### select the reference sample with the longest matching start (first one on ties)
    max_zero_count = max(zero_count)
    best_ind = zero_count.index(max_zero_count)
    selected_ref_events = shortlisted_ref_events[best_ind]
    selected_ref_intervals = shortlisted_ref_intervals[best_ind]

    if max_zero_count == 0:
        print('func: No match found')
        return None, selected_ref_events, selected_ref_intervals

    if max_zero_count == len(test_events):
        print('func: All events are same')
    return max_zero_count, selected_ref_events, selected_ref_intervals



In [None]:
##########################################################
##########################################################
    
### read every fixed-window reference sample, in the order of train_data_path
ref_samples = [read_traces(sample_path) for sample_path in train_data_path]


#########################################################
#########################################################

### load the test samples and compare with the reference samples
### Purpose: for every detection file, cut the trace into anomaly instances by repeatedly
### matching the head of the remaining trace against the reference samples.
### Results (one entry per processed file, kept in step with each other):
###   anomaly_instances  -> list of instances, each instance is a list of events
###   anomaly_timestamps -> list of instances, each instance is a list of timestamps
###   test_files         -> path of the detection file
anomaly_instances = []
anomaly_timestamps = []
test_files = []
for test_data in test_data_path[0:]:
    print('test_data:', test_data)
    ### read the subseq
    test_trace = read_traces(test_data)
    print('test_trace:', test_trace)
    test_data_len = len(test_trace)
    # print('test_data_len:', test_data_len)

    # if test_data_len > 500:
    #     # print('test data length is more than 500, skipping...')
    #     # missing_features.append((test_data, 'test data length is more than 500'))
    #     # continue

    #     print('test data length is more than 500, truncating...')
    #     test_trace = test_trace[:500]
    #     test_data_len = 500
    
    ### transform the test trace from [(var,ts1), (var,ts2), (var, ts3)] to [[var1, var2, var3], [ts1, ts2, ts3]]
    ### old implementation with 0 at start of intervals
    # test_events = []
    # test_intervals = []
    # test_timestamps = []
    # prev_time = test_trace[0][1]
    # time_diff = 0
    # for x in test_trace:
    #     time_diff = x[1] - prev_time
    #     test_intervals.append(time_diff)
    #     prev_time = x[1]
    #     test_events.append(x[0])
    #     test_timestamps.append(x[1])

    ### new implementation without 0 at start of intervals
    ### every consecutive pair of points yields one entry, so the last point of the trace
    ### contributes only to the final interval and the lists are one shorter than the trace
    test_events = []
    test_intervals = []
    test_timestamps = []
    for x,y in zip(test_trace[:-1], test_trace[1:]):
        test_events.append(x[0])   ### first event
        test_intervals.append(y[1] - x[1])   ### difference between the timestamps of second and first event
        test_timestamps.append(x[1])   ### first timestamp
    # print('test_events:', len(test_events))
    # print('test_intervals:', test_intervals)

    assert len(test_events)+1 == len(test_intervals)+1 == test_data_len

    # ### get detection class

    # test_class_labels = read_json(test_labels_path[0])

    ### per-file state
    ### an_instance / an_timestamps: the anomaly instance that is currently being collected
    ### all_instances / all_timestamps: the finished instances of this file
    ### all_ref_* / all_striped_*: input and selected reference of every matching attempt
    ###   (re-created for every file, so after the loop they describe the last file only)
    an_instance = []
    an_timestamps = []
    all_instances = []
    all_timestamps = []
    all_ref_events = []
    all_ref_intervals = []
    all_striped_test_events = []
    all_striped_test_intervals = []
    all_striped_timestamps = []

    ### striped_* hold the part of the trace that is not processed yet
    striped_test_events = test_events
    striped_test_intervals = test_intervals
    striped_timestamps = test_timestamps
    print('striped_test_events:', striped_test_events, len(striped_test_events))
    # print('striped_test_intervals:', striped_test_intervals, len(striped_test_intervals))
    ### has_checked: False -> a mismatch is re-checked first; True -> the mismatch is recorded
    has_checked = False
    i = 0   ### iteration counter, only incremented (used by the commented-out debug loop below)
    while len(striped_test_events) > 0:
    # while i < 6:
        ### first collect the trace that is given as input
        all_striped_test_events.append(striped_test_events)
        all_striped_test_intervals.append(striped_test_intervals)
        all_striped_timestamps.append(striped_timestamps)
        ### remove the initial correct part of the trace
        # print('ref_samples:', ref_samples, striped_test_events, striped_test_intervals)
        print('give input:', len(striped_test_events))
        ### max_zero_count is the number of leading events that match the selected reference,
        ### or None when no reference matched
        max_zero_count, selected_ref_events, selected_ref_intervals = strip_correct_part(ref_samples, striped_test_events, striped_test_intervals)
        print('max_zero_count:', max_zero_count, 'len:', len(striped_test_events))
        if max_zero_count == len(striped_test_events):
            ### the whole remaining trace matches a reference: nothing is left to inspect
            print('All events are same')
            striped_test_events = []
            striped_test_intervals = []
            striped_timestamps = []
        # elif max_zero_count != None:
        #     striped_test_events = striped_test_events[max_zero_count-1:]
        #     striped_test_intervals = striped_test_intervals[max_zero_count-1:]
        #     striped_timestamps = striped_timestamps[max_zero_count-1:]
        print('striped_test_events 1:', striped_test_events, len(striped_test_events))
        ### get the ref trace that matched with the test trace
        all_ref_events.append(selected_ref_events)
        all_ref_intervals.append(selected_ref_intervals)

        # break
        ### store the first anomaly instance (first two consecutive points)
        if max_zero_count != None:
            ### a reference matched the start of the remaining trace
            # print('debug', max_zero_count, len(striped_test_events))
            if len(striped_test_events) != 0:
                if has_checked:
                    ### second pass: the mismatch is confirmed, close the open instance first
                    if len(an_instance) != 0:
                        ### new anomaly instance detected
                        all_instances.append(an_instance)
                        all_timestamps.append(an_timestamps)
                        an_instance = []
                        an_timestamps = []

                    ### start, where first anomaly instance is detected
                    ### the new instance starts with the last matching event and the event after it
                    ### (indices max_zero_count-1 and max_zero_count, clipped to the trace bounds)
                    print('Checked, new instance created')
                    seq_start = np.clip(max_zero_count-1, 0, None)
                    seq_end = np.clip(seq_start+2, 0, len(striped_test_events))
                    print('seq_start:', seq_start, 'seq_end:', seq_end)
                    an_instance.extend(striped_test_events[seq_start:seq_end])
                    an_timestamps.extend(striped_timestamps[seq_start:seq_end])
                    ### continue after the points that were just stored
                    striped_test_events = striped_test_events[seq_end:]
                    striped_test_intervals = striped_test_intervals[seq_end:]
                    striped_timestamps = striped_timestamps[seq_end:]
                    # print('an_instance:', an_instance)
                    print('striped_test_events 3.1:', striped_test_events, len(striped_test_events))
                    has_checked = False
                else:
                    ### first pass: drop the leading step_back events and match the rest again
                    ### step_back = max(max_zero_count-30, 5), i.e. at least 5 events are dropped
                    ### NOTE(review): if step_back >= len(striped_test_events) the remainder is empty and
                    ### the `continue` ends the while loop without appending this file to
                    ### anomaly_instances / test_files - confirm whether that is intended
                    print('Detected discripancy, rechecking...')
                    step_back = np.clip(max_zero_count-30, 5, None)
                    print('steping back', step_back)
                    striped_test_events = striped_test_events[step_back:]
                    striped_test_intervals = striped_test_intervals[step_back:]
                    striped_timestamps = striped_timestamps[step_back:]
                    has_checked = True
                    continue

                print('striped_test_events 2:', striped_test_events, len(striped_test_events))
            else:
                ### the remaining trace matched completely: close the open instance and store the file
                if len(an_instance) != 0:
                    all_instances.append(an_instance)
                    all_timestamps.append(an_timestamps)
                    an_instance = []
                    an_timestamps = []
                anomaly_instances.append(all_instances)
                anomaly_timestamps.append(all_timestamps)
                test_files.append(test_data)
                print('DONE 1')
                break
        else:
            ### no reference matches the start of the remaining trace
            if len(striped_test_events) > 1:
                ### normal functionality loop
                ### add the second remaining point to the open instance and move on by one point
                print('extending previous instance')
                an_instance.extend(striped_test_events[1:2])
                an_timestamps.extend(striped_timestamps[1:2])
                striped_test_events = striped_test_events[1:]
                striped_test_intervals = striped_test_intervals[1:]
                striped_timestamps = striped_timestamps[1:]
                print('striped_test_events 3.2:', striped_test_events, len(striped_test_events))
            else:
                ### for last iteration, when len(striped_test_events) == 1   
                if len(an_instance) != 0:
                    all_instances.append(an_instance)
                    all_timestamps.append(an_timestamps)
                    an_instance = []
                    an_timestamps = []
                anomaly_instances.append(all_instances)
                anomaly_timestamps.append(all_timestamps)
                test_files.append(test_data)
                print('DONE 2')
                break
        print('an_instance:', an_instance)
        print('all_instances:', all_instances)
        ### the trace ended right after the points that were stored above: store the file
        if len(striped_test_events) == 0:
            all_instances.append(an_instance)
            all_timestamps.append(an_timestamps)
            an_instance = []
            an_timestamps = []
            anomaly_instances.append(all_instances)
            anomaly_timestamps.append(all_timestamps)
            test_files.append(test_data)
            print('DONE 3')
        print('')    
        i += 1
    # break
print('anomaly_instances:', anomaly_instances) 
print('anomaly_timestamps:', anomaly_timestamps)
        



In [None]:
# for an in anomaly_instances:
#     print('an:', an)
#     print('len:', len(an))
#     print('')

In [None]:
### class labels of the detections, read from the first label file only
test_class_labels = read_json(test_labels_path[0])

### show, per detection file, the labelled classes next to the extracted instances
for subseq_path, found_instances, found_timestamps in zip(test_files, anomaly_instances, anomaly_timestamps):
    ### lookup key: file name up to its first '.'
    base_name = subseq_path.split('/')[-1]
    file_name = base_name.split('.')[0]
    print(file_name)

    class_labels = test_class_labels[file_name]
    n_labels = len(class_labels)
    n_found = len(found_instances)

    print('length of labels and predictions:', n_labels, n_found)
    print(class_labels)
    print(found_instances)
    print(found_timestamps)

    print('')

### Evaluation

In [None]:
ei = exeInt()

#### check how many instances are identified correctly

### ground truth: labels of every trial file, keyed by the label file name
### (name up to the first '.', without its last 7 characters)
all_test_labels = dict()
for label_path in eval_labels_path:
    label_stem = label_path.split('/')[-1].split('.')[0]
    file_name = label_stem[:-7]
    eval_labels = read_json(label_path)
    label_groups = eval_labels['labels']
    ### only the first entry under 'labels' is used
    key = list(label_groups.keys())[0]
    labels = label_groups[key]
    all_test_labels[file_name] = labels

y_pred_all = []
y_true_all = []
correct = 0
incorrect = 0
for file, instances, inst_time in zip(test_files, anomaly_instances, anomaly_timestamps):
    print(file)
    ### '<trial name>_<start>-<end>.<ext>' -> trial name and the bounds of the subsequence
    stem_parts = file.split('/')[-1].split('.')[0].split('_')
    file_name = '_'.join(stem_parts[:-1])
    print('file_name:', file_name)
    subseq_ind = stem_parts[-1]
    subseq_bounds = subseq_ind.split('-')
    subseq_start = int(subseq_bounds[0])
    subseq_end = int(subseq_bounds[1])

    ### check against labels if the instances are correct
    labels = all_test_labels[file_name]
    print('labels:', labels)

    ### one detection per instance: (first event, last event), (first timestamp, last timestamp), trial name
    detection = [
        [(events[0], events[-1]), (stamps[0], stamps[-1]), file_name]
        for events, stamps in zip(instances, inst_time)
    ]

    ### keep the labels whose first value lies in (subseq_start-5, subseq_end-5)
    ### and whose second value lies in (subseq_start+5, subseq_end+5)
    ### presumably label[0] / label[1] are the start / end of the labelled anomaly - TODO confirm
    gts = [
        label for label in labels
        if subseq_start-5 < label[0] < subseq_end-5 and subseq_start+5 < label[1] < subseq_end+5
    ]

    print('detection:', detection)
    print('gts:', gts)
    correct_pred, rest_pred, y_pred, y_true, false_neg = ei.get_correct_detections(detection, gts)
    print(correct_pred, rest_pred, y_pred, y_true, false_neg)

    y_pred_all.extend(y_pred)
    y_true_all.extend(y_true)


In [None]:
### Evaluation metrics

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, average_precision_score, ConfusionMatrixDisplay


# Precision over the predictions of all evaluated files
precision = precision_score(y_true_all, y_pred_all)
print(f'Precision: {precision:.4f}')

# Recall over the same predictions
recall = recall_score(y_true_all, y_pred_all)
print(f'Recall: {recall:.4f}')

# Average precision is not reported at the moment (average_precision_score is imported but unused)

# F1 score
f1 = f1_score(y_true_all, y_pred_all)
print(f"F1 Score: {f1:.4f}")

# Confusion matrix, printed as returned by sklearn
conf_matrix = confusion_matrix(y_true_all, y_pred_all)
print("Confusion Matrix:", conf_matrix, sep='\n')

# A 1x1 matrix (only one class present) is expanded to 2x2 for the display;
# its single count is placed in the anomaly/anomaly cell.
# NOTE(review): this treats the single class as 'anomaly' - confirm that it cannot be 'normal'
if conf_matrix.shape[0] == 1:
    single_count = conf_matrix[0][0]
    conf_matrix = np.array([[0, 0], [0, single_count]])

disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=['normal', 'anomaly'],
)
disp.plot()

### Plot Trace

In [None]:
# # x_axis = np.arange(0, len(test_trace), 1)



# ### prepare test_trace for plotting
# plot_data = dict()
# plot_data['subseq'] = test_events   ### y_data (traces)

# # for i, fv in enumerate(shortlisted_ref_samples):
# #     plot_data[f'feat1_{i}'] = fv[0]
# plot_data['ref_samples'] = selected_ref_events
    
# df_feat1 = pd.DataFrame(plot_data)

# plot_data = dict()
# plot_data['intervals'] = test_intervals   ### y_data (traces)

# # for i, fv in enumerate(feature_vectors):
# #     plot_data[f'feat2_{i}'] = fv[1]
# plot_data['ref_intervals'] = selected_ref_intervals

# df_feat2 = pd.DataFrame(plot_data)

# fig = px.line(df_feat1, title='features')
# fig.show()

# fig = px.line(df_feat2, title='features')
# fig.show()

### Plot every matching attempt of the last processed file:
### the remaining detection trace against the reference sample that was selected for it.
### Each column is wrapped in pd.Series so that columns of different length are padded
### with NaN; pd.DataFrame raises ValueError for plain lists of different length
### (e.g. a detection that is longer than the reference sample).
for i, det_events in enumerate(all_striped_test_events):
    ref_events = all_ref_events[i]
    print('ref:', ref_events)
    print('det:', det_events)

    ### attempts without a matching reference sample have nothing to compare against
    if ref_events is None:
        continue

    ### events: detection vs. selected reference
    df_feat1 = pd.DataFrame({
        'subseq': pd.Series(det_events),   ### y_data (traces)
        'ref_samples': pd.Series(ref_events),
    })

    ### intervals: detection vs. selected reference
    df_feat2 = pd.DataFrame({
        'intervals': pd.Series(all_striped_test_intervals[i]),   ### y_data (traces)
        'ref_intervals': pd.Series(all_ref_intervals[i]),
    })

    fig = px.line(df_feat1, title='features')
    fig.show()

    fig = px.line(df_feat2, title='features')
    fig.show()
