# Clustering - Fixed Window - Approach 2
- Precisely crop the anomaly from the detections by syncing the subtrace before and after the anomaly w.r.t ref_samples
- To keep the length of the feature vector the same, we pad the features with trailing zeros to get a length of 500 (the maximum length of a detection)
- The feature extraction depends on the corresponding normal-behaviour subtrace
- We tested this approach across all applications


In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode

# ############ configuration - trace ################
# ############################################


CODE = 'theft_protection'       ### application (code)       ###  'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 3                     ### format of data collection

base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
### both behaviours live under the same <code>/<thread>/<version> folder
_version_dir = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}'
normalbase_path = f'{_version_dir}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{_version_dir}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; whether to use variable window size or not

#####################################################


def _list_paths(directory, must_contain=None):
    """Return the sorted full paths of the entries found in *directory*.

    Entries whose path contains '.DS_Store' are dropped. When *must_contain*
    is given, only entries whose file name contains that substring are kept.
    ``os.listdir`` yields entries in arbitrary order, so the result is sorted
    to make the order of the train/test files reproducible between runs.
    """
    selected = []
    for name in os.listdir(directory):
        full_path = os.path.join(directory, name)
        if '.DS_Store' in full_path:
            continue
        if must_contain is not None and must_contain not in name:
            continue
        selected.append(full_path)
    return sorted(selected)


ref_samples_basepath = os.path.join(normalbase_path, 'diag_refsamples')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')


print('ref_samples_path:\n', ref_samples_basepath)
print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

######### get paths (normal behaviour) #######################
ref_samples_path = _list_paths(ref_samples_basepath)
ref_var_samples_path = _list_paths(ref_var_samples_basepath)
train_varlist_path = _list_paths(normalbase_path, must_contain='varlist')

######### get paths (faulty behaviour) #######################
### get_paths comes from libraries.utils; only the '.DS_Store' entries of the
### lists used below are removed, the other returned lists are kept as returned
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)
varlist_path = sorted(x for x in varlist_path if '.DS_Store' not in x)
paths_label = [x for x in paths_label if '.DS_Store' not in x]

### diag_subseq also contains the 'subseq_labels' folder, hence the '.json' filter
test_subseq_path = _list_paths(diag_subseq_basepath, must_contain='.json')
test_labels_path = _list_paths(subseq_label_basepath)

# print(paths_log)
# print(paths_traces)
# print(varlist_path)
# print(paths_label)

### reference samples come from the variable- or fixed-window folder
train_data_path = ref_var_samples_path if IS_VAR_WINDOW else ref_samples_path

test_data_path = test_subseq_path

print('train_data:\n', train_data_path)
print(len(train_data_path))
print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_labels:\n', test_labels_path)




In [None]:
'''
TODO:
1. save ref_samples with window of 10
2. On the detections, slide with window of 10 and sliding interval of 10
3. if the pattern exists in the ref_samples, then it is a normal pattern
4. if the pattern does not exist in the ref_samples, then it is a faulty pattern (increment anomaly count)
 4.1 if pattern does not exist find the most similar pattern in the ref_samples (take the diff and look for max zeros)
5. with the matching ref sample for each window, form a new seq by joining them one after the other
6. This new seq is the new unique ref_sample for that detection, take the difference and get the feature vector
7. split the detection into the number of instances detected, separated by zeros.

Feature extraction and Clustering:
- use the separated instances to extract features
- cluster the features (start with kmeans)
- try the same feature extractors as Approach 1
    - TSFEL
    - seglearn
    - CNN+LSTM
    - Autoencoder
    - our method

 

'''

## Sync the Detection and Ref_Sample

In [None]:
##########################################################
##########################################################

### Read every reference-sample file selected in the configuration cell
### (train_data_path) into memory; one entry per file, in the same order.
ref_samples = [read_traces(sample_path) for sample_path in train_data_path]


#########################################################
#########################################################

### load the test samples and compare with the reference samples
MAX_DETECTION_LEN = 500   ### longer detections are truncated to this many events
SYNC_PREFIX_LEN = 5       ### number of leading events that must equal the reference sample

test_feature_vectors = []  ### [(test_data, (feat1_vector, feat2_vector)), (), (), ...]
missing_features = []   ### [(test_data, missing_feature), (), (), ...]
for test_data in test_data_path[0:]:
    print('test_data:', test_data)
    ### read the subseq
    test_trace = read_traces(test_data)
    print('test_trace:', test_trace)
    test_data_len = len(test_trace)
    print('test_data_len:', test_data_len)

    ### an empty trace has no first timestamp to start from; record it and move on
    if test_data_len == 0:
        print('test data is empty, skipping...')
        missing_features.append((test_data, 'test data is empty'))
        continue

    if test_data_len > MAX_DETECTION_LEN:
        # print('test data length is more than 500, skipping...')
        # missing_features.append((test_data, 'test data length is more than 500'))
        # continue

        print(f'test data length is more than {MAX_DETECTION_LEN}, truncating...')
        test_trace = test_trace[:MAX_DETECTION_LEN]
        test_data_len = MAX_DETECTION_LEN

    ### transform the test trace from [(var,ts1), (var,ts2), (var, ts3)] to [[var1, var2, var3], [ts1, ts2, ts3]]
    test_events = [entry[0] for entry in test_trace]
    timestamps = [entry[1] for entry in test_trace]
    ### interval = time since the previous event; the first event is compared with itself
    test_intervals = [
        current - previous
        for previous, current in zip(timestamps[:1] + timestamps[:-1], timestamps)
    ]

    assert len(test_events) == len(test_intervals) == test_data_len

    ### shortlist the reference samples which has first 5 elements same as the test_trace
    ### each kept sample is cut to the length of the test trace
    ### (index 0 is compared against the events; index 1 presumably holds the intervals - TODO confirm)
    test_prefix = test_events[:SYNC_PREFIX_LEN]
    startsync_ref_samples = [
        (candidate[0][:test_data_len], candidate[1][:test_data_len])
        for candidate in ref_samples
        # print('ref_sample:', ref_sample[0][:5])
        if candidate[0][:SYNC_PREFIX_LEN] == test_prefix
    ]

    print('startsync_ref_samples:', len(startsync_ref_samples))
    print('test_events', test_events)



    ### NOTE: the disabled code below still uses the older name `shortlisted_ref_samples`;
    ### the active code above stores the shortlist in `startsync_ref_samples`.

    ### deduplicate reference samples
    # print('shortlisted_ref_samples:', len(shortlisted_ref_samples))
    # dedup_ref_samples = []
    # _dedup_events = []
    # for ref_sample in shortlisted_ref_samples:
    #     # print('ref_sample:', ref_sample[0])
    #     if ref_sample[0] not in _dedup_events:
    #         dedup_ref_samples.append(ref_sample)
    #         _dedup_events.append(ref_sample[0])
    # # print('dedup_ref_samples:', len(dedup_ref_samples))
    # shortlisted_ref_samples = dedup_ref_samples


    # print('dedup_ref_samples:', len(shortlisted_ref_samples))
    # print('ref_samples:', len(ref_samples))

    # ### generate feature vector for the test_trace with respect to each of the shortlisted_ref_samples
    # '''
    # Feature generation:
    # - take difference of the events and intervals of the test_trace with the shortlisted_ref_samples
    # '''
    # # print('ref samples with matching first 5 events:', np.array(shortlisted_ref_samples).shape)
    # if shortlisted_ref_samples != []:
    #     shortlisted_features = []
    #     feature_vectors = []
    #     for ref_sample in shortlisted_ref_samples:
    #         # print('ref_sample:', ref_sample[1])
    #         sel_ref_event = ref_sample[0][:test_data_len]
    #         sel_ref_interval = ref_sample[1][:test_data_len]
    #         # print('sel_ref_event:', len(sel_ref_event), len(sel_ref_interval))
    #         assert (len(sel_ref_event) == len(sel_ref_interval) == test_data_len)

    #         ### generate feature vector
    #         feat1_vector = []
    #         feat2_vector = []
    #         for i in range(test_data_len):
    #             feat1 = test_events[i] - sel_ref_event[i]
    #             feat2 = test_intervals[i] - sel_ref_interval[i]
    #             ### if the difference in interval is within 500 ms, then consider it as same, as we consider tolerance of 500 ms based on observation
    #             feat2 = [0 if feat2 >= -500 and feat2 <= 500 else feat2 ][0] 
    #             feat1_vector.append(feat1)
    #             feat2_vector.append(feat2)

    #         feat1_vector = np.array(feat1_vector)
    #         feat2_vector = np.array(feat2_vector)
    #         shortlisted_features.append((feat1_vector, feat2_vector))


    #     ### count leading zeros in the feature vector
    #     # print('shortlisted_features:', len(shortlisted_features))
    #     zero_count = []
    #     for sf in shortlisted_features:
    #         count = 0
    #         # print(sf[0], sf[1])
    #         for esf, isf in zip(sf[0], sf[1]):
    #             ### check if events and intervals are same
    #             if esf == 0 and isf == 0:
    #                 count += 1
    #             else:
    #                 break   ### part of the logic, do not remove

    #         # print('zero_count:', count)
    #         zero_count.append(count)

    #     ### select the feature vector with maximum leading zeros
    #     max_zero_count = max(zero_count)
    #     zero_count = np.array(zero_count)
    #     max_zero_count_ind = np.where(zero_count==max_zero_count)[0]
    #     # print('max number of starting events that are same for ref and test:', max_zero_count)
    #     # print('ref samples with highest matching events in the start:', len(max_zero_count_ind))

    #     ### select the feature vectors with maximum leading zeros
    #     feature_vectors = [ shortlisted_features[i] for i in max_zero_count_ind ]

    #     total_zero_count = []
    #     for features in feature_vectors:
    #         # print('feature:', features)
    #         # print('zero_count:', np.where(features[0]==0)[0].shape)
    #         total_zero_count.append(np.where(features[0]==0)[0].shape[0])
    #     # print('total_zero_count:', total_zero_count)
    #     total_zero_count = np.array(total_zero_count)
    #     min_total_zero_count = min(total_zero_count)
    #     min_total_zero_count_ind = np.where(total_zero_count==min_total_zero_count)[0]
    #     # print('the number of highest number of total zeros:', min_total_zero_count)
    #     print('files that has max number of total zeros:', min_total_zero_count_ind)
    #     feature_vector = [ feature_vectors[i] for i in min_total_zero_count_ind ]
    #     # print('feature_vector:', len(feature_vector))

    #     ### select the first feature vector if multiple shortlisted feature vectors are there
    #     # print(np.array(feature_vector).shape)
    #     if np.array(feature_vector).shape[0] > 1:
    #         print('multiple feature vectors found, selecting the first one')

    #         feature_vector = [feature_vectors[0]]

    #     test_feature_vectors.append((test_data, feature_vector))
    # else:
    #     print('No shortlisted ref samples found for the test data:', test_data)
    #     missing_features.append((test_data, 'No shortlisted ref samples found'))

    print('')
    ### stop after the first processed detection: the plotting cell below reads
    ### test_events and the shortlisted reference samples of this one detection
    break
        
        



### Plot Trace

In [None]:
# x_axis = np.arange(0, len(test_trace), 1)



### prepare test_trace for plotting
### one column for the test event sequence plus one column per reference sample
### whose leading events matched it (startsync_ref_samples from the sync cell)
plot_data = dict()
plot_data['subseq'] = test_events   ### y_data (traces)

for i, fv in enumerate(startsync_ref_samples):
    plot_data[f'feat1_{i}'] = fv[0]   ### events of the i-th matching reference sample

### wrapping each column in a Series pads shorter columns with NaN, so a reference
### sample shorter than the test trace does not make the DataFrame constructor fail
df_feat1 = pd.DataFrame({name: pd.Series(values) for name, values in plot_data.items()})

# plot_data = dict()
# plot_data['intervals'] = test_intervals   ### y_data (traces)

# for i, fv in enumerate(feature_vectors):
#     plot_data[f'feat2_{i}'] = fv[1]

# df_feat2 = pd.DataFrame(plot_data)


In [None]:
### draw every column of df_feat1 as its own line in one interactive figure
fig = px.line(data_frame=df_feat1, title='features')
fig.show()

# fig = px.line(df_feat2, title='features')
# fig.show()