# Clustering - Fixed Window - Approach 1 Method 2
- uses the affected events based on exe_int as features

__Comments__
- Results are stored in to_discuss/folder 9-
- works well for comm and some sensor detections, where the same events are always affected in the same order
- for some sensor detections, the events are affected in a different order, so they do not lead to a match (even with multithresh)
- it obviously fails for detections with multiple instances.

In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode

# ############ configuration - trace ################
# The values below are joined into the trace directory layout
#   <base_dir>/<CODE>/<THREAD>_thread/version_<VER>/<behaviour>
# ############################################


CODE = 'theft_protection'       ### application (code)       ###  'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 4                     ### format of data collection
WINDOW = 500                ### selects the 'diag_refsamples<WINDOW>' directory
SUBSEQ = 'diag_subseq_multi'        # 'diag_subseq'        ### subsequence type, diag_subseq, subseq


base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; whether to use variable window size or not

#####################################################


def _list_paths(directory):
    """Return the full paths of all entries in ``directory`` in sorted order.

    ``os.listdir`` returns entries in arbitrary order. Sorting makes every run
    see the files in the same order, which matters because two of the lists
    built below are later paired element by element. Paths containing
    '.DS_Store' (macOS folder metadata) are dropped.
    """
    full_paths = (os.path.join(directory, name) for name in os.listdir(directory))
    return sorted(path for path in full_paths if '.DS_Store' not in path)


ref_samples_basepath = os.path.join(normalbase_path, f'diag_refsamples{WINDOW}')
# ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/subseq')
diag_el_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/el')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')


print('ref_samples_path:\n', ref_samples_basepath)
# print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

######### get paths (normal behaviour) #######################
ref_samples_path = _list_paths(ref_samples_basepath)
# ref_var_samples_path = _list_paths(ref_var_samples_basepath)

### only entries whose file name contains 'varlist'
train_varlist_path = [
    path for path in _list_paths(normalbase_path)
    if 'varlist' in os.path.basename(path)
]

######### get paths (faulty behaviour) #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

### keep only '.json' entries; this also skips the 'subseq_labels' sub-directory
test_subseq_path = [path for path in _list_paths(diag_subseq_basepath) if '.json' in path]
test_affected_el_path = _list_paths(diag_el_basepath)
test_labels_path = _list_paths(subseq_label_basepath)

### remove .DS_Store from the lists returned by get_paths
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

varlist_path.sort()

# print(paths_log)
# print(paths_traces)
# print(varlist_path)
# print(paths_label)

if IS_VAR_WINDOW:
    # train_data_path = ref_var_samples_path
    raise ValueError('Variable window size not implemented')
else:
    train_data_path = ref_samples_path

test_data_path = test_subseq_path

### the subsequence files and the affected-event files are zipped together
### later on; zip stops silently at the shorter list, so flag a mismatch here
if len(test_data_path) != len(test_affected_el_path):
    print('WARNING: number of subseq files and affected-event files differ:',
          len(test_data_path), len(test_affected_el_path))

print('train_data:\n', train_data_path)
print(len(train_data_path))
print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_affected_el_path', test_affected_el_path)
print('test_labels:\n', test_labels_path)




In [None]:
### number of test subsequence files that were found
len(test_data_path)

## Calculate Feature Vectors

- For a fixed window size, load all the reference samples beforehand
- For a variable window, load the map_len first; then load only the files with a suitable length

### Fixed Window

In [None]:
# ##########################################################
# ##########################################################
    
# ### load all the reference samples (fixed window size)
# ref_samples = []
# for ref_sample in train_data_path:
#     ref_samples.append(read_traces(ref_sample))


# #########################################################
# #########################################################

# ### load the test samples and compare with the reference samples
# test_feature_vectors = []  ### [(test_data, (feat1_vector, feat2_vector)), (), (), ...]
# missing_features = []   ### [(test_data, missing_feature), (), (), ...]
# for test_data in test_data_path[0:]:
#     print('test_data:', test_data)
#     ### read the subseq
#     test_trace = read_traces(test_data)
#     print('test_trace:', test_trace)
#     test_data_len = len(test_trace)
#     print('test_data_len:', test_data_len)

#     if test_data_len > 500:
#         # print('test data length is more than 500, skipping...')
#         # missing_features.append((test_data, 'test data length is more than 500'))
#         # continue

#         print('test data length is more than 500, truncating...')
#         test_trace = test_trace[:500]
#         test_data_len = 500
    
#     ### transform the test trace from [(var,ts1), (var,ts2), (var, ts3)] to [[var1, var2, var3], [ts1, ts2, ts3]]
#     test_events = []
#     test_intervals = []
#     # prev_time = test_trace[0][1]
#     # time_diff = 0
#     # for x in test_trace:
#     #     time_diff = x[1] - prev_time
#     #     test_intervals.append(time_diff)
#     #     prev_time = x[1]
#     #     test_events.append(x[0])
#     for x,y in zip(test_trace[:-1], test_trace[1:]):
#         test_events.append(x[0])
#         test_intervals.append(y[1] - x[1])

#     # print(len(test_events), len(test_intervals), test_data_len)
#     assert len(test_events) == len(test_intervals) == test_data_len-1

#     ### shortlist the reference samples which has first 5 elements same as the test_trace
#     shortlisted_ref_samples = []
#     for ref_sample in ref_samples:
#         # print('ref_sample:', ref_sample[0][:5])
#         if ref_sample[0][:5] == test_events[:5]:
#             ref_sample = (ref_sample[0][:test_data_len], ref_sample[1][:test_data_len])
#             shortlisted_ref_samples.append(ref_sample)

#     ### deduplicate reference samples
#     # print('shortlisted_ref_samples:', len(shortlisted_ref_samples))
#     dedup_ref_samples = []
#     _dedup_events = []
#     for ref_sample in shortlisted_ref_samples:
#         # print('ref_sample:', ref_sample[0])
#         if ref_sample[0] not in _dedup_events:
#             dedup_ref_samples.append(ref_sample)
#             _dedup_events.append(ref_sample[0])
#     # print('dedup_ref_samples:', len(dedup_ref_samples))
#     shortlisted_ref_samples = dedup_ref_samples
                

#     ### generate feature vector for the test_trace with respect to each of the shortlisted_ref_samples
#     '''
#     Feature generation:
#     - take difference of the events and intervals of the test_trace with the shortlisted_ref_samples
#     '''
#     # print('ref samples with matching first 5 events:', np.array(shortlisted_ref_samples).shape)
#     if shortlisted_ref_samples != []:
#         shortlisted_features = []
#         feature_vectors = []
#         for ref_sample in shortlisted_ref_samples:
#             # print('ref_sample:', ref_sample[1])
#             sel_ref_event = ref_sample[0][:test_data_len-1]
#             sel_ref_interval = ref_sample[1][:test_data_len-1]
#             # print('sel_ref_event:', len(sel_ref_event), len(sel_ref_interval))
#             print(len(sel_ref_event), len(sel_ref_interval), test_data_len)
#             assert (len(sel_ref_event) == len(sel_ref_interval) == test_data_len)-1

#             ### generate feature vector
#             feat1_vector = []
#             feat2_vector = []
#             for i in range(len(sel_ref_event)):
#                 feat1 = test_events[i] - sel_ref_event[i]
#                 feat2 = test_intervals[i] - sel_ref_interval[i]
#                 ### if the difference in interval is within 500 ms, then consider it as same, as we consider tolerance of 500 ms based on observation
#                 feat2 = [0 if feat2 >= -500 and feat2 <= 500 else feat2 ][0] 
#                 feat1_vector.append(feat1)
#                 feat2_vector.append(feat2)

#             feat1_vector = np.array(feat1_vector)
#             feat2_vector = np.array(feat2_vector)
#             shortlisted_features.append((feat1_vector, feat2_vector))

        
#         ### count leading zeros in the feature vector
#         # print('shortlisted_features:', len(shortlisted_features))
#         zero_count = []
#         for sf in shortlisted_features:
#             count = 0
#             # print(sf[0], sf[1])
#             for esf, isf in zip(sf[0], sf[1]):
#                 ### check if events and intervals are same
#                 # if esf == 0 and isf == 0:
#                 if esf == 0:
#                     count += 1
#                 else:
#                     break   ### part of the logic, do not remove

#             # print('zero_count:', count)
#             zero_count.append(count)

#         ### select the feature vector with maximum leading zeros
#         max_zero_count = max(zero_count)
#         zero_count = np.array(zero_count)
#         max_zero_count_ind = np.where(zero_count==max_zero_count)[0]
#         # print('max number of starting events that are same for ref and test:', max_zero_count)
#         # print('ref samples with highest matching events in the start:', len(max_zero_count_ind))

#         ### select the feature vectors with maximum leading zeros
#         feature_vectors = [ shortlisted_features[i] for i in max_zero_count_ind ]

#         total_zero_count = []
#         for features in feature_vectors:
#             # print('feature:', features)
#             # print('zero_count:', np.where(features[0]==0)[0].shape)
#             total_zero_count.append(np.where(features[0]==0)[0].shape[0])
#         # print('total_zero_count:', total_zero_count)
#         total_zero_count = np.array(total_zero_count)
#         min_total_zero_count = min(total_zero_count)
#         min_total_zero_count_ind = np.where(total_zero_count==min_total_zero_count)[0]
#         # print('the number of highest number of total zeros:', min_total_zero_count)
#         print('files that has max number of total zeros:', min_total_zero_count_ind)
#         feature_vector = [ feature_vectors[i] for i in min_total_zero_count_ind ]
#         # print('feature_vector:', len(feature_vector))

#         ### select the first feature vector if multiple shortlisted feature vectors are there
#         # print(np.array(feature_vector).shape)
#         if np.array(feature_vector).shape[0] > 1:
#             print('multiple feature vectors found, selecting the first one')

#             feature_vector = [feature_vectors[0]]

#         test_feature_vectors.append((test_data, feature_vector))
#     else:
#         print('No shortlisted ref samples found for the test data:', test_data)
#         missing_features.append((test_data, 'No shortlisted ref samples found'))
        
        



In [None]:
##### Method 2: the affected-event list of every subsequence is used as its feature
test_feature_vectors = []  ### [(test_data, [feat_events, feat_intervals]), (), (), ...]
for test_data, affected_events in zip(test_data_path, test_affected_el_path):
    print('test_data:', test_data)

    ### the subsequence file is still loaded, but only its length is taken;
    ### the features below come from the affected-event file alone
    test_data_len = len(read_traces(test_data))

    ### split every entry into its first and second field
    ### (presumably the event id and its execution interval -- confirm against the el files)
    feat_events, feat_intervals = [], []
    for entry in read_traces(affected_events):
        print('x:', entry)
        feat_events += [entry[0]]
        feat_intervals += [entry[1]]

    print('feat_events:', feat_events)
    print('feat_intervals:', feat_intervals)

    test_feature_vectors.append((test_data, [feat_events, feat_intervals]))

    print('')

In [None]:
### Show the (test file, [events, intervals]) pairs collected by the Method 2 loop.
test_feature_vectors

In [None]:
### Show the label file paths listed from the 'subseq_labels' directory.
test_labels_path

## Prepare data

In [None]:
def get_detection_labels(test_labels_path, test_data_path):
    """Map every test file path to its entry in the label file.

    Only the first path in ``test_labels_path`` is read. Each test file is
    looked up by its file name cut at the first '.', i.e. the last
    '/'-separated component of the path without anything after that dot.

    Args:
        test_labels_path: list of label file paths; element 0 is loaded with
            ``read_json``.
        test_data_path: iterable of test file paths.

    Returns:
        dict mapping each path in ``test_data_path`` to the value stored under
        its file name in the label file.

    Raises:
        IndexError: if ``test_labels_path`` is empty.
        KeyError: if a test file has no entry in the label file.
    """
    label_lookup = read_json(test_labels_path[0])
    return {
        data_path: label_lookup[data_path.split('/')[-1].split('.')[0]]
        for data_path in test_data_path
    }


In [None]:
### Show the label file paths; only the first one is read by get_detection_labels.
test_labels_path

In [None]:
### Map every test file path to its entry in the label file.
### NOTE: the name test_class is rebound to a plain list by the feature-preparation cell further down.
test_class = get_detection_labels(test_labels_path, test_data_path)

In [None]:
### Show the path -> labels mapping returned by get_detection_labels.
test_class

In [None]:
### every feature row is zero-padded (or cut) to exactly this many entries
PAD_LEN = 500

test_files = []         ### path of the test file behind each feature row
padded_features = []    ### one fixed-length event vector per test file
test_class = []         ### label list of each row (rebinds the name test_class to a list)

### load the labels; only the first label file is read
test_class_labels = read_json(test_labels_path[0])
# print('test_class_labels:', len(test_class_labels))
# print('test_class_labels:', test_class_labels)

### prepare the feature vectors for clustering
for test_data, feature_vector in test_feature_vectors:
    ### the label file is looked up by the file name cut at its first '.'
    file_name = test_data.split('/')[-1].split('.')[0]
    class_label = test_class_labels[file_name]

    ### single- and multi-label entries are kept as they are; only a missing
    ### (null) label causes the sample to be skipped
    if class_label is None:
        continue

    feature_vector = np.array(feature_vector)
    print('feature_vector:', feature_vector.shape)
    print(feature_vector[0])
    print(feature_vector[1])

    ### a valid feature vector has exactly two rows: [events, intervals]
    if feature_vector.shape[0] != 2:
        print('feature_vector shape incorrect:', feature_vector.shape)
        continue

    ### only the event row is used as the clustering feature
    feat1 = feature_vector[0]
    pad_num = PAD_LEN - feat1.shape[0]

    if pad_num > 0:
        feat1 = np.pad(feat1, (0, pad_num), 'constant', constant_values=(0))
    elif pad_num < 0:
        ### rows of different length cannot be stacked into one feature matrix
        print('feature vector longer than', PAD_LEN, 'entries, truncating:', test_data)
        feat1 = feat1[:PAD_LEN]

    padded_features.append(feat1)
    test_files.append(test_data)
    test_class.append(class_label)
    # print('class', class_label)
    

In [None]:
### Inspect every prepared sample: its file, its label list and the first
### 50 entries of its padded feature row.
for feat_row, file_path, label_list in zip(padded_features, test_files, test_class):
    print('test_file', file_path)
    print('test_class', label_list)
    print('padded_features', feat_row[:50])

In [None]:
### Show the label lists collected for the padded feature rows.
test_class

In [None]:
### Sanity check: features, files and labels must contain the same number of rows.
for collection in (padded_features, test_files):
    print(np.array(collection).shape)
print(len(test_class))
len(test_class)
np.array(test_files).shape

## Clustering

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Inputs expected from the cells above:
#   padded_features : one fixed-length feature row per test file
#   test_files      : file path belonging to each row
#   test_class      : label list belonging to each row

# Standardise every feature column before distances are measured
scaler = StandardScaler()
padded_features_normalized = scaler.fit_transform(padded_features)

# Fit DBSCAN and take the cluster label of every row in one step
dbscan = DBSCAN(eps=0.5, min_samples=2, metric="euclidean")
cluster_labels = dbscan.fit_predict(padded_features_normalized)

# Bucket the file names, labels and raw (un-normalised) features by cluster id
clusters = {}
for row, cluster in enumerate(cluster_labels):
    bucket = clusters.setdefault(cluster, {"files": [], "labels": [], "features": []})
    bucket["files"].append(test_files[row])
    bucket["labels"].append(test_class[row])
    bucket["features"].append(padded_features[row])

# Summary: size of every cluster (id -1 is DBSCAN's noise label)
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    print(f"  Number of files: {len(cluster_data['files'])}")


# Detail: every member with its labels and the first 50 feature entries
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    members = zip(cluster_data['files'], cluster_data['labels'], cluster_data['features'])
    for member_file, member_label, member_feat in members:
        print(member_file)
        print(member_label)
        print(member_feat[:50])
        print('')
    print('')

In [None]:
# Flatten the labels by joining them into a single string per instance,
# so that a multi-label sample becomes one combined class name
ground_truth = np.array(["_".join(map(str, labels)) for labels in test_class])

# Convert flattened labels to numeric format.
# The unique class names are sorted so that the class -> integer mapping is
# the same on every run; iterating a set of strings directly gives an order
# that can change between interpreter sessions.
unique_classes = sorted(set(ground_truth))  # Get unique class labels
class_to_int = {cls: idx for idx, cls in enumerate(unique_classes)}  # Map each class to an integer
ground_truth_numeric = np.array([class_to_int[cls] for cls in ground_truth])


In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

# Apply DBSCAN clustering (re-fit so the labels match the current features)
dbscan.fit(padded_features_normalized)

# Get cluster labels
labels = dbscan.labels_

# Filter out noise points (label -1 indicates noise in DBSCAN).
# All metrics below therefore describe only the samples that were assigned
# to a cluster; samples rejected as noise do not lower the scores.
filtered_indices = labels != -1
print(filtered_indices)
filtered_labels = labels[filtered_indices]
filtered_ground_truth = ground_truth_numeric[filtered_indices]

# Evaluation Metrics
if filtered_labels.size == 0:
    # nothing was clustered, so there is nothing to compare
    print('All samples were labelled as noise; clustering metrics are undefined')
    ari = nmi = silhouette = float('nan')
    homogeneity = completeness = v_measure = float('nan')
else:
    ari = adjusted_rand_score(filtered_ground_truth, filtered_labels)
    nmi = normalized_mutual_info_score(filtered_ground_truth, filtered_labels)
    homogeneity = homogeneity_score(filtered_ground_truth, filtered_labels)
    completeness = completeness_score(filtered_ground_truth, filtered_labels)
    v_measure = v_measure_score(filtered_ground_truth, filtered_labels)

    # silhouette_score raises unless 2 <= number of clusters <= n_samples - 1
    n_found_clusters = len(set(filtered_labels.tolist()))
    if 2 <= n_found_clusters <= filtered_labels.size - 1:
        silhouette = silhouette_score(padded_features_normalized[filtered_indices], filtered_labels)
    else:
        print('Silhouette score is undefined for', n_found_clusters,
              'cluster(s) over', filtered_labels.size, 'sample(s)')
        silhouette = float('nan')

# Print results
# print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"Silhouette Score: {silhouette:.4f}")
print(f"Homogeneity: {homogeneity:.4f}")
print(f"Completeness: {completeness:.4f}")
print(f"V-Measure: {v_measure:.4f}")