# Clustering - Fixed Window - Approach 1
- Approach 1 refers to syncing only the first half of traces before anomaly, for feature extraction
- Feature extraction depends on the corresponding normal-behaviour subtrace
- In this approach we only consider detections that contain a single anomaly instance
- We tested this approach across all applications and combinations of multiple applications

In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode

# ############ configuration - trace ################
# ############################################
# For every application, collect the paths of the reference (normal) samples,
# the faulty sub-sequences and their label files, and store them in app_paths.

all_applications = ['theft_protection', 'mamba2', 'lora_ducy']  ###  'theft_protection', 'mamba2', 'lora_ducy'
app_paths = defaultdict(dict)   ### {CODE: {name: path or list of paths}}

BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 3                     ### format of data collection
base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'

################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; whether to use variable window size or not
#####################################################


def _list_dir(dir_path):
    """Return the full paths of the entries of dir_path, sorted by name.

    os.listdir() yields entries in arbitrary order, so the listing is sorted to
    keep every later "take the first match" decision reproducible. Paths that
    contain '.DS_Store' are dropped.
    """
    full_paths = [os.path.join(dir_path, name) for name in sorted(os.listdir(dir_path))]
    return [path for path in full_paths if '.DS_Store' not in path]


for CODE in all_applications:

    normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
    faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

    app_paths[CODE]['base_dir'] = base_dir
    app_paths[CODE]['normalbase_path'] = normalbase_path
    app_paths[CODE]['faultybase_path'] = faultybase_path

    print(normalbase_path)
    print(faultybase_path)

    ref_samples_basepath = os.path.join(normalbase_path, 'diag_refsamples')
    ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
    diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
    subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')

    print('ref_samples_path:\n', ref_samples_basepath)
    print('ref_var_samples_path:\n', ref_var_samples_basepath)
    print('diag_subseq_path:\n', diag_subseq_basepath)

    ######### get paths (normal behaviour) #######################
    ref_samples_path = _list_dir(ref_samples_basepath)
    ref_var_samples_path = _list_dir(ref_var_samples_basepath)
    ### only the entries of the normal directory whose name contains 'varlist'
    train_varlist_path = [x for x in _list_dir(normalbase_path) if 'varlist' in os.path.basename(x)]

    ######### get paths (faulty behaviour) #######################
    paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)
    varlist_path = sorted(x for x in varlist_path if '.DS_Store' not in x)
    paths_label = [x for x in paths_label if '.DS_Store' not in x]

    ### the '.json' filter keeps only files whose path contains '.json'
    test_subseq_path = [x for x in _list_dir(diag_subseq_basepath) if '.json' in x]
    test_labels_path = _list_dir(subseq_label_basepath)

    ### reference samples used for comparison depend on the window mode
    train_data_path = ref_var_samples_path if IS_VAR_WINDOW else ref_samples_path
    test_data_path = test_subseq_path

    print('train_data:\n', train_data_path)
    print(len(train_data_path))
    print('test_data:\n', test_data_path)
    print(len(test_data_path))
    print('test_labels:\n', test_labels_path)

    app_paths[CODE]['train_data_path'] = train_data_path
    app_paths[CODE]['test_data_path'] = test_data_path
    app_paths[CODE]['train_varlist_path'] = train_varlist_path
    app_paths[CODE]['varlist_path'] = varlist_path
    app_paths[CODE]['paths_label'] = paths_label
    app_paths[CODE]['ref_samples_path'] = ref_samples_path
    app_paths[CODE]['ref_var_samples_path'] = ref_var_samples_path
    app_paths[CODE]['test_subseq_path'] = test_subseq_path
    app_paths[CODE]['test_labels_path'] = test_labels_path




In [None]:
# Display the per-application path dictionary built by the configuration loop.
app_paths

In [None]:
# Number of test sub-sequence files. test_data_path is the module-level value
# left by the last loop iteration, i.e. it belongs to the last application only.
len(test_data_path[0:])

## Calculate Feature Vectors

- For a fixed window size, load all the reference samples beforehand
- For a variable window, load the map_len first; then load only the files with a suitable length

### Fixed Window

In [None]:
##########################################################
##########################################################
# Fixed-window feature extraction.
# For every test sub-sequence of every application:
#   1. split the trace into events and inter-event intervals,
#   2. shortlist reference samples whose leading events match the test trace,
#   3. build (event difference, interval difference) vectors against each of them,
#   4. keep the vector with the longest all-zero prefix (ties are narrowed further below).

MAX_TRACE_LEN = 500        ### test traces longer than this are truncated
SYNC_PREFIX_LEN = 5        ### number of leading events that must match a reference sample
INTERVAL_TOLERANCE = 500   ### interval differences within +/- this value count as "same" (based on observation)

all_test_feature_vectors = []   ### [(CODE, test_feature_vectors), (), ...]
### collected once for ALL applications; initialising it inside the loop would
### keep only the entries of the last application
missing_features = []   ### [(test_data, reason), (), (), ...]
for CODE in all_applications:
    train_data_path = app_paths[CODE]['train_data_path']
    test_data_path = app_paths[CODE]['test_data_path']
    ### load all the reference samples (fixed window size)
    ### each sample is indexed below as sample[0] (events) and sample[1] (intervals)
    ref_samples = [read_traces(ref_sample) for ref_sample in train_data_path]

    #########################################################
    #########################################################

    ### load the test samples and compare with the reference samples
    test_feature_vectors = []  ### [(test_data, [(feat1_vector, feat2_vector)]), (), (), ...]
    for test_data in test_data_path:
        print('test_data:', test_data)
        ### read the subseq
        test_trace = read_traces(test_data)
        print('test_trace:', test_trace)
        test_data_len = len(test_trace)
        print('test_data_len:', test_data_len)

        if test_data_len == 0:
            ### nothing to compare; record it instead of failing on test_trace[0]
            print('Empty test trace, skipping:', test_data)
            missing_features.append((test_data, 'Empty test trace'))
            continue

        if test_data_len > MAX_TRACE_LEN:
            print('test data length is more than 500, truncating...')
            test_trace = test_trace[:MAX_TRACE_LEN]
            test_data_len = MAX_TRACE_LEN

        ### transform the test trace from [(var,ts1), (var,ts2), (var, ts3)] to [var1, var2, var3] and [dt1, dt2, dt3]
        ### the first interval is always 0 (timestamp minus itself)
        test_events = [x[0] for x in test_trace]
        timestamps = [x[1] for x in test_trace]
        previous_timestamps = [timestamps[0]] + timestamps[:-1]
        test_intervals = [ts - prev for ts, prev in zip(timestamps, previous_timestamps)]

        assert len(test_events) == len(test_intervals) == test_data_len

        ### shortlist the reference samples whose first events equal those of the test trace,
        ### cut them to the test length and drop samples whose (cut) event sequence was already seen
        shortlisted_ref_samples = []
        seen_event_sequences = set()
        for ref_sample in ref_samples:
            if ref_sample[0][:SYNC_PREFIX_LEN] != test_events[:SYNC_PREFIX_LEN]:
                continue
            ref_events = ref_sample[0][:test_data_len]
            ref_intervals = ref_sample[1][:test_data_len]
            dedup_key = tuple(ref_events)
            if dedup_key in seen_event_sequences:
                continue
            seen_event_sequences.add(dedup_key)
            shortlisted_ref_samples.append((ref_events, ref_intervals))

        if not shortlisted_ref_samples:
            print('No shortlisted ref samples found for the test data:', test_data)
            missing_features.append((test_data, 'No shortlisted ref samples found'))
            continue

        ### Feature generation:
        ### element-wise difference of the events (feat1) and of the intervals (feat2)
        ### between the test trace and every shortlisted reference sample
        shortlisted_features = []
        for sel_ref_event, sel_ref_interval in shortlisted_ref_samples:
            assert (len(sel_ref_event) == len(sel_ref_interval) == test_data_len)

            feat1_vector = np.array([t_ev - r_ev for t_ev, r_ev in zip(test_events, sel_ref_event)])
            interval_diffs = [t_iv - r_iv for t_iv, r_iv in zip(test_intervals, sel_ref_interval)]
            ### interval differences inside the tolerance band are considered equal
            feat2_vector = np.array([
                0 if -INTERVAL_TOLERANCE <= diff <= INTERVAL_TOLERANCE else diff
                for diff in interval_diffs
            ])
            shortlisted_features.append((feat1_vector, feat2_vector))

        ### count leading positions where both the event and the interval are the same
        zero_count = []
        for feat1_vector, feat2_vector in shortlisted_features:
            mismatch_positions = np.flatnonzero((feat1_vector != 0) | (feat2_vector != 0))
            ### counting stops at the first mismatch (part of the logic, do not remove)
            count = int(mismatch_positions[0]) if mismatch_positions.size else len(feat1_vector)
            zero_count.append(count)

        ### select the feature vectors with maximum leading zeros
        max_zero_count = max(zero_count)
        zero_count = np.array(zero_count)
        max_zero_count_ind = np.where(zero_count==max_zero_count)[0]
        feature_vectors = [ shortlisted_features[i] for i in max_zero_count_ind ]

        ### among those, keep the vectors with the MINIMUM number of zero event differences
        ### NOTE(review): the message printed below says "max" while the code selects the
        ### minimum -- confirm which of the two is intended
        total_zero_count = np.array([np.count_nonzero(features[0] == 0) for features in feature_vectors])
        min_total_zero_count = min(total_zero_count)
        min_total_zero_count_ind = np.where(total_zero_count==min_total_zero_count)[0]
        print('files that has max number of total zeros:', min_total_zero_count_ind)
        feature_vector = [ feature_vectors[i] for i in min_total_zero_count_ind ]

        ### select the first feature vector if multiple candidates are still left
        ### (taken from the filtered list so that the filter above is honoured)
        if len(feature_vector) > 1:
            print('multiple feature vectors found, selecting the first one')
            feature_vector = [feature_vector[0]]

        test_feature_vectors.append((test_data, feature_vector))

    print('')
    all_test_feature_vectors.append((CODE, test_feature_vectors))

In [None]:
# Test samples for which no feature vector was generated, as (test_data, reason) tuples.
missing_features

In [None]:
# One (CODE, test_feature_vectors) entry is appended per application.
len(all_test_feature_vectors)

In [None]:
# test_feature_vectors is the module-level value left by the preceding loop,
# so this counts the feature vectors of the last processed application only.
len(test_feature_vectors)

### Plot Trace

In [None]:
# x_axis = np.arange(0, len(test_trace), 1)



### Build one DataFrame per feature type for plotting.
### Uses test_events, test_intervals and feature_vectors as left by the
### feature-extraction cell (values of the last processed test sample).

### events of the test trace next to the first component of every feature vector
plot_data = {
    'subseq': test_events,
    **{f'feat1_{idx}': vec_pair[0] for idx, vec_pair in enumerate(feature_vectors)},
}
df_feat1 = pd.DataFrame(plot_data)

### intervals of the test trace next to the second component of every feature vector
plot_data = {
    'intervals': test_intervals,
    **{f'feat2_{idx}': vec_pair[1] for idx, vec_pair in enumerate(feature_vectors)},
}
df_feat2 = pd.DataFrame(plot_data)


In [None]:
### draw the event-feature frame first, then the interval-feature frame
for feature_frame in (df_feat1, df_feat2):
    fig = px.line(feature_frame, title='features')
    fig.show()

In [None]:
# ### prepare test trace for plotting
# num_trace = []
# time_stamp = []
# for (t, ts) in test_trace:
#     num_trace.extend([t])
#     time_stamp.extend([ts])

# plot_data = dict()
# # plot_data['time'] = time_stamp   ### x_data
# print(len(num_trace))
# plot_data['test_trace'] = num_trace   ### y_data (traces)

# ### prepare ref samples
# samples = [ shortlisted_ref_samples[i] for i in max_zero_count_ind ]

# for i in max_zero_count_ind:
#     print(len(shortlisted_ref_samples[i][0]))
#     plot_data[f'sample{i}'] = shortlisted_ref_samples[i][0]
    
# df_trace = pd.DataFrame(plot_data)

# fig = px.line(df_trace, title='event trace')
# fig.show()

##################################################

### split the test trace into its event values and its timestamps
num_trace = [t for (t, ts) in test_trace]
time_stamp = [ts for (t, ts) in test_trace]

plot_data = {}
# plot_data['time'] = time_stamp   ### x_data
print(len(num_trace))

### reference samples that share the longest matching start with the test trace
samples = [ shortlisted_ref_samples[i] for i in max_zero_count_ind ]

### one figure per selected reference sample
### NOTE(review): plot_data is not reset between iterations, so every figure also
### contains the samples added in earlier iterations -- confirm this is intended
for i, ref_sample in zip(max_zero_count_ind, samples):
    ref_events = ref_sample[0]
    print(len(ref_events))
    plot_data['test_trace'] = num_trace   ### y_data (traces)
    plot_data[f'sample{i}'] = ref_events
    df_trace = pd.DataFrame(plot_data)
    fig = px.line(df_trace, title='event trace')
    fig.show()


In [None]:
# Label-file paths; module-level value left by the last loop iteration
# (i.e. the paths of the last application).
test_labels_path

## Clustering

### Prepare data

In [None]:
# Turn the per-application feature vectors into equally long (padded) vectors
# with one ground-truth class per sample:
#   test_files      - path of the test sub-sequence of each kept sample
#   padded_features - (feat1, feat2) pairs, zero-padded at the end to PADDED_LEN
#   test_class      - class label of each kept sample

PADDED_LEN = 500   ### every feature vector is zero-padded to this length

### hard-coded class labels: application -> {raw anomaly label: class label}
### a raw label that is missing from the table excludes the sample
CLASS_LABEL_MAP = {
    'theft_protection': {1: 1, 2: 2, 3: 3},
    ### 1 (comm failure) is excluded; 2 and 4 are combined since they have the same effect
    'mamba2': {2: 5, 4: 5, 3: 6},
    'lora_ducy': {5: 8},
}

test_files = []
padded_features = []
test_class = []
for CODE, test_feature_vectors in all_test_feature_vectors:
    print('test_feature_vectors:', len(test_feature_vectors))
    test_labels_path = app_paths[CODE]['test_labels_path']
    ### load the labels (the first label file of the application is used)
    test_class_labels = read_json(test_labels_path[0])
    print('test_class_labels:', len(test_class_labels))
    label_map = CLASS_LABEL_MAP.get(CODE, {})

    ### prepare the feature vectors for classification
    for test_data, feature_vector in test_feature_vectors:
        ### labels are keyed by the file name up to its first '.'
        file_name = os.path.basename(test_data).split('.')[0]
        class_list = test_class_labels[file_name]

        print('feature_vector:', np.array(feature_vector).shape)
        print('test_class_label:', test_class_labels[file_name])

        ### only samples with exactly one anomaly instance are considered
        if len(class_list) != 1:
            print('multiple class labels found for the test data:', test_data)
            continue

        class_label = label_map.get(class_list[0])
        if class_label is None:
            continue

        feature_vector = np.array(feature_vector)
        has_multiple_vectors = feature_vector.shape[0] != 1
        for fv in feature_vector:
            if has_multiple_vectors:
                print('feature_vector:', fv.shape)
            feat1 = fv[0]
            feat2 = fv[1]
            pad_num = PADDED_LEN - feat1.shape[0]

            if pad_num > 0:
                feat1 = np.pad(feat1, (0, pad_num), 'constant', constant_values=(0))
                feat2 = np.pad(feat2, (0, pad_num), 'constant', constant_values=(0))

            padded_features.append((feat1, feat2))
            test_files.append(test_data)
            test_class.append(class_label)
                    # print('class', class_label)
        



In [None]:
# Shape of the stacked (feat1, feat2) pairs collected for clustering.
np.array(padded_features).shape

In [None]:
# Number and paths of the test samples kept for clustering.
print(len(test_files))
print(test_files)

In [None]:
# Ground-truth class of every kept sample, followed by the distinct classes.
print(len(test_class))
print(test_class)
print(set(test_class))

In [None]:
### number of samples available for every distinct class label
unique, counts = np.unique(test_class, return_counts=True)
for caption, values in (('unique:', unique), ('counts:', counts)):
    print(caption, values)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

### one cluster per distinct ground-truth class
N_CLUSTER = len(set(test_class))
print('N_CLUSTER:', N_CLUSTER)

### fixed seed: without it the k-means++ initialisation is random and the
### cluster assignment changes from run to run
RANDOM_STATE = 0

data = np.array(padded_features)

# Reshape the data for clustering
num_samples, num_features, num_points = data.shape

# Flatten the feature vectors: (num_samples, num_features, num_points)
# -> (num_samples, num_features * num_points)
data_reshaped = data.reshape(num_samples, num_features * num_points)

# Normalize the data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_reshaped)

# Apply K-Means clustering
kmeans = KMeans(
    init="k-means++",
    max_iter=1000,
    n_clusters=N_CLUSTER,
    n_init='auto',
    random_state=RANDOM_STATE,
)
kmeans.fit(data_normalized)
# Get cluster labels
labels = kmeans.labels_
print('kmeans:', labels)

# dbscan = DBSCAN(eps=0.1, min_samples=2, metric='euclidean', )
# dbscan.fit(data_normalized)
# labels = dbscan.labels_
# print('labels', labels)
# ### map negative labels to positive labels
# unique_labels = np.unique(labels)
# # print('unique_labels:', unique_labels)
# map_labels = dict()
# for i, l in enumerate(unique_labels):
#     map_labels[f'{l}'] = i

# new_labels = [map_labels[f'{l}'] for l in labels]
# # print('new_labels:', new_labels)
# labels = new_labels
# print('dbscan:', labels)

# print('ground_truth', test_class)


In [None]:
# Inspect the fitted model: cluster centres and the number of iterations run.
print(kmeans.cluster_centers_)
print(kmeans.n_iter_)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score, f1_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# # Sample data: Replace with your actual predictions
# kmeans_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1])  # Replace with K-Means predictions
# ground_truth = np.array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1])  # Given ground truth

# NumPy arrays are needed because best_cluster_mapping compares them
# element-wise (y_true == cls, y_pred == cluster).
ground_truth = np.array(test_class)
labels = np.array(labels)

# Map cluster labels to ground truth labels using the Hungarian algorithm
def best_cluster_mapping(y_true, y_pred):
    """Relabel predicted clusters with the best-matching ground-truth classes.

    A one-to-one assignment between clusters and classes that maximises the
    number of agreeing samples is computed with the Hungarian algorithm
    (scipy.optimize.linear_sum_assignment).

    When there are more clusters than classes, the assignment leaves some
    clusters unassigned; each of those is mapped to the class that occurs most
    often inside it, so the result never contains missing values.

    Args:
        y_true: array-like of ground-truth class labels.
        y_pred: array-like of predicted cluster labels, same shape as y_true.

    Returns:
        numpy array with the shape of y_pred in which every cluster label is
        replaced by its mapped class label. Empty input yields an empty array.

    Raises:
        ValueError: if y_true and y_pred have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_pred.size == 0:
        return y_true.copy()

    unique_classes, class_idx = np.unique(y_true.ravel(), return_inverse=True)
    unique_clusters, cluster_idx = np.unique(y_pred.ravel(), return_inverse=True)
    class_idx = class_idx.ravel()
    cluster_idx = cluster_idx.ravel()

    # contingency[i, j] = number of samples of class i that fell into cluster j
    contingency = np.zeros((len(unique_classes), len(unique_clusters)), dtype=np.int64)
    np.add.at(contingency, (class_idx, cluster_idx), 1)

    # Negated because linear_sum_assignment minimises the total cost
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # Start from the majority class of every cluster (fallback for clusters the
    # assignment leaves out), then overwrite with the optimal one-to-one pairs
    cluster_to_class = contingency.argmax(axis=0)
    cluster_to_class[col_ind] = row_ind

    return unique_classes[cluster_to_class[cluster_idx]].reshape(y_pred.shape)

### show the raw inputs of the evaluation
for caption, values in (('ground_truth:', ground_truth), ('labels:', labels)):
    print(caption, values)

### translate cluster ids into the class labels they match best
remapped_labels = best_cluster_mapping(ground_truth, labels)

### evaluation metrics of the remapped prediction against the ground truth
conf_matrix = confusion_matrix(ground_truth, remapped_labels)
f1 = f1_score(ground_truth, remapped_labels, average='weighted')
# nmi = normalized_mutual_info_score(ground_truth, remapped_labels)
ari = adjusted_rand_score(ground_truth, remapped_labels)
accuracy = accuracy_score(ground_truth, remapped_labels)

### report
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Adjusted Rand Index (ARI): {ari:.4f}",
    # f"Normalized Mutual Information (NMI): {nmi:.4f}",
    f"F1 Score: {f1:.4f}",
]
print("\n".join(report_lines))
print("Confusion Matrix:\n", conf_matrix)


In [None]:
### print every remapped prediction next to its ground-truth class
for predicted, actual in zip(remapped_labels, ground_truth):
    print(predicted, actual)

In [None]:
# Visualizing with PCA (if needed)
from sklearn.decomposition import PCA
### project the normalized features onto their first two principal components
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_normalized)

### scatter plot of the projection, coloured by K-Means cluster id
component_1, component_2 = data_pca[:, 0], data_pca[:, 1]
scatter = plt.scatter(component_1, component_2, c=labels, cmap='viridis', edgecolors='k')
# plt.scatter(data_pca[:, 0], data_pca[:, 1], c=ground_truth, cmap='viridis', edgecolors='r', alpha=0.5)

plt.title('K-Means Clustering Visualization')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(scatter, label='Cluster Label')
plt.show()

In [None]:
# Cluster labels after mapping them onto the ground-truth class labels.
remapped_labels

In [None]:
# Ground-truth class labels as a NumPy array.
ground_truth