# Clustering - Fixed Window - DL Extractor
- Use Deep Learning based extractors to get abstract features by giving the entire detection subseq as input
- Use these features to cluster the detections with similar anomalies
- The feature extraction is NOT dependent on the corresponding normal behaviour subtrace
- This will serve as a benchmark of performance with off-the-shelf models, without any optimization
- We tested this approach across all applications

In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode
from sklearn import preprocessing

import tensorflow as tf
import TSFEDL.models_keras as tsfedl
import joblib


# ############ configuration - trace ################
# ############################################
# The constants below select which recorded traces are analysed. CODE, THREAD
# and VER are interpolated into the normal / faulty base directories built at
# the end of this section; CODE and VER are also part of the trained-model and
# scaler file names loaded by the feature-extraction cell.


CODE = 'theft_protection'       ### application (code)       ###  'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 4                     ### format of data collection
WINDOW = 500                ### window size (used in the 'diag_refsamples{WINDOW}' directory name)
SUBSEQ =  'diag_subseq'        # 'diag_subseq' , diag_subseq_multi       ### subsequence type, diag_subseq, subseq
INST_SEP = 'M2'             ###  M2, M3 (selects the diag_sepAP2_m2 / diag_sepAP2_m3 directory)
EXTRACTOR = 'forecaster'     #### 'forecaster', 'autoencoder' (selects which trained model is loaded)


base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
# Base directories of the normal and faulty recordings for the selected
# application, threading mode and data-format version.
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; whether to use a variable window size (only False is implemented)

#####################################################


def _listdir_full(directory):
    """Return every entry of *directory* joined onto the directory path."""
    return [os.path.join(directory, entry) for entry in os.listdir(directory)]


def _drop_ds_store(paths):
    """Return *paths* without the entries whose path contains '.DS_Store'."""
    return [p for p in paths if '.DS_Store' not in p]


######### base directories #######################
ref_samples_basepath = os.path.join(normalbase_path, f'diag_refsamples{WINDOW}')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/subseq')
diag_el_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/el')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')
test_labels_basepath = os.path.join(faultybase_path, 'labels')

######### directory listings (normal data) #######################
ref_samples_path = _listdir_full(ref_samples_basepath)
train_varlist_path = [
    os.path.join(normalbase_path, name)
    for name in os.listdir(normalbase_path)
    if 'varlist' in name
]

######### directory listings (faulty data) #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

test_subseq_path = _listdir_full(diag_subseq_basepath)
test_el_path = _listdir_full(diag_el_basepath)
test_labels_path = _listdir_full(subseq_label_basepath)
eval_labels_path = _listdir_full(test_labels_basepath)

######### strip macOS '.DS_Store' artefacts from every list #######################
train_varlist_path = _drop_ds_store(train_varlist_path)
varlist_path = _drop_ds_store(varlist_path)
paths_label = _drop_ds_store(paths_label)
ref_samples_path = _drop_ds_store(ref_samples_path)
# The subsequence directory also holds non-JSON entries; keep JSON paths only.
test_subseq_path = [p for p in _drop_ds_store(test_subseq_path) if '.json' in p]
# test_el_path itself is left unfiltered; the cleaned copy is test_feature_path.
test_feature_path = _drop_ds_store(test_el_path)
test_labels_path = _drop_ds_store(test_labels_path)
eval_labels_path = _drop_ds_store(eval_labels_path)

varlist_path.sort()

######### select train / test inputs #######################
if IS_VAR_WINDOW:
    raise ValueError('Variable window size not implemented yet')
train_data_path = ref_samples_path
test_data_path = test_subseq_path

print(len(train_data_path))
print(len(test_data_path))
print('test_labels:\n', test_labels_path)
print('eval_labels:\n', eval_labels_path)



In [None]:
#### Locate the separated-AP2 detections for the selected instance-separation
#### method (INST_SEP).

# Supported methods and the data directory each one reads from.
_SEPAP2_DIRS = {'M2': 'diag_sepAP2_m2', 'M3': 'diag_sepAP2_m3'}
if INST_SEP not in _SEPAP2_DIRS:
    # Without this guard an unknown value skipped both branches and only
    # failed later with a NameError on path_diag_sepAP2_m2.
    raise ValueError(f"Unsupported INST_SEP {INST_SEP!r}; expected 'M2' or 'M3'")
_sepap2_dir = _SEPAP2_DIRS[INST_SEP]

# NOTE: the variable names keep their "_m2" suffix for both methods because
# later cells read them under these names.
path_diag_sepAP2_m2 = os.path.join(faultybase_path, f'{_sepap2_dir}/subseq')
path_diag_sepAP2_m2_feature = os.path.join(faultybase_path, f'{_sepap2_dir}/feature')
path_labels_diag_sepAP2_m2 = os.path.join(path_diag_sepAP2_m2, 'subseq_labels')
print(f'path_{_sepap2_dir}:', path_diag_sepAP2_m2)
print(f'path_labels_{_sepap2_dir}:', path_labels_diag_sepAP2_m2)

# os.listdir returns entries in arbitrary order; sort so that index-based
# access (labels_sepap2[0]) picks the same file on every run and machine.
files_sepap2 = [
    os.path.join(path_diag_sepAP2_m2, x)
    for x in sorted(os.listdir(path_diag_sepAP2_m2))
    if '.DS_Store' not in x    ### remove .DS_Store
]
files_sepap2 = [x for x in files_sepap2 if os.path.isfile(x)]    ### drop sub-directories

feature_sepap2 = [
    os.path.join(path_diag_sepAP2_m2_feature, x)
    for x in sorted(os.listdir(path_diag_sepAP2_m2_feature))
    if '.DS_Store' not in x    ### remove .DS_Store
]

labels_sepap2 = [
    os.path.join(path_labels_diag_sepAP2_m2, x)
    for x in sorted(os.listdir(path_labels_diag_sepAP2_m2))
    if '.DS_Store' not in x    ### remove .DS_Store
]
labels_sepap2 = [x for x in labels_sepap2 if os.path.isfile(x)]

# For mamba2 with M2 separation the label file name must end in '_m.json';
# anything else aborts with 'Labels not updated'.
if CODE == 'mamba2' and INST_SEP == 'M2':
    _check_label = os.path.basename(labels_sepap2[0]).split('_')[-1]
    print('_check_label:', _check_label)
    if _check_label != 'm.json':
        raise ValueError('Labels not updated')
    print('Updated Labels')

In [None]:
##### Get file names for desired class
### class = 0, 1, 2; -1 for all classes
CLASS = -1

#### Load labels for the separated-AP2 subsequences (first label file found)
labels_dict = read_json(labels_sepap2[0])
print('labels_dict:', labels_dict)

key_names = list(labels_dict.keys())

# Parallel lists: subsequence file, its class, and its feature file.
sel_ap2_files = []
sel_ap2_classes = []
sel_ap2_features = []
for km in key_names:
    print('km:', km)
    # The first element of each label entry is used as the class id.
    _class = labels_dict[km][0]
    print('class:', _class)

    ### exclude samples with class -1
    if _class == -1:
        continue
    ### keep only the requested class unless all classes were requested
    if CLASS != -1 and _class != CLASS:
        continue

    _file_name = os.path.join(path_diag_sepAP2_m2, km+'.json')
    _feature_name = os.path.join(path_diag_sepAP2_m2_feature, km+'.json')
    print('file:', _file_name)

    ############# a labelled subsequence without its file is a data error
    if not os.path.isfile(_file_name):
        # Build one message string; passing the path as a second argument
        # made the exception text render as a tuple.
        raise ValueError(f'File not found: {_file_name}')

    print('file exists')
    sel_ap2_files.append(_file_name)
    sel_ap2_classes.append(_class)
    sel_ap2_features.append(_feature_name)



## Feature Extraction with TSFE-DL


In [None]:
#################################################################################################
####################################### Select Extractor ########################################
#################################################################################################
# Loads the trained Keras model chosen by EXTRACTOR and wraps it in `new_model`,
# a truncated model whose output is an intermediate layer used as the feature
# vector by the extraction loop.

print('Selected Extractor:', EXTRACTOR)

if EXTRACTOR == 'forecaster':
    ### load the model
    # Checkpoints trained on other data that were used here before:
    #   forecaster_events_minmax_theft_protection.keras
    #   forecaster_events_minmax_mamba2.keras
    #   forecaster_events_minmax_mamba+theft.keras
    model_path = f'./trained_models/forecaster_events_minmax_{CODE}_V{VER}.keras'
    print('model_path:', model_path)
    model = tf.keras.models.load_model(model_path)

    # Features are taken from the fifth-from-last layer.
    # NOTE(review): the index is tied to the saved architecture - confirm when retraining.
    new_model = tf.keras.Model(inputs=model.input, outputs=model.layers[-5].output, name='forecaster')
    new_model.summary()
elif EXTRACTOR == 'autoencoder':
    # A checkpoint trained on other data that was used here before:
    #   autoencoder_events_minmax_mamba+theft.keras
    model_path = f'./trained_models/autoencoder_events_minmax_{CODE}_V{VER}.keras'
    print('model_path:', model_path)
    model = tf.keras.models.load_model(model_path)

    ### get output of the encoder as features (flattened to one vector per input)
    # NOTE(review): layers[-8] is assumed to be the encoder output - confirm when retraining.
    x = model.layers[-8].output
    output = tf.keras.layers.Flatten()(x)
    new_model = tf.keras.Model(inputs=model.input, outputs=output, name='autoencoder')
    new_model.summary()
else:
    # Previously an unknown value left `new_model` undefined and the notebook
    # failed later with an unrelated NameError.
    raise ValueError(f"Unsupported EXTRACTOR {EXTRACTOR!r}; expected 'forecaster' or 'autoencoder'")


# Extract one feature vector per selected detection subsequence:
#   event ids -> zero-pad to whole 50-event chunks -> min-max scale ->
#   truncated model (`new_model`) -> flattened activations.
if EXTRACTOR not in ('forecaster', 'autoencoder'):
    raise ValueError(f"Unsupported EXTRACTOR {EXTRACTOR!r}; expected 'forecaster' or 'autoencoder'")

# Number of events per model input; the reshape calls below use this width.
_CHUNK_LEN = 50

# The scaler does not depend on the file, so load it once instead of once per iteration.
scaler_events = joblib.load(f"./scalers/minmaxscaler_{CODE}_V{VER}.gz")

sel_features = []
test_files = []
for file, cls in zip(sel_ap2_files, sel_ap2_classes):
    print('file:', file)
    det_subseq = read_json(file)

    ###### store event ids from detected subseq to use as features
    # The first element of every event record is used as the event id.
    _sel_subseq = np.array([event[0] for event in det_subseq])
    print('subseq:', _sel_subseq)
    print('cls:', cls)

    test_events = _sel_subseq

    ### zero-pad so the length is a whole number of chunks (at least one).
    ### Padding only up to 50 made any longer subsequence whose length is not
    ### a multiple of 50 fail in the reshape below.
    _n_events = len(test_events)
    _n_chunks = max(1, -(-_n_events // _CHUNK_LEN))    # ceiling division
    pad_len = _n_chunks * _CHUNK_LEN - _n_events
    if pad_len > 0:
        test_events = np.pad(test_events, (0, pad_len), 'constant', constant_values=0)

    test_events = scaler_events.transform(np.array(test_events).reshape(-1, _CHUNK_LEN))
    print(np.array(test_events))

    ### both extractors take (n_chunks, 50, 1) input and are used the same way
    _test_events = np.array(test_events).reshape(-1, _CHUNK_LEN, 1)
    feat_single = new_model.predict(_test_events)
    # NOTE(review): flattening concatenates the per-chunk outputs, so a
    # subsequence spanning more chunks yields a longer vector - confirm the
    # clustering step receives equal-length features.
    feat_single = feat_single.flatten()

    ### replace NaN / +-inf so the clustering step receives finite values only
    feat_single = np.array(feat_single)
    feat_single = np.nan_to_num(feat_single, nan=0.0, posinf=0.0, neginf=0.0)
    sel_features.append(feat_single)
    test_files.append(file)

    



In [None]:
# Number of extracted feature vectors (the extraction loop appends one per selected file).
len(sel_features)

## Clustering


In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Example variables
# padded_features = np.random.rand(x, 500)  # Replace with your actual feature matrix
# test_files = ["file1", "file2", ..., "fileX"]  # Replace with your actual file names
# test_class = ["label1", "label2", ..., "labelX"]  # Replace with your actual labels

# Standardise the features with StandardScaler before distance-based clustering.
scaler = StandardScaler()
padded_features_normalized = scaler.fit_transform(sel_features)

# Density-based clustering on the standardised features (fit returns the estimator itself).
dbscan = DBSCAN(eps=0.5, min_samples=2, metric="euclidean").fit(padded_features_normalized)

# One cluster id per sample.
cluster_labels = dbscan.labels_

# Collect the files, class labels and raw features that landed in each cluster.
clusters = {}
for cluster, member_file, member_class, member_feat in zip(
    cluster_labels, sel_ap2_files, sel_ap2_classes, sel_features
):
    bucket = clusters.setdefault(cluster, {"files": [], "labels": [], "features": []})
    bucket["files"].append(member_file)
    bucket["labels"].append(member_class)
    bucket["features"].append(member_feat)

# Summary: size of every cluster.
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    print(f"  Number of files: {len(cluster_data['files'])}")


# Detail: every member file with its ground-truth class, grouped by cluster.
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    members = zip(cluster_data['files'], cluster_data['labels'], cluster_data['features'])
    for member_file, member_class, _member_feat in members:
        print(member_file)
        print(member_class)
        print('')
    print('')

In [None]:
# Turn every class label into a single string per instance.
# A scalar label becomes str(label); a list/tuple of labels is joined with "_".
# (Joining over str(label) inserted "_" between the characters of the repr,
# e.g. 12 -> "1_2" and [1, 2] -> "[_1_,_ _2_]".)
ground_truth = np.array([
    "_".join(map(str, cls)) if isinstance(cls, (list, tuple)) else str(cls)
    for cls in sel_ap2_classes
])

# Convert the string labels to integers.
# sorted() keeps the class -> int mapping identical across runs; iterating a
# plain set of strings has no stable order between interpreter sessions.
unique_classes = sorted(set(ground_truth))  # Unique class labels, deterministic order
class_to_int = {cls: idx for idx, cls in enumerate(unique_classes)}  # Map each class to an integer
ground_truth_numeric = np.array([class_to_int[cls] for cls in ground_truth])
int_to_class = {v: k for k, v in class_to_int.items()}  # Reverse mapping


In [None]:
# Display the integer -> class-label mapping built in the previous cell.
int_to_class

In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score, adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from collections import Counter


def map_clusters_to_ground_truth(labels, ground_truth):
    """Assign each cluster the ground-truth label that is most frequent in it.

    Args:
        labels: NumPy array with one cluster id per sample.
        ground_truth: NumPy array with one ground-truth label per sample,
            aligned with ``labels``.

    Returns:
        dict mapping every distinct cluster id to its representative label.
        Several clusters may share a label. A cluster whose most frequent
        label occurs only once keeps that label only if no earlier cluster
        (in sorted cluster-id order) has claimed it; otherwise it maps to -1.
    """
    cluster_to_label = {}
    claimed_labels = set()

    # np.unique yields the cluster ids in ascending order, which fixes the
    # order in which labels get claimed.
    for cluster_id in np.unique(labels):
        members = ground_truth[labels == cluster_id]
        top_label, top_count = Counter(members).most_common(1)[0]

        # A lone occurrence of an already-claimed label is not enough to
        # represent a cluster: mark the cluster as unassigned.
        if top_count <= 1 and top_label in claimed_labels:
            cluster_to_label[cluster_id] = -1
            continue

        cluster_to_label[cluster_id] = top_label
        claimed_labels.add(top_label)

    return cluster_to_label


# Evaluate the DBSCAN result fitted in the clustering cell against the ground truth.

# Cluster id per sample; DBSCAN marks noise samples with -1.
labels = dbscan.labels_

# Boolean masks of clustered vs. noise samples.
filtered_indices = labels != -1
noise_indices = labels == -1

#### every sample identified as noise is assigned a separate cluster, these are False Negatives for DBSCAN
# Pseudo-cluster ids start at 100, or just above the largest real cluster id
# when DBSCAN produced 100 or more clusters, so they never collide with a
# real cluster.
invalid_label = max(100, int(labels.max()) + 1) if labels.size else 100
_n_noise = int(noise_indices.sum())
filtered_labels = np.array(labels).copy()
filtered_labels[noise_indices] = invalid_label + np.arange(_n_noise)
invalid_label += _n_noise  ### next unused pseudo-cluster id
filtered_ground_truth = np.array(ground_truth)
print('filtered labels', filtered_labels)
print('filtered_ground_truth', filtered_ground_truth)
print('')

### calculate Homogeneity and Completeness of the clusters
homogeneity = homogeneity_score(filtered_ground_truth, filtered_labels)
completeness = completeness_score(filtered_ground_truth, filtered_labels)
print(f"Homogeneity: {homogeneity:.4f}")
print(f"Completeness: {completeness:.4f}")
print('')

#### get the groundtruth class of majority samples in each cluster. This represents the correct class of the cluster
cluster_to_label = map_clusters_to_ground_truth(filtered_labels, filtered_ground_truth)
for cluster, label in cluster_to_label.items():
    print(f"Cluster {cluster} → Label {label}")

print('\n')

## Step 2: Predict the label of every sample as the label mapped to its cluster.
# dtype=str keeps the array comparable with the string ground truth even when
# every cluster maps to the integer -1.
predicted_labels = np.array([cluster_to_label[cl] for cl in filtered_labels], dtype=str)

print('predicted_labels', predicted_labels)
print('filtered_ground_truth', filtered_ground_truth)
print('')


# Step 3: Compute accuracy and misclassification rate
correct = np.sum(predicted_labels == filtered_ground_truth)
total = len(filtered_ground_truth)
accuracy = correct / total
misclassification_rate = 1 - accuracy

print(f"Clustering Accuracy: {accuracy:.4f}")
print(f"Misclassification Rate: {misclassification_rate:.4f}")


#### calculate AMI / NMI to check how well the predicted labels agree with GT
# NOTE: `nmi` holds the *adjusted* mutual information and `nmi2` the
# normalized one; the names are kept for backward compatibility.
nmi = adjusted_mutual_info_score(filtered_ground_truth, predicted_labels)
nmi2 = normalized_mutual_info_score(filtered_ground_truth, predicted_labels)


print(f"Adjusted Mutual Information (AMI): {nmi:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi2:.4f}")


# print('')
# ### for each cluster print the respective ground truth classes
# from collections import defaultdict
# cluster_to_classes = defaultdict(list)
# for lbl, true_cls in zip(filtered_labels, filtered_ground_truth):
#     # true_cls = int_to_class[true_cls]  ### convert back to original class label
#     cluster_to_classes[lbl].append(true_cls)
# for cluster_id, classes in cluster_to_classes.items():
#     unique, counts = np.unique(classes, return_counts=True)
#     class_count = dict(zip(unique, counts))
#     print(f"Cluster {cluster_id}: Class distribution: {class_count}")


## Feature Similarity

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

# Example: padded_features is a (46, 31) matrix
# padded_features = np.random.rand(46, 31)  # Replace with your actual data
# test_class = ["Class1", "Class2", ..., "Class46"]  # Replace with your actual class labels

# Define a function to calculate and plot similarity heatmap
def plot_similarity_heatmap(features, labels, metric, title):
    """Show a heatmap of the pairwise distances between feature vectors.

    Rows and columns are reordered by ascending total distance to all other
    samples, and the tick labels follow the same order.

    Args:
        features: Feature vectors, one per sample, as accepted by ``cdist``.
        labels: One tick label per sample, aligned with ``features``.
        metric: Distance metric name passed to ``scipy.spatial.distance.cdist``.
        title: Figure title.
    """
    # Pairwise distance between every pair of samples.
    distances = cdist(features, features, metric=metric)

    # Order the samples by the sum of their distances to all others, then
    # apply that order to both axes and to the tick labels.
    order = np.argsort(distances.sum(axis=1))
    distances = distances[np.ix_(order, order)]
    tick_labels = np.array(labels)[order]

    plt.figure(figsize=(22, 20))
    heatmap_options = {
        "annot": False,  # Set to True if you want to display values
        "fmt": ".2f",
        "cmap": "viridis",
        "cbar": True,
        "xticklabels": tick_labels,
        "yticklabels": tick_labels,
    }
    ax = sns.heatmap(distances, **heatmap_options)
    plt.title(title)

    # Put the x-axis ticks and axis label on top of the plot.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')

    # Vertical x tick labels, horizontal y tick labels.
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    plt.show()

# Distance metrics to plot: cdist metric name -> name shown in the figure title.
# Disabled entries: "cosine" ("Cosine Similarity"), "correlation" ("Correlation Distance").
metrics = dict(
    euclidean="Euclidean Distance",
    cityblock="Manhattan Distance",
    chebyshev="Chebyshev Distance",
)

# (Optional, currently disabled: sort sel_features / sel_ap2_classes by class
# label with np.argsort before plotting.)

# One heatmap per metric, over all selected feature vectors.
for metric_name in metrics:
    heading = f"Pairwise Feature Similarity ({metrics[metric_name]})"
    plot_similarity_heatmap(sel_features, sel_ap2_classes, metric_name, heading)

In [None]:
## Feature Extraction with SegLearn
# for test_data in test_data_path[0:]:
#     print('test_data:', test_data)
#     ### read the subseq
#     test_trace = read_traces(test_data)
#     # print('test_trace:', test_trace)
#     test_data_len = len(test_trace)
#     print('test_data_len:', test_data_len)

#     test_trace = np.array(test_trace).reshape(1, -1, 2)
#     print('test_trace:', test_trace.shape)

#     # features = FeatureTransform.fit_transform(test_trace)
#     feature_names = all_features().keys()
#     feature_functions = all_features()
#     for i, feat_label in enumerate(feature_names):
#         print(feat_label)
#         # print(feature_functions[feat_label])
#         func = feature_functions[feat_label]
#         feat = func(test_trace)
#         print(feat)
#         print(feat.shape)
#         print('')
    
    # break
