# Clustering - Fixed Window - Stat Extractor
- Use existing stats library extractors to get abstract features by giving the entire detection subseq as input
- Use these features to cluster the detections with similar anomalies
- The feature extraction is NOT dependent on the corresponding normal behaviour subtrace
- This will serve as the benchmark performance with off-the-shelf models, without any optimization
- We tested this approach across all applications

In [None]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode
from sklearn import preprocessing

import tsfel
import pandas as pd

from seglearn.feature_functions import all_features
from seglearn.transform import FeatureRep


# ------------------------------------------------------------------
# Trace configuration: selects which recorded traces the notebook uses
# ------------------------------------------------------------------

CODE = 'mamba2'                       # application (code): 'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'      # behaviour folder of the faulty runs (normal, faulty_data)
BEHAVIOUR_NORMAL = 'normal'           # behaviour folder of the normal runs (normal, faulty_data)
THREAD = 'single'                     # single, multi
VER = 4                               # format of data collection
WINDOW = 50                           # window size for subsequence
SUBSEQ = 'diag_subseq'                # subsequence type: 'diag_subseq', diag_subseq_multi, subseq
EXTRACTOR = 'SegLearn'                # feature extractor to run; the alternative is 'TSFEL'

base_dir = '../../trace_data'         # can be replaced with 'csv', 'exe_plot', 'histogram'

# Both behaviour folders live under the same application/thread/version prefix.
_version_dir = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}'
normalbase_path = f'{_version_dir}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{_version_dir}/{BEHAVIOUR_FAULTY}'

for _base_path in (normalbase_path, faultybase_path):
    print(_base_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; whether to use variable window size or not

#####################################################


def _sorted_paths(directory):
    """Return the full paths of all entries in `directory`, sorted by name.

    os.listdir() returns entries in arbitrary order; sorting makes the order in
    which files are processed (and which file ends up at index 0) reproducible.
    """
    return [os.path.join(directory, name) for name in sorted(os.listdir(directory))]


ref_samples_basepath = os.path.join(normalbase_path, f'diag_refsamples{WINDOW}')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/subseq')
diag_el_basepath = os.path.join(faultybase_path, f'{SUBSEQ}/el')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')
test_labels_basepath = os.path.join(faultybase_path, 'labels')


# print('ref_samples_path:\n', ref_samples_basepath)
# print('ref_var_samples_path:\n', ref_var_samples_basepath)
# print('diag_subseq_path:\n', diag_subseq_basepath)

######### get paths (normal behaviour) #######################
ref_samples_path = _sorted_paths(ref_samples_basepath)
# ref_var_samples_path = _sorted_paths(ref_var_samples_basepath)

### only the entries whose name contains 'varlist'
train_varlist_path = [os.path.join(normalbase_path, x) for x in sorted(os.listdir(normalbase_path)) if 'varlist' in x]

######### get paths (faulty behaviour) #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

test_subseq_path = _sorted_paths(diag_subseq_basepath)
test_el_path = _sorted_paths(diag_el_basepath)
test_labels_path = _sorted_paths(subseq_label_basepath)
eval_labels_path = _sorted_paths(test_labels_basepath)

### remove .DS_Store from all lists
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]
ref_samples_path = [x for x in ref_samples_path if '.DS_Store' not in x]
# ref_var_samples_path = [x for x in ref_var_samples_path if '.DS_Store' not in x]
### the subseq folder also holds the 'subseq_labels' sub-folder, so keep only the .json entries
test_subseq_path = [x for x in test_subseq_path if '.DS_Store' not in x and '.json' in x]
test_feature_path = [x for x in test_el_path if '.DS_Store' not in x]
test_labels_path = [x for x in test_labels_path if '.DS_Store' not in x]
eval_labels_path = [x for x in eval_labels_path if '.DS_Store' not in x]

varlist_path.sort()

# print(paths_log)
# print(paths_traces)
# print(varlist_path)
# print(paths_label)

if IS_VAR_WINDOW:
    # train_data_path = ref_var_samples_path
    raise ValueError('Variable window size not implemented yet')
else:
    train_data_path = ref_samples_path

test_data_path = test_subseq_path

# print('train_data:', train_data_path)
print(len(train_data_path))
# print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_labels:\n', test_labels_path)
print('eval_labels:\n', eval_labels_path)




## Calculate Feature Vectors

- For a fixed window size, load all the ref samples beforehand
- For variable window, load the map_len; further load files only with the suitable len

Feature extraction using DL:
- load DL model with appropriate weights
- make all the detections of same length (500)
- give the detections as inputs and extract features
- cluster these features

## Feature Extraction with TSFEL / SegLearn


In [None]:
#################################################################################################
####################################### Select Extractor ########################################
#################################################################################################

### fail early on an unknown extractor instead of silently producing no (or stale) features
SUPPORTED_EXTRACTORS = ('TSFEL', 'SegLearn')
if EXTRACTOR not in SUPPORTED_EXTRACTORS:
    raise ValueError(f'Unknown EXTRACTOR {EXTRACTOR!r}; expected one of {SUPPORTED_EXTRACTORS}')

print('Selected Extractor:', EXTRACTOR)

#################################################################################################
#################################################################################################
#################################################################################################

MAX_TRACE_LEN = 500         ### detections longer than this are truncated

### the SegLearn feature functions do not depend on the trace, so look them up once
if EXTRACTOR == 'SegLearn':
    feature_functions = all_features()
    feature_names = feature_functions.keys()

test_feature_vectors = []   ### one feature vector per processed detection
test_files = []             ### path of the detection behind each feature vector (same order)
feature_list = []           ### feature names reported by the selected extractor
tsfel_columns = None        ### TSFEL column names of the last processed detection

for test_data in test_data_path:
    print('test_data:', test_data)
    ### read the subseq
    test_trace = read_traces(test_data)
    print('test_trace:', test_trace)
    test_data_len = len(test_trace)
    print('test_data_len:', test_data_len)

    if test_data_len == 0:
        ### nothing to extract from; also avoids indexing test_trace[0] below
        print('test data is empty, skipping...')
        continue

    if test_data_len > MAX_TRACE_LEN:
        print('test data length is more than 500, truncating...')
        test_trace = test_trace[:MAX_TRACE_LEN]
        test_data_len = MAX_TRACE_LEN

    ### transform the test trace from [(var,ts1), (var,ts2), (var, ts3)] to [[var1, var2, var3], [ts1, ts2, ts3]]
    ### the intervals are the differences between consecutive timestamps (the first one is 0)
    test_events = []
    test_intervals = []
    prev_time = test_trace[0][1]
    for x in test_trace:
        test_intervals.append(x[1] - prev_time)
        prev_time = x[1]
        test_events.append(x[0])

    assert len(test_events) == len(test_intervals) == test_data_len

    print(test_events)

    ########################################################################################################
    ############################################ TSFEL #####################################################
    ########################################################################################################
    if EXTRACTOR == 'TSFEL':
        ### select only the sequence of events without timestamps; only the temporal domain is extracted
        cfg_file = tsfel.get_features_by_domain('temporal')
        tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)
        tsfel_columns = tmp_features.columns

        feat_all = list(tmp_features.values[0])

    ########################################################################################################
    ############################################# SegLearn #################################################
    ########################################################################################################
    if EXTRACTOR == 'SegLearn':
        ### reshape the data to 3D array n_samples x length_of_sample x n_variables (multivariate time series)
        test_events = np.array(test_events).reshape(1, -1, 1)
        ### NOTE: the intervals are reshaped as well but are currently not fed to the feature functions
        test_intervals = np.array(test_intervals).reshape(1, -1, 1)

        feat_all = []
        for feat_label in feature_names:
            func = feature_functions[feat_label]
            ### a feature function may return several values; flatten them into the vector
            feat1 = np.array(func(test_events)).reshape(-1,)
            feat_all.extend(feat1)

    test_files.append(test_data)
    test_feature_vectors.append(feat_all)

########################## TSFEL ###########################
if EXTRACTOR == 'TSFEL':
    ### tsfel_columns stays None when no detection was processed
    if tsfel_columns is not None:
        feature_list.extend(tsfel_columns)

    test_feature_vectors = pd.DataFrame(test_feature_vectors, columns=feature_list)

###########################################################

########################## SegLearn ###########################
if EXTRACTOR == 'SegLearn':
    feature_list.extend(feature_names)
    ### columns are left unnamed here, as in the original cell
    test_feature_vectors = pd.DataFrame(test_feature_vectors)

###########################################################



In [None]:
### display the feature names collected by the extraction cell
feature_list

In [None]:
### Both steps only apply while test_feature_vectors is still a DataFrame, so that
### re-running this cell does not fail on the already-converted numpy array.
if EXTRACTOR == 'TSFEL' and isinstance(test_feature_vectors, pd.DataFrame):
    ### filter features: drop the features that TSFEL reports as correlated
    print('test_feature_vectors:', test_feature_vectors.shape)
    corr_features, test_feature_vectors = tsfel.correlated_features(test_feature_vectors, drop_correlated=True)
    print('test_feature_vectors:', test_feature_vectors.shape)
    # print('corr_features:', corr_features)

### the following cells work on a plain numpy array
if isinstance(test_feature_vectors, pd.DataFrame):
    test_feature_vectors = test_feature_vectors.values

# ### Normalising Features
# scaler = preprocessing.StandardScaler()
# test_feature_vectors = scaler.fit_transform(test_feature_vectors)


## Prepare Data

In [None]:
padded_features = []    ### cleaned feature vector of every detection
test_class = []         ### class label(s) of every detection, same order as padded_features

### load the labels; they are looked up by the detection file name without its extension
test_class_labels = read_json(test_labels_path[0])
# print('test_class_labels:', len(test_class_labels))

### prepare the feature vectors for classification
for test_data, feature_vector in zip(test_files, test_feature_vectors):
    file_name = os.path.basename(test_data).split('.')[0]
    class_label = test_class_labels[file_name]

    if class_label is None:
        ### skipping the sample would silently misalign test_files and padded_features,
        ### which later cells index together, so stop instead
        raise ValueError(f'no class label available for {test_data}')

    ### replace NaN (and infinite) feature values with finite numbers before clustering
    padded_features.append(np.nan_to_num(np.array(feature_vector)))
    test_class.append(class_label)

In [None]:
### sanity check: shape of the feature matrix and number of collected class labels
print(np.array(padded_features).shape)
print('test_class:', len(test_class))

In [None]:
### inspect the first prepared feature vector
padded_features[0]

In [None]:
### collect the distinct class labels in order of first appearance
### (list membership is used because a label may itself be a list, which is not hashable)
unique_classes = []
for tc in test_class:
    if tc not in unique_classes:
        print('tc:', tc)
        unique_classes.append(tc)

print('unique_classes:', unique_classes)
print('number of unique classes:', len(unique_classes))

## Clustering (KMeans)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

### Encode every distinct class label as an integer, in order of first appearance.
### map_labels is keyed by str(label); ground_truth holds the integer code per sample.
unique_classes = []
map_labels = dict()
ground_truth = []
label_key = 0
for tc in test_class:
    if tc not in unique_classes:
        print('tc:', tc)
        unique_classes.append(tc)
        map_labels[str(tc)] = label_key
        label_key += 1
    ground_truth.append(map_labels[str(tc)])

### one cluster per ground-truth class
N_CLUSTER = len(unique_classes)
print('N_CLUSTER:', N_CLUSTER)

data = np.array(padded_features)

### K-Means needs one flat row per sample: collapse a 3D feature array to 2D
if data.ndim == 3:
    num_samples, num_features, num_points = data.shape
    data_reshaped = data.reshape(num_samples, num_features * num_points)
else:
    data_reshaped = data

#############################################################################################

### standardise every feature column before clustering
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_reshaped)

#############################################################################################

### fit K-Means and read the cluster assignment of every sample
kmeans = KMeans(n_clusters=N_CLUSTER, init="k-means++", n_init=30, max_iter=300)
kmeans.fit(data_normalized)
labels = kmeans.labels_
print('kmeans:', labels)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score, f1_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# # Sample data: Replace with your actual predictions
# kmeans_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1])  # Replace with K-Means predictions
# ground_truth = np.array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1])  # Given ground truth

ground_truth = np.array(ground_truth)
### Read the assignments from the fitted model rather than from the global `labels`:
### the DBSCAN cells rebind `labels`, so re-running this cell afterwards would
### otherwise evaluate the wrong labels.
labels = np.array(kmeans.labels_)

# Map cluster labels to ground truth labels using the Hungarian algorithm
def best_cluster_mapping(y_true, y_pred):
    """Relabel predicted clusters with their best-matching true classes.

    A class-by-cluster contingency table is built and the Hungarian algorithm
    picks the one-to-one class/cluster pairing with the largest total overlap.
    Clusters that stay unpaired (more clusters than classes) are given the
    class they overlap with most, so every prediction receives a valid label.

    Parameters:
        y_true: array-like of ground-truth class labels.
        y_pred: array-like of predicted cluster ids, same shape as `y_true`.

    Returns:
        numpy array shaped like `y_pred` holding, for every sample, the class
        label its cluster was mapped to.

    Raises:
        ValueError: if `y_true` and `y_pred` have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    pred_shape = y_pred.shape
    if y_pred.size == 0:
        # Nothing to map; keep the label dtype of the ground truth.
        return np.empty(pred_shape, dtype=y_true.dtype)

    unique_classes, class_idx = np.unique(y_true.ravel(), return_inverse=True)
    unique_clusters, cluster_idx = np.unique(y_pred.ravel(), return_inverse=True)
    class_idx = class_idx.ravel()
    cluster_idx = cluster_idx.ravel()

    # contingency[i, j] = number of samples of class i that fell into cluster j
    contingency = np.zeros((len(unique_classes), len(unique_clusters)), dtype=np.int64)
    np.add.at(contingency, (class_idx, cluster_idx), 1)

    # linear_sum_assignment minimises the cost, so negate the counts to maximise overlap
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # Start from the majority class of every cluster, then overwrite the clusters
    # that received a one-to-one partner from the assignment.
    cluster_to_class = unique_classes[contingency.argmax(axis=0)]
    cluster_to_class[col_ind] = unique_classes[row_ind]

    return cluster_to_class[cluster_idx].reshape(pred_shape)

print('ground_truth:', ground_truth)
print('labels:', labels)

### cluster ids are arbitrary, so rename them to their best-matching class first
remapped_labels = best_cluster_mapping(ground_truth, labels)

### scores that compare class labels need the remapped predictions
conf_matrix = confusion_matrix(ground_truth, remapped_labels)
f1 = f1_score(ground_truth, remapped_labels, average='weighted')
accuracy = accuracy_score(ground_truth, remapped_labels)

### ARI and NMI are computed on the raw cluster ids
nmi = normalized_mutual_info_score(ground_truth, labels)
ari = adjusted_rand_score(ground_truth, labels)

### report
for caption, score in (
    ("F1 Score", f1),
    ("Accuracy", accuracy),
    ("Adjusted Rand Index (ARI)", ari),
    ("Normalized Mutual Information (NMI)", nmi),
):
    print(f"{caption}: {score:.4f}")
print("Confusion Matrix:\n", conf_matrix)

## Clustering (DBSCAN)

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Inputs produced by the earlier cells:
#   padded_features - one feature vector per detection
#   test_files      - path of the detection behind each feature vector
#   test_class      - class label(s) of each detection

data = np.array(padded_features)

# DBSCAN needs one flat row per sample: collapse a 3D feature array to 2D
if data.ndim == 3:
    num_samples, num_features, num_points = data.shape
    data_reshaped = data.reshape(num_samples, num_features * num_points)
else:
    data_reshaped = data

# Encode every distinct class label as an integer, in order of first appearance.
# map_labels: str(label) -> code, map_ints: code -> str(label)
unique_classes = []
map_labels = dict()
map_ints = dict()
ground_truth = []
label_key = 0
for tc in test_class:
    if tc not in unique_classes:
        print('tc:', tc)
        unique_classes.append(tc)
        map_labels[str(tc)] = label_key
        map_ints[label_key] = str(tc)
        label_key += 1
    ground_truth.append(map_labels[str(tc)])

ground_truth = np.array(ground_truth)

# Normalize the features
scaler = StandardScaler()
padded_features_normalized = scaler.fit_transform(data_reshaped)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=2, metric="euclidean")
dbscan.fit(padded_features_normalized)

# Get cluster labels (the evaluation cell treats -1 as noise)
cluster_labels = dbscan.labels_

# Group file names, class labels and normalized features by cluster
clusters = {}
for i, cluster in enumerate(cluster_labels):
    members = clusters.setdefault(cluster, {"files": [], "labels": [], "features": []})
    members["files"].append(test_files[i])
    members["labels"].append(map_ints[ground_truth[i]])
    members["features"].append(padded_features_normalized[i])

# Print the size of every cluster
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    print(f"  Number of files: {len(cluster_data['files'])}")


# Print the members of every cluster. The loop variables are deliberately not
# called `labels`, so the cluster assignments stored under that name are kept.
for cluster_id, cluster_data in clusters.items():
    print(f"Cluster {cluster_id}:")
    for member_file, member_label, member_feat in zip(
        cluster_data['files'], cluster_data['labels'], cluster_data['features']
    ):
        print(member_file)
        print(member_label)
        print(member_feat[:50])   # only the first 50 feature values
        print('')
    print('')

In [None]:
### display the integer-encoded ground-truth labels
np.array(ground_truth)

In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score

# Cluster assignments of the already fitted DBSCAN model
labels = dbscan.labels_

# Leave out the samples labelled -1 (noise) before scoring
filtered_indices = labels != -1
filtered_labels = labels[filtered_indices]
filtered_ground_truth = ground_truth[filtered_indices]

# Scores that compare the clusters with the ground truth
ari = adjusted_rand_score(filtered_ground_truth, filtered_labels)
nmi = normalized_mutual_info_score(filtered_ground_truth, filtered_labels)
homogeneity = homogeneity_score(filtered_ground_truth, filtered_labels)
completeness = completeness_score(filtered_ground_truth, filtered_labels)
v_measure = v_measure_score(filtered_ground_truth, filtered_labels)

# The silhouette score is only computed with at least 2 clusters; '--' marks "not applicable"
silhouette = '--'
if len(set(filtered_labels)) > 1:
    silhouette = silhouette_score(padded_features_normalized[filtered_indices], filtered_labels)

# Report
print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
if silhouette == '--':
    print("Silhouette Score: -- (not applicable, only one cluster)")
else:
    print(f"Silhouette Score: {silhouette:.4f}")
for caption, score in (
    ("Homogeneity", homogeneity),
    ("Completeness", completeness),
    ("V-Measure", v_measure),
):
    print(f"{caption}: {score:.4f}")

### for each cluster print how many samples of each ground-truth class it holds
from collections import defaultdict
cluster_to_classes = defaultdict(list)
for lbl, true_cls in zip(filtered_labels, filtered_ground_truth):
    cluster_to_classes[lbl].append(map_ints[true_cls])   ### back to the original class label
for cluster_id, classes in cluster_to_classes.items():
    unique, counts = np.unique(classes, return_counts=True)
    class_count = dict(zip(unique, counts))
    print(f"Cluster {cluster_id}: Class distribution: {class_count}")


## Feature Similarity

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

# Example: padded_features is a (46, 31) matrix
# padded_features = np.random.rand(46, 31)  # Replace with your actual data
# test_class = ["Class1", "Class2", ..., "Class46"]  # Replace with your actual class labels

# Define a function to calculate and plot similarity heatmap
def plot_similarity_heatmap(features, labels, metric, title):
    """Show a heatmap of the pairwise distances between the rows of `features`.

    features: 2D array-like, one feature vector per row.
    labels:   tick labels for both axes, one per row of `features`.
    metric:   distance metric name passed to scipy's cdist.
    title:    title drawn above the heatmap.
    """
    # Every row against every other row
    pairwise = cdist(features, features, metric=metric)

    plt.figure(figsize=(22, 20))
    heatmap_options = dict(
        annot=False,  # Set to True if you want to display values
        fmt=".2f",
        cmap="viridis",
        cbar=True,
        xticklabels=labels,
        yticklabels=labels,
    )
    ax = sns.heatmap(pairwise, **heatmap_options)
    plt.title(title)

    # x-axis ticks and label go on top of the matrix
    x_axis = ax.xaxis
    x_axis.tick_top()
    x_axis.set_label_position('top')

    # Vertical x tick labels, horizontal y tick labels
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

    plt.show()

# Distance metrics to plot: cdist metric name -> human-readable name for the title.
# Every entry is a distance (smaller means more similar); cdist's "cosine" metric
# is the cosine distance, not the cosine similarity.
metrics = {
    "euclidean": "Euclidean Distance",
    "cityblock": "Manhattan Distance",
    "chebyshev": "Chebyshev Distance",
    "cosine": "Cosine Distance",
    "correlation": "Correlation Distance"
}

# Generate one heatmap per metric
for metric, title in metrics.items():
    plot_similarity_heatmap(padded_features, test_class, metric, f"Pairwise Feature Similarity ({title})")

In [None]:
## Feature Extraction with SegLearn
# for test_data in test_data_path[0:]:
#     print('test_data:', test_data)
#     ### read the subseq
#     test_trace = read_traces(test_data)
#     # print('test_trace:', test_trace)
#     test_data_len = len(test_trace)
#     print('test_data_len:', test_data_len)

#     test_trace = np.array(test_trace).reshape(1, -1, 2)
#     print('test_trace:', test_trace.shape)

#     # features = FeatureTransform.fit_transform(test_trace)
#     feature_names = all_features().keys()
#     feature_functions = all_features()
#     for i, feat_label in enumerate(feature_names):
#         print(feat_label)
#         # print(feature_functions[feat_label])
#         func = feature_functions[feat_label]
#         feat = func(test_trace)
#         print(feat)
#         print(feat.shape)
#         print('')
    
    # break
