# Clustering - Fixed Window - DL Extractor
- Use Deep Learning based extractors to get abstract features by giving the entire detection subseq as input
- Use these features to cluster the detections with similar anomalies
- The feature extraction is NOT dependent on the corresponding normal behaviour subtrace
- This will serve as the benchmark performance with off-the-shelf models, without any optimization
- We tested this approach across all applications

In [1]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode
from sklearn import preprocessing

import tensorflow as tf
import TSFEDL.models_keras as tsfedl
import joblib


# ############ configuration - trace ################
# ############################################

CODE = 'mamba2'                     ### application (code): 'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'    ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'         ### normal, faulty_data
THREAD = 'single'                   ### single, multi
VER = 3                             ### format of data collection

base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'

### both behaviours live next to each other under the same version directory
_version_dir = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}'
normalbase_path = f'{_version_dir}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{_version_dir}/{BEHAVIOUR_FAULTY}'

for _base in (normalbase_path, faultybase_path):
    print(_base)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True: variable window size, False: fixed window size

#####################################################


def _list_dir_paths(directory):
    """Return the joined path of every entry of `directory`, skipping '.DS_Store' entries."""
    joined = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [full_path for full_path in joined if '.DS_Store' not in full_path]


ref_samples_basepath = os.path.join(normalbase_path, 'diag_refsamples')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')

print('ref_samples_path:\n', ref_samples_basepath)
print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

######### reference (normal behaviour) samples #######################
ref_samples_path = _list_dir_paths(ref_samples_basepath)
ref_var_samples_path = _list_dir_paths(ref_var_samples_basepath)

### varlist files sit directly in the normal-behaviour directory
train_varlist_path = [
    full_path
    for full_path in _list_dir_paths(normalbase_path)
    if 'varlist' in os.path.basename(full_path)
]

######### faulty traces, detection subsequences and their labels #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)
varlist_path = sorted(p for p in varlist_path if '.DS_Store' not in p)
paths_label = [p for p in paths_label if '.DS_Store' not in p]

### only the .json entries of the subsequence directory are test inputs
test_subseq_path = [p for p in _list_dir_paths(diag_subseq_basepath) if '.json' in p]
test_labels_path = _list_dir_paths(subseq_label_basepath)

### reference samples depend on the window mode; test data is always the detection subsequences
train_data_path = ref_var_samples_path if IS_VAR_WINDOW else ref_samples_path
test_data_path = test_subseq_path

print('train_data:\n', train_data_path)
print(len(train_data_path))
print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_labels:\n', test_labels_path)



../../trace_data/mamba2/single_thread/version_3/normal
../../trace_data/mamba2/single_thread/version_3/faulty_data
ref_samples_path:
 ../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples
ref_var_samples_path:
 ../../trace_data/mamba2/single_thread/version_3/normal/diag_var_refsamples
diag_subseq_path:
 ../../trace_data/mamba2/single_thread/version_3/faulty_data/diag_subseq
train_data:
 ['../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/3721.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/2833.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/23684.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/24141.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/13232.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/22996.json', '../../trace_data/mamba2/single_thread/version_3/normal/diag_refsamples/729.json', '

## Feature Extraction with TSFE-DL


In [2]:
#################################################################################################
####################################### Select Extractor ########################################
#################################################################################################

EXTRACTOR = 'forecaster'

print('Selected Extractor:', EXTRACTOR)

#################################################################################################
#################################################################################################
#################################################################################################

### Every detection subsequence is truncated / zero-padded to SUBSEQ_LEN events and then
### split into windows of WINDOW_LEN events before it is scaled and passed to the model.
SUBSEQ_LEN = 500
WINDOW_LEN = 50

if EXTRACTOR == 'forecaster':
    ### load the trained forecaster and expose an intermediate layer as the feature output
    model = tf.keras.models.load_model('./trained_models/forecaster_events_minmax_mamba+theft.keras')
    new_model = tf.keras.Model(inputs=model.input, outputs=model.layers[-5].output)
    new_model.summary()
else:
    ### fail early instead of hitting a NameError (or re-using stale notebook state) in the loop
    raise ValueError(f'Unsupported EXTRACTOR: {EXTRACTOR!r}')

### the scaler does not depend on the test file, so load it once instead of once per file
scaler_events = joblib.load(f"./scalers/minmaxscaler_{CODE}.gz")

test_feature_vectors = []   ### one flattened feature vector per test file
test_files = []             ### path of the file each feature vector was computed from
for test_data in test_data_path:
    print('test_data:', test_data)
    ### read the subseq
    test_trace = read_traces(test_data)
    test_data_len = len(test_trace)
    print('test_data_len:', test_data_len)

    if test_data_len > SUBSEQ_LEN:
        print(f'test data length is more than {SUBSEQ_LEN}, truncating...')
        test_trace = test_trace[:SUBSEQ_LEN]
    else:
        ### pad the test data with (event, ts) = (0, 0) entries
        test_trace = test_trace + [(0, 0)] * (SUBSEQ_LEN - test_data_len)

    ### keep only the event ids; the timestamps are not used by this extractor
    test_events = [sample[0] for sample in test_trace]

    print(test_events)
    ### scale the events window-wise: one row per window of WINDOW_LEN events
    test_events = scaler_events.transform(np.array(test_events).reshape(-1, WINDOW_LEN))
    print(np.array(test_events))

    ########################################################################################################
    ############################################ forecaster ###############################################
    ########################################################################################################
    ### run all windows of this file through the model in one batch and
    ### concatenate the per-window outputs into a single feature vector
    _test_events = np.array(test_events).reshape(-1, WINDOW_LEN, 1)
    feat_single = new_model.predict(_test_events).flatten()

    test_feature_vectors.append(feat_single)
    test_files.append(test_data)

    



Selected Extractor: forecaster


test_data: ../../trace_data/mamba2/single_thread/version_3/faulty_data/diag_subseq/trace_trial4_8851-9212.json
test_data_len: 361
[62, 61, 49, 57, 63, 61, 49, 57, 64, 23, 24, 49, 49, 56, 57, 65, 23, 24, 49, 49, 56, 57, 66, 23, 24, 49, 49, 56, 57, 67, 23, 24, 49, 49, 56, 50, 50, 31, 50, 53, 54, 55, 56, 57, 58, 59, 20, 21, 22, 49, 49, 49, 57, 60, 61, 49, 57, 62, 61, 49, 57, 63, 61, 49, 57, 64, 23, 24, 49, 49, 56, 57, 65, 70, 23, 24, 49, 49, 56, 57, 66, 70, 44, 74, 54, 57, 67, 70, 23, 24, 49, 49, 56, 50, 50, 31, 50, 53, 54, 55, 56, 57, 58, 59, 20, 21, 22, 49, 49, 49, 57, 60, 61, 49, 57, 62, 61, 49, 57, 63, 61, 49, 57, 64, 70, 23, 24, 49, 49, 56, 57, 65, 23, 24, 49, 49, 56, 57, 66, 23, 24, 49, 49, 56, 44, 57, 67, 23, 24, 49, 49, 56, 50, 50, 31, 50, 53, 54, 55, 56, 57, 58, 59, 20, 21, 22, 49, 49, 49, 57, 60, 61, 49, 57, 62, 61, 49, 57, 63, 61, 49, 57, 64, 23, 24, 49, 49, 56, 57, 65, 68, 23, 24, 49, 49, 56, 57, 66, 23, 24, 49, 49, 56, 57, 67, 23, 24, 49, 49, 56, 50, 50, 31, 50, 69, 51, 26, 5

In [3]:
### Sanity check: number of test files vs. shape of the stacked feature vectors
print(len(test_data_path))
print(np.array(test_feature_vectors).shape)

15
(15, 200)


In [4]:
### Display the extracted feature vectors as the cell output
test_feature_vectors

[array([ 9.64027584e-01, -1.09099532e-02,  9.64027584e-01, -9.64027584e-01,
        -9.64027584e-01, -9.40192073e-31, -9.64027584e-01, -9.64027584e-01,
        -9.64027584e-01, -9.64027584e-01,  9.64027584e-01, -9.64027584e-01,
         9.64027584e-01, -9.63950157e-01,  7.61594057e-01,  9.64027584e-01,
        -1.10132726e-16,  9.64027584e-01,  9.63599205e-01,  9.64027584e-01,
         9.64027584e-01, -8.31968129e-01,  9.64027584e-01, -9.64027584e-01,
        -9.57327902e-01, -1.43913116e-19, -9.64027584e-01, -9.64027584e-01,
        -9.64027524e-01, -9.64027584e-01,  9.64027584e-01, -9.64027584e-01,
         9.64027584e-01, -6.39417410e-01,  9.64027584e-01,  9.64027584e-01,
        -9.64027584e-01,  9.64027584e-01,  9.62754250e-01,  9.64027584e-01,
         9.40755427e-01, -1.33313880e-14,  9.64027584e-01, -9.64027584e-01,
        -9.64027464e-01, -5.74383448e-29, -9.63974535e-01, -9.64027584e-01,
        -9.64027584e-01, -9.64027584e-01,  9.64027584e-01, -9.63306546e-01,
         9.6

## Prepare Data

In [5]:
padded_features = []   ### feature vectors that have a label entry
test_class = []        ### label list of each kept feature vector (same order)

### load the labels
### NOTE(review): only the first label file found is used -- presumably there is exactly
### one label file per application; confirm.
test_class_labels = read_json(test_labels_path[0])

### pair every feature vector with the labels stored for its subsequence file
for test_data, feature_vector in zip(test_files, test_feature_vectors):
    ### labels are looked up by the file name without directory, cut at the first '.'
    file_name = os.path.basename(test_data).split('.')[0]
    class_label = test_class_labels[file_name]

    if class_label is None:
        continue

    ### replace NaN / inf entries with finite numbers before the vector is used further
    padded_features.append(np.nan_to_num(np.array(feature_vector)))
    test_class.append(class_label)

In [6]:
### Display the shape of the stacked, labelled feature vectors as the cell output
np.array(padded_features).shape

(15, 200)

## Clustering


In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

### Build integer ground-truth ids: every distinct label list gets its own id,
### assigned in order of first appearance.
### NOTE(review): label lists are compared as a whole, so e.g. [2] and [2, 2] end up
### as different classes -- confirm this is intended.
unique_classes = []
map_labels = dict()
ground_truth = []
for tc in test_class:
    ### use the same key for the membership test and the lookup
    key = str(tc)
    if key not in map_labels:
        print('tc:', tc)
        unique_classes.append(tc)
        map_labels[key] = len(map_labels)
    ground_truth.append(map_labels[key])
label_key = len(map_labels)   ### next unused id (name kept for backward compatibility)


N_CLUSTER = len(unique_classes)
print('N_CLUSTER:', N_CLUSTER)

data = np.array(padded_features)

### Clustering needs one flat feature vector per sample: collapse any extra axes
if data.ndim > 2:
    data_reshaped = data.reshape(len(data), -1)
else:
    data_reshaped = data


#############################################################################################

# Normalize the data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data_reshaped)


#############################################################################################

### Fixed seed so that the cluster assignment (and the metrics derived from it) is reproducible
RANDOM_STATE = 0

# Apply K-Means clustering
kmeans = KMeans(
    init="k-means++",
    max_iter=300,
    n_clusters=N_CLUSTER,
    n_init=50,
    random_state=RANDOM_STATE,
)
kmeans.fit(data_normalized)
# Get cluster labels
labels = kmeans.labels_
print('kmeans:', labels)

tc: [4]
tc: [2, 2, 2, 2, 2]
tc: [2, 2]
tc: [1]
tc: [3, 3, 3, 3, 3, 3]
tc: [3, 3, 3, 3, 3, 3, 3, 3]
tc: [2]
tc: [3, 3, 3, 3, 3]
N_CLUSTER: 8
kmeans: [1 2 6 1 1 4 5 2 0 3 7 7 1 1 1]


In [12]:
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score, f1_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# # Sample data: Replace with your actual predictions
# kmeans_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1])  # Replace with K-Means predictions
# ground_truth = np.array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1])  # Given ground truth

### Convert both label sequences to NumPy arrays before the evaluation below
ground_truth = np.array(ground_truth)
labels = np.array(labels)

# Map cluster labels to ground truth labels using the Hungarian algorithm
def best_cluster_mapping(y_true, y_pred, unmatched_label=-1):
    """Relabel predicted cluster ids with their best-matching true class labels.

    A one-to-one assignment between true classes and predicted clusters is
    computed with the Hungarian algorithm so that the number of samples on
    which class and cluster coincide is maximal.

    Args:
        y_true: Array-like of ground-truth class labels.
        y_pred: Array-like of predicted cluster ids, same shape as `y_true`.
        unmatched_label: Label given to samples of clusters that received no
            class in the assignment (happens when there are more clusters
            than classes). Defaults to -1.

    Returns:
        NumPy array shaped like `y_pred` in which every cluster id is replaced
        by its assigned class label, or by `unmatched_label`.

    Raises:
        ValueError: If `y_true` and `y_pred` have different shapes.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_pred.size == 0:
        # Nothing to map; np.vectorize in the previous implementation raised here.
        return np.empty(y_pred.shape, dtype=y_true.dtype)

    classes, true_idx = np.unique(y_true, return_inverse=True)
    clusters, pred_idx = np.unique(y_pred, return_inverse=True)
    true_idx = true_idx.ravel()
    pred_idx = pred_idx.ravel()

    # overlap[i, j] = number of samples with true class i that fell into cluster j
    overlap = np.zeros((len(classes), len(clusters)), dtype=np.int64)
    np.add.at(overlap, (true_idx, pred_idx), 1)

    # linear_sum_assignment minimises cost, so negate to maximise the overlap
    row_ind, col_ind = linear_sum_assignment(-overlap)

    # Clusters left out of the assignment keep the sentinel instead of None,
    # which previously made np.vectorize fail with a TypeError.
    relabelled = [unmatched_label] * len(clusters)
    for row, col in zip(row_ind, col_ind):
        relabelled[col] = classes[row]

    return np.array(relabelled)[pred_idx].reshape(y_pred.shape)

print('ground_truth:', ground_truth)
print('labels:', labels)

### Align the cluster ids with the class ids first; the label-sensitive metrics need this
remapped_labels = best_cluster_mapping(ground_truth, labels)

### Scores computed on the raw cluster ids
ari = adjusted_rand_score(ground_truth, labels)
nmi = normalized_mutual_info_score(ground_truth, labels)

### Scores computed on the remapped labels
conf_matrix = confusion_matrix(ground_truth, remapped_labels)
accuracy = accuracy_score(ground_truth, remapped_labels)
f1 = f1_score(ground_truth, remapped_labels, average='weighted')

### Report: scalar scores with four decimals, then the confusion matrix
_scores = (
    ("F1 Score", f1),
    ("Accuracy", accuracy),
    ("Adjusted Rand Index (ARI)", ari),
    ("Normalized Mutual Information (NMI)", nmi),
)
for _score_name, _score_value in _scores:
    print(f"{_score_name}: {_score_value:.4f}")
print("Confusion Matrix:\n", conf_matrix)

ground_truth: [0 1 0 2 0 3 4 5 6 6 5 7 2 6 6]
labels: [1 2 6 1 1 4 5 2 0 3 7 7 1 1 1]
F1 Score: 0.5178
Accuracy: 0.5333
Adjusted Rand Index (ARI): 0.0998
Normalized Mutual Information (NMI): 0.7002
Confusion Matrix:
 [[1 0 2 0 0 0 0 0]
 [0 1 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 1 0 0 0 1 0 0]
 [0 0 2 0 0 0 1 1]
 [0 0 0 0 0 1 0 0]]


In [None]:
## Feature Extraction with SegLearn
# for test_data in test_data_path[0:]:
#     print('test_data:', test_data)
#     ### read the subseq
#     test_trace = read_traces(test_data)
#     # print('test_trace:', test_trace)
#     test_data_len = len(test_trace)
#     print('test_data_len:', test_data_len)

#     test_trace = np.array(test_trace).reshape(1, -1, 2)
#     print('test_trace:', test_trace.shape)

#     # features = FeatureTransform.fit_transform(test_trace)
#     feature_names = all_features().keys()
#     feature_functions = all_features()
#     for i, feat_label in enumerate(feature_names):
#         print(feat_label)
#         # print(feature_functions[feat_label])
#         func = feature_functions[feat_label]
#         feat = func(test_trace)
#         print(feat)
#         print(feat.shape)
#         print('')
    
    # break
