In [None]:
# Clustering methods
import json
import numpy as np
import pandas as pd
import os
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [None]:
# Configuration: selects which recorded run the notebook works on
CODE = 'mamba2'                      # application (code): theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'     # behaviour folder used for the faulty run: normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'          # behaviour folder used for the normal run: normal, faulty_data
THREAD = 'single'                    # threading mode: single, multi
VER = 3                              # format (version) of data collection

# Data root; per the original note it can be replaced with 'csv', 'exe_plot', 'histogram'
base_dir = '../../trace_data'

# Both runs live under <base_dir>/<CODE>/<THREAD>_thread/version_<VER>/<behaviour>
normalbase_path = '/'.join([base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_NORMAL])
faultybase_path = '/'.join([base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_FAULTY])

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

In [None]:
# Training traces are taken from the normal-behaviour run; test traces and
# their labels are taken from the faulty-behaviour run.
train_base_path = os.path.join(normalbase_path, 'train_data')
print("Train base path:", train_base_path)

# os.listdir() returns entries in arbitrary order. The listings are sorted so
# that anything derived from their position (e.g. per-trace numbering) is
# reproducible across runs and machines, consistent with the test lists below.
# '.DS_Store' entries (macOS Finder metadata) are skipped.
train_data_path = sorted(
    os.path.join(train_base_path, x)
    for x in os.listdir(train_base_path)
    if '.DS_Store' not in x
)
train_varlist_path = sorted(
    os.path.join(normalbase_path, x)
    for x in os.listdir(normalbase_path)
    if 'varlist' in x and '.DS_Store' not in x
)

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

# Same clean-up for the faulty run: drop '.DS_Store' entries and sort, so that
# traces and labels can later be paired by position.
paths_log = sorted(x for x in paths_log if '.DS_Store' not in x)
paths_traces = sorted(x for x in paths_traces if '.DS_Store' not in x)
varlist_path = sorted(x for x in varlist_path if '.DS_Store' not in x)
paths_label = sorted(x for x in paths_label if '.DS_Store' not in x)

test_data_path = paths_traces
test_label_path = paths_label

In [None]:
# Function to read each trace and split it into separate window files of `sequence_length` events, sliding by `window_size` events (50 and 1 are the values used below)
import os
def read_process_train_data(file_path, sequence_length, window_size,
                            output_directory='./train_data_processed/',
                            include_timestamps=False):
    """Split every trace file into sliding windows and write one JSON file per window.

    Each input file is expected to hold a JSON list of items whose first
    element is an event id and whose second element is a timestamp. The first
    event of every trace is dropped so the event list has the same length as
    the list of timestamp differences (``np.diff`` yields ``n - 1`` values).

    Parameters
    ----------
    file_path : iterable of path-like
        Trace files to process. The n-th file (1-based) is written out under
        the name ``trace_trial<n>``.
    sequence_length : int
        Number of events in each window (must be >= 1).
    window_size : int
        Step between the starts of consecutive windows (must be >= 1).
    output_directory : str, optional
        Directory that receives the window files; created if missing.
    include_timestamps : bool, optional
        When True each window file holds ``[event_ids, timestamp_diffs]``;
        when False (default) it holds ``[event_ids]`` only.

    Returns
    -------
    dict
        Maps every written window file name to its provenance: trial name,
        original source path, start/end index (into the trimmed event list),
        sequence length and window size. The same mapping is saved to
        ``<output_directory>/file_dict/filename_dict.json``.

    Raises
    ------
    ValueError
        If ``sequence_length`` or ``window_size`` is smaller than 1.
    """
    if sequence_length < 1 or window_size < 1:
        raise ValueError(
            "sequence_length and window_size must both be >= 1 "
            f"(got sequence_length={sequence_length}, window_size={window_size})"
        )

    os.makedirs(output_directory, exist_ok=True)
    filename_dictionary = {}

    for count, source_path in enumerate(file_path, start=1):
        print("Reading file:", source_path)
        with open(source_path, 'r') as f:
            data = json.load(f)

        trimmed_event_ids = [item[0] for item in data][1:]
        # Timestamp differences are only needed when they are written out.
        timestamp_difference = (
            np.diff([item[1] for item in data]).tolist() if include_timestamps else None
        )

        trial_name = f'trace_trial{count}'

        for index_start in range(0, len(trimmed_event_ids) - sequence_length + 1, window_size):
            index_end = index_start + sequence_length
            result = [trimmed_event_ids[index_start:index_end]]
            if include_timestamps:
                result.append(timestamp_difference[index_start:index_end])

            new_filename = f'{trial_name}_{index_start}-{index_end}.json'
            output_path = os.path.join(output_directory, new_filename)

            # Saving the file
            with open(output_path, 'w') as out_file:
                json.dump(result, out_file)

            # Saving to dictionary for easy backtracking. "source_file" keeps
            # the trial name; "source_path" records the real input file so a
            # window can actually be traced back to its origin.
            filename_dictionary[new_filename] = {
                "source_file": trial_name,
                "source_path": str(source_path),
                "data_start_index": index_start,
                "data_end_index": index_end,
                "sequence_length": sequence_length,
                "window_size": window_size
            }

    # The dictionary lives in a sub-directory so it is not mistaken for a window file.
    dictionary_path = os.path.join(output_directory, 'file_dict')
    os.makedirs(dictionary_path, exist_ok=True)

    dict_path = os.path.join(dictionary_path, 'filename_dict.json')

    with open(dict_path, 'w') as track_dict:
        json.dump(filename_dictionary, track_dict, indent=2)

    print("Filename Dictionary saved to :", dict_path)
    return filename_dictionary

In [None]:
# Inspect the list of training trace files found under train_base_path
train_data_path

In [None]:
# processing the data
# Every training trace is cut into windows of `sequence_length` events; the
# window start advances by `window_size` events, so a value of 1 means
# consecutive windows overlap in all but one event.
sequence_length = 50
window_size = 1
train_data_processed = read_process_train_data(train_data_path,sequence_length,window_size)

In [None]:
import os
import json
def load_data(file_path):
    """Load every window file (``*.json``) found directly inside ``file_path``.

    Files whose name starts with ``filename_dict`` are skipped, as are
    sub-directories and non-JSON files. Entries are visited in sorted name
    order: ``os.listdir`` gives no ordering guarantee, and anything that
    depends on row order downstream would otherwise vary between runs.

    Parameters
    ----------
    file_path : str
        Directory containing the window files.

    Returns
    -------
    tuple[list, list[str]]
        ``(traces, filenames)`` where ``traces[i]`` is the parsed JSON content
        of ``filenames[i]``.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    json.JSONDecodeError
        If a selected file is not valid JSON.
    """
    traces = []
    filenames = []
    for file in sorted(os.listdir(file_path)):
        if file.endswith('.json') and not file.startswith('filename_dict'):
            with open(os.path.join(file_path, file), 'r') as f:
                traces.append(json.load(f))
            filenames.append(file)
    return traces, filenames


In [None]:
# Directory holding the window files written by read_process_train_data
processed_train_data_path = './train_data_processed/'
# traces[i] is the parsed content of the file named files[i]
traces, files = load_data(processed_train_data_path)

In [None]:
# Number of window files that were loaded
print(len(traces))
# NOTE(review): this prints every file name; with a stride-1 windowing the list
# can be very long — consider printing only a slice such as files[:10].
print(files)

In [None]:
from libraries.anomaly_detection import extract_features_seglearn
# Build the feature table from the loaded windows with the project helper.
# NOTE(review): later cells call .dropna()/.index on the result and index
# `files` with its index labels, so this presumably returns a DataFrame with
# one row per trace in the same order as `traces` — confirm in
# libraries.anomaly_detection.
features_df = extract_features_seglearn(traces)

In [None]:
# Display the extracted feature table
features_df

In [None]:
from sklearn.cluster import KMeans

def cluster_features(df, n_clusters):
    """Fit a K-Means model with ``n_clusters`` clusters on ``df``.

    The model uses k-means++ initialisation, at most 300 iterations,
    30 restarts and a fixed random seed (42). The cluster label assigned to
    each row is printed, and the fitted model is returned.
    """
    settings = dict(init="k-means++", max_iter=300, n_init=30, random_state=42)
    model = KMeans(n_clusters, **settings)
    model.fit(df)
    print("kmeans:", model.labels_)
    return model

In [None]:
from sklearn.metrics import silhouette_score

def find_n_clusters_value(data, min_k, max_k):
    """Return the cluster count in ``[min_k, max_k]`` with the best silhouette score.

    For every candidate k a K-Means model (seed 42, 10 restarts) is fitted on
    ``data`` and scored with ``silhouette_score``. A candidate only replaces the
    current best when its score is strictly higher, so ties keep the smaller
    k. ``min_k`` is returned when no candidate scores above -1 or when the
    range is empty.
    """
    def _silhouette_for(k):
        # Fit on the data and score the resulting assignment.
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        return silhouette_score(data, model.fit_predict(data))

    winner, top_score = min_k, -1
    for candidate in range(min_k, max_k + 1):
        candidate_score = _silhouette_for(candidate)
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score

    return winner

In [None]:
# Clustering
from sklearn.preprocessing import StandardScaler
import joblib

scaler = StandardScaler()

# Drop windows whose features contain NaN. The explicit copy makes
# features_df_clean an independent frame, so adding the 'cluster' / 'file'
# columns below cannot trigger SettingWithCopyWarning or touch features_df.
features_df_clean = features_df.dropna().copy()
valid_index = features_df_clean.index
# NOTE(review): indexing `files` with index labels assumes features_df carries
# the default 0..n-1 index aligned with `files` — confirm.
filtered_files = [files[i] for i in valid_index]

# The scaler is fitted here on the training features and saved below for reuse.
X_scaled = scaler.fit_transform(features_df_clean)

# Pick the cluster count by silhouette score within the searched range.
N_Clusters = find_n_clusters_value(X_scaled, min_k=6, max_k=10)
print("\n optimal number of clusters: ", N_Clusters)

kmeans_model = cluster_features(pd.DataFrame(X_scaled), N_Clusters)
features_df_clean['cluster'] = kmeans_model.labels_
features_df_clean['file'] = filtered_files

# Scaled training features and their cluster labels are persisted next to the
# model so a later cell can load them again.
train_features = X_scaled
train_labels = kmeans_model.labels_

model_save_path = './trained_model/'
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(model_save_path, exist_ok=True)

joblib.dump(kmeans_model, f'{model_save_path}kmeans_model.pkl')

joblib.dump(train_features, f'{model_save_path}train_features.pkl')
joblib.dump(train_labels, f'{model_save_path}train_clusters.pkl')

# Saving the scaler
scaler_save_path = './scalers/'
os.makedirs(scaler_save_path, exist_ok=True)
joblib.dump(scaler, f'{scaler_save_path}scaler.pkl')


print("\n Cluster assignments")
print(features_df_clean[['file', 'cluster']])

# features_df.to_csv("clustered_features_seglearn.csv", index=False)
# print("\n file saved")

In [None]:
# Plotting the clusters
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Project the scaled features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# One point per window, coloured by its K-Means cluster
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=features_df_clean['cluster'],
    cmap='tab10',
    s=50,
)
ax.set_title("K Means(PCA)")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
fig.colorbar(scatter, ax=ax, label="Cluster")
ax.grid(True)
plt.show()


In [None]:
# prediction for test data
from libraries.anomaly_detection import test_single_for_clustering, merge_detections, get_correct_detections

## checking the detections against the ground truth
# Evaluation of the clustering-based detector on the test (faulty) traces.
# For every (trace, label) pair: detect anomalies, merge the detections,
# compare them against the ground truth, and accumulate the outcome.

# Passed to merge_detections as diff_val.
# NOTE(review): presumably the maximum gap for merging neighbouring detections — confirm in libraries.anomaly_detection.
DIFF_VAL = 5 
all_detections = []         # To store detections for each file
y_pred_all = []             # To store the predicted labels
y_true_all = []             # To store the ground truth labels
all_tp = []                 # To store all true positives
all_fp = []                 # To store all false positives
all_fn = []                 # To store all false negatives
all_gt = []                 # To store the ground truth
# NOTE(review): repeats the value used when windowing the training data; keep both in sync.
sequence_length = 50                                                # Sequence length for the model

# Loading the model and scaler
# NOTE(review): loaded_kmeans_model is not referenced again in this cell; the
# detector below receives the saved training features/labels and the scaler.
model_path = f'{model_save_path}kmeans_model.pkl'
loaded_kmeans_model = joblib.load(model_path)

scaler_path = f'{scaler_save_path}scaler.pkl'
loaded_scaler = joblib.load(scaler_path) 


# Scaled training features and their cluster labels, as saved by the training cell
trained_features_path = f'{model_save_path}train_features.pkl'
trained_clusters_path = f'{model_save_path}train_clusters.pkl'
trained_features = joblib.load(trained_features_path)
trained_cluster_labels = joblib.load(trained_clusters_path)
print(trained_features)
print(trained_cluster_labels)

# Iterating through each test data file and label file
# NOTE(review): zip() pairs by position and stops at the shorter list, so this
# relies on both lists being sorted into matching order and having equal length.
for test_data, test_label in zip(test_data_path, test_label_path):
    detection, inference_time = test_single_for_clustering(test_data,sequence_length,trained_features, trained_cluster_labels,loaded_scaler)            # Detecting anomalies in the test data
    print("Detection : ", detection)

    print("len(detection) : ", len(detection))

    # agg_ts is returned by the helper but not used in this cell
    merge_detection, agg_ts = merge_detections(detection, diff_val=DIFF_VAL)

    print("Merge detection : ", merge_detection)
    
    ground_truth_raw = read_traces(test_label)                                               # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                                                # extract labels from dictionary from ground truth data

    # 'labels' is a mapping keyed by trace name; only its first entry is used
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]

    correct_pred, rest_pred, y_pred, y_true, false_neg = get_correct_detections(merge_detection, ground_truth)  # Comparing detected anomaly with ground truth

    # Flat label lists feed the metric cells; the per-file tuples keep the
    # trace path and label path alongside each result for later inspection.
    y_pred_all.extend(y_pred)          # predicted labels
    y_true_all.extend(y_true)          # actual ground truth labels
    all_detections.append((test_data, merge_detection, test_label))
    all_tp.append((test_data, correct_pred, test_label))
    all_fp.append((test_data, rest_pred, test_label))
    all_fn.append((test_data, false_neg, test_label))
    all_gt.append((test_data, ground_truth, test_label))

    print("Inference time : ", inference_time)

    # break

In [None]:
# Inspect the list of test trace files that were evaluated
test_data_path

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Turn the accumulated label lists into arrays for scikit-learn
y_pred_all, y_true_all = np.array(y_pred_all), np.array(y_true_all)

# Calculate evaluation metrics (ground truth first, predictions second)
precision, recall, f1 = (
    scorer(y_true_all, y_pred_all)
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix over all evaluated traces (rows: ground truth, columns: predictions)
conf_matrix = confusion_matrix(y_true_all, y_pred_all)
# NOTE(review): two display labels are supplied, which presumes both classes
# occur in the data (2x2 matrix) and that the lower label value means
# 'Normal' — confirm, or pass labels= to confusion_matrix explicitly.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['Normal', 'Anomaly'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()