# Clustering - Fixed Window - DL extractor
- Use Deep Learning based extractors to get abstract features by giving the entire detection subseq as input
- Use these features to cluster the detections with similar anomalies
- The feature extraction is NOT dependent on the corresponding normal behaviour subtrace
- This will serve as benchmark peroformance with off the shelf models, without any optimization 
- We tested this approach across all applications

In [1]:
'''
TODO:
- select a DL based feature extractor
- check if we need to fine-tune the feature extractor on training data before using it on test data
- make a list of suitable clustering algorithms for different usecases

CLUSTERING
- use the feature extractor to extract features from the detections
- try to cluster the detections based on the extracted features

Steps:
- try TSFEDL for anomaly detection using sepcialized anomaly detectors (detect)
 
- try TSFEDL to extract features using SOTA DL models (diag)
- try TSFEL to extract handcrafted features (statistical, spectral, temporal, etc) (diag)
'''


'\nTODO:\n- select a DL based feature extractor\n- check if we need to fine-tune the feature extractor on training data before using it on test data\n- make a list of suitable clustering algorithms for different usecases\n\nCLUSTERING\n- use the feature extractor to extract features from the detections\n- try to cluster the detections based on the extracted features\n\nSteps:\n- try TSFEDL for anomaly detection using sepcialized anomaly detectors (detect)\n \n- try TSFEDL to extract features using SOTA DL models (diag)\n- try TSFEL to extract handcrafted features (statistical, spectral, temporal, etc) (diag)\n'

In [1]:
import json
import os
import sys
sys.path.append('../')  ### to detect libraries in the parent directory
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from collections import defaultdict
from libraries.utils import *
from libraries.exeint import exeInt
import plotly.express as px
from statistics import mode
from sklearn import preprocessing


# ############ configuration - trace ################
# ############################################


CODE = 'theft_protection'       ### application (code)       ###  'theft_protection', 'mamba2', 'lora_ducy'
BEHAVIOUR_FAULTY = 'faulty_data'            ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'            ### normal, faulty_data
THREAD = 'single'           ### single, multi
VER = 3                     ### format of data collection

base_dir = '../../trace_data' ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print(normalbase_path)
print(faultybase_path)


################# configuration - diag ################
IS_VAR_WINDOW = False             ### True, False; wether to use variable window size or not

#####################################################


ref_samples_basepath = os.path.join(normalbase_path, 'diag_refsamples')
ref_var_samples_basepath = os.path.join(normalbase_path, 'diag_var_refsamples')
diag_subseq_basepath = os.path.join(faultybase_path, 'diag_subseq')
subseq_label_basepath = os.path.join(diag_subseq_basepath, 'subseq_labels')


print('ref_samples_path:\n', ref_samples_basepath)
print('ref_var_samples_path:\n', ref_var_samples_basepath)
print('diag_subseq_path:\n', diag_subseq_basepath)

######### get paths #######################
ref_samples_path = [os.path.join(ref_samples_basepath, x) for x in os.listdir(ref_samples_basepath)]
ref_var_samples_path = [os.path.join(ref_var_samples_basepath, x) for x in os.listdir(ref_var_samples_basepath)]   

train_varlist_path = os.listdir(normalbase_path)
train_varlist_path = [os.path.join(normalbase_path, x) for x in train_varlist_path if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

test_subseq_path = [os.path.join(diag_subseq_basepath, x) for x in os.listdir(diag_subseq_basepath)]
test_labels_path = [os.path.join(subseq_label_basepath, x) for x in os.listdir(subseq_label_basepath)]

# ### remove.Ds_store from all lists
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]
ref_samples_path = [x for x in ref_samples_path if '.DS_Store' not in x]
ref_var_samples_path = [x for x in ref_var_samples_path if '.DS_Store' not in x]
test_subseq_path = [x for x in test_subseq_path if '.DS_Store' not in x if '.json' in x]
test_labels_path = [x for x in test_labels_path if '.DS_Store' not in x]


varlist_path.sort()

# print(paths_log)
# print(paths_traces)
# print(varlist_path)
# print(paths_label)

if IS_VAR_WINDOW:
    train_data_path = ref_var_samples_path
else:
    train_data_path = ref_samples_path

test_data_path = test_subseq_path

print('train_data:\n', train_data_path)
print(len(train_data_path))
print('test_data:\n', test_data_path)
print(len(test_data_path))
print('test_labels:\n', test_labels_path)




../../trace_data/theft_protection/single_thread/version_3/normal
../../trace_data/theft_protection/single_thread/version_3/faulty_data
ref_samples_path:
 ../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples
ref_var_samples_path:
 ../../trace_data/theft_protection/single_thread/version_3/normal/diag_var_refsamples
diag_subseq_path:
 ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq
train_data:
 ['../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/379.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/396.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/115.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/400.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsamples/142.json', '../../trace_data/theft_protection/single_thread/version_3/normal/diag_refsam

## Calculate Feature Vectors

- For fixed window size, load all the ref samples before hand
- For variable window, load the map_len; further load files only with the suitable len

Feature extraction using DL:
- load DL model with appropriate weights
- make all the detections of same length (500)
- give the detections as inputs and extract features
- cluster these features

### Feature Extraction with TSFEL


In [2]:
import tsfel
import pandas as pd

In [3]:
test_feature_vectors = []
test_files = []
feature_list = []
for test_data in test_data_path[0:]:
    print('test_data:', test_data)
    ### read the subseq
    test_trace = read_traces(test_data)
    # print('test_trace:', test_trace)
    test_data_len = len(test_trace)
    print('test_data_len:', test_data_len)

    if test_data_len > 500:
        # print('test data length is more than 500, skipping...')
        # missing_features.append((test_data, 'test data length is more than 500'))
        # continue

        print('test data length is more than 500, truncating...')
        test_trace = test_trace[:500]
        test_data_len = 500
    
    df = pd.DataFrame(test_trace, columns=['event', 'ts'],)
    ### seperate event and ts
    test_events = df['event']
    test_ts = df['ts']

    # print(df)
    # print(test_events)


    ### extract features
    feat_all = []
    ### select only the sequence of events without timestamps
    cfg_file = tsfel.get_features_by_domain('temporal')               # All features will be extracted.
    tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)

    cfg_file = tsfel.get_features_by_domain('statistical')               # All features will be extracted.
    stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)   

    # cfg_file = tsfel.get_features_by_domain('fractal')               # All features will be extracted. 
    # frac_features = tsfel.time_series_features_extractor(cfg_file, test_events)

    feat_all.extend(tmp_features.values[0])
    feat_all.extend(stat_features.values[0])
    # feat_all.extend(frac_features.values[0])

    test_files.append(test_data)
    test_feature_vectors.append( feat_all )
    # print(feat_all.columns)
    # print(feat_all.values[0])
    # print(feat_all.values[0].shape)
    # break

feature_list.extend(tmp_features.columns)
feature_list.extend(stat_features.columns)
# feature_list.extend(frac_features.columns)

test_feature_vectors= pd.DataFrame(test_feature_vectors, columns=feature_list)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_379-423.json
test_data_len: 44


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_1217-1225.json
test_data_len: 8


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1303-1339.json
test_data_len: 36


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_623-662.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_2500-2544.json
test_data_len: 44


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_2296-2335.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_908-942.json
test_data_len: 34


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_410-449.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_526-534.json
test_data_len: 8


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_775-846.json
test_data_len: 71


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_313-352.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_1813-1852.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1104-1179.json
test_data_len: 75


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_310-321.json
test_data_len: 11


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_720-759.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1230-1267.json
test_data_len: 37


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1710-1779.json
test_data_len: 69


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_2280-2324.json
test_data_len: 44


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_2055-2092.json
test_data_len: 37


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1375-1419.json
test_data_len: 44


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1496-1533.json
test_data_len: 37


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_2416-2455.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_1841-1978.json
test_data_len: 137


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_2173-2212.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_2102-2141.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_526-565.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_2427-2471.json
test_data_len: 44


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_1101-1140.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_1616-1655.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_2055-2103.json
test_data_len: 48


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_116-203.json
test_data_len: 87


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial2_625-716.json
test_data_len: 91


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_952-991.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


test_data: ../../trace_data/theft_protection/single_thread/version_3/faulty_data/diag_subseq/trace_trial1_190-229.json
test_data_len: 39


  tmp_features = tsfel.time_series_features_extractor(cfg_file, test_events)


  stat_features = tsfel.time_series_features_extractor(cfg_file, test_events)


array([[-0.03565052,  0.36514837,  0.58815489,  0.27683665,  0.        ,
         0.        ,  0.02747211,  0.22383852,  0.        ,  1.02333886,
        -0.2151274 ,  0.00616152, -0.48007936,  0.99366079, -0.29830502,
        -0.28741942, -0.26092079,  0.04438817],
       [-1.5614928 , -2.73861279, -2.06197904, -1.88497068,  0.        ,
         0.        , -0.90657972, -3.29121428,  0.        , -2.8703066 ,
        -0.2151274 , -1.40790774,  2.48768395, -1.70909656,  3.52101584,
         2.58586636,  3.99020389,  0.05155828],
       [-0.39928583,  0.36514837,  0.8427756 , -0.55700332,  0.        ,
         0.        ,  0.02747211, -0.46425977,  0.        ,  0.18383967,
        -0.45107358, -0.36044903, -0.48007936, -0.35771789, -0.10969659,
         0.01583977, -0.23311931,  1.61000207],
       [-0.29946437,  0.36514837, -0.68016256,  0.96899426,  0.        ,
         0.        ,  0.02747211,  0.91267033,  0.        , -0.36121199,
        -0.45107358, -0.20333022, -0.48007936, -0.357

In [7]:
### filter features
print('test_feature_vectors:', test_feature_vectors.shape)
corr_features, test_feature_vectors = tsfel.correlated_features(test_feature_vectors, drop_correlated=True)
print('test_feature_vectors:', test_feature_vectors.shape)
# print('corr_features:', corr_features)

### Normalising Features
scaler = preprocessing.StandardScaler()
test_feature_vectors = scaler.fit_transform(test_feature_vectors)


test_feature_vectors: (34, 45)
test_feature_vectors: (34, 18)


In [9]:
test_feature_vectors.shape

(34, 18)

## Prepare Data

In [10]:
padded_features = []
test_class = []

    
### load the labels
test_class_labels = read_json(test_labels_path[0])
# print('test_class_labels:', len(test_class_labels))
# print('test_class_labels:', test_class_labels)

### prepare the feature vectors for classification
for (test_data, feature_vector) in zip(test_files, test_feature_vectors):
    file_name = test_data.split('/')[-1].split('.')[0]
    # print('file_name:', file_name)
    class_list = test_class_labels[file_name]
    # print('class_list:', class_list)
    class_label = None
    # break

    # print('test_data:', test_data)
    # print('feature_vector:', np.array(feature_vector).shape)
    # print('test_class_label:', test_class_labels[file_name])

    if len(class_list) == 1:
        ### Hardcode the class label for all applications
        class_label = class_list

    else:
        # print('multiple class labels found for the test data:', test_data)
        class_label = class_list


        
    if class_label != None:
        feature_vector = np.array(feature_vector)

        padded_features.append(feature_vector)
        # test_files.append(test_data)
        test_class.append(class_label)
        # print('feature_vector:', feature_vector.shape[0])

        # pad_num = 500 - feature_vector.shape[0]
        # print('pad_num:', pad_num)
        # padded_features = np.pad(feature_vector, (0,pad_num), 'constant', constant_values=(0))
        # print('padded_features:', padded_features.shape)

    # break

In [11]:
np.array(padded_features).shape

(34, 18)

## Clustering


In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

unique_classes = []
map_labels = dict()
ground_truth = []
label_key = 0
for tc in test_class:
    # print(tc)
    if tc in unique_classes:
        ground_truth.append(map_labels[(str(tc))])
        continue
    else:
        print('tc:', tc)
        unique_classes.append(tc)
        map_labels[str(tc)] = label_key
        ground_truth.append(map_labels[(str(tc))])
        label_key += 1

    
N_CLUSTER = len(unique_classes)
print('N_CLUSTER:', N_CLUSTER)

data = np.array(padded_features)

# Flatten the feature vectors (reshape to (48, 1000))
if len(np.array(padded_features).shape) == 3:
    # Reshape the data for clustering
    num_samples, num_features, num_points = data.shape
    data_reshaped = data.reshape(num_samples, num_features * num_points)
else:
    data_reshaped = data


#############################################################################################

# Apply K-Means clustering
kmeans = KMeans(init="k-means++", max_iter=300, n_clusters=N_CLUSTER, n_init=30 )   # n_clusters=N_CLUSTER, random_state=0, n_init=4 
kmeans.fit(data_reshaped)
# Get cluster labels
labels = kmeans.labels_
print('kmeans:', labels)

tc: [2]
tc: [1]
tc: [2, 2, 2]
tc: [2, 2]
tc: [2, 2, 2, 2]
tc: [2, 2, 1]
N_CLUSTER: 6
kmeans: [4 1 4 0 4 0 5 0 1 4 0 0 2 3 0 4 2 4 4 4 4 0 2 0 0 0 4 0 0 4 2 2 0 0]


In [15]:
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score, f1_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# # Sample data: Replace with your actual predictions
# kmeans_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1])  # Replace with K-Means predictions
# ground_truth = np.array([2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1])  # Given ground truth

ground_truth = np.array(ground_truth)
labels = np.array(labels)

# Map cluster labels to ground truth labels using the Hungarian algorithm
def best_cluster_mapping(y_true, y_pred):
    """Finds the best mapping between predicted and true labels using the Hungarian algorithm."""
    unique_classes = np.unique(y_true)
    unique_clusters = np.unique(y_pred)
    cost_matrix = np.zeros((len(unique_classes), len(unique_clusters)))

    for i, cls in enumerate(unique_classes):
        for j, cluster in enumerate(unique_clusters):
            cost_matrix[i, j] = -np.sum((y_true == cls) & (y_pred == cluster))  # Negative for maximization

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    mapping = {unique_clusters[col]: unique_classes[row] for row, col in zip(row_ind, col_ind)}

    return np.vectorize(mapping.get)(y_pred)  # Map predictions

print('ground_truth:', ground_truth)
print('labels:', labels)
# Remap cluster labels to best-matching class labels
remapped_labels = best_cluster_mapping(ground_truth, labels)

# Evaluation Metrics
accuracy = accuracy_score(ground_truth, remapped_labels)
f1 = f1_score(ground_truth, remapped_labels, average='weighted')
conf_matrix = confusion_matrix(ground_truth, remapped_labels)

ari = adjusted_rand_score(ground_truth, labels)
nmi = normalized_mutual_info_score(ground_truth, labels)

# Print results
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print("Confusion Matrix:\n", conf_matrix)

ground_truth: [0 0 0 1 0 1 0 1 0 2 1 1 2 0 1 0 3 0 0 0 0 1 4 1 1 1 0 1 1 0 5 3 1 1]
labels: [4 1 4 0 4 0 5 0 1 4 0 0 2 3 0 4 2 4 4 4 4 0 2 0 0 0 4 0 0 4 2 2 0 0]
F1 Score: 0.7748
Accuracy: 0.7647
Adjusted Rand Index (ARI): 0.7269
Normalized Mutual Information (NMI): 0.7352
Confusion Matrix:
 [[10  0  1  0  2  1]
 [ 0 14  0  0  0  0]
 [ 1  0  0  1  0  0]
 [ 0  0  0  2  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0  0  1  0  0]]
