- K-Means Clustering:

    - Compute a feature vector for each time series (see the feature-extraction notes in the k-means section below) and use K-Means clustering to group similar patterns.

- Hierarchical Clustering:

    - Cluster time series hierarchically based on similarity.

__Things to Consider__

- Here we work with subtraces of 50 events each.
- Every subtrace that contains any anomaly is labelled as anomalous


In [None]:
import json
import os
import numpy as np
import pandas as pd

def load_sample(file_path):
    """Load one saved NumPy array from ``file_path`` and return it.

    Pickled object arrays are rejected (``allow_pickle=False``).
    """
    return np.load(file_path, allow_pickle=False)

In [None]:
# ------------------------------------------------------------------
# Configuration: selects which recorded traces the notebook works on.
# ------------------------------------------------------------------
code = 'theft_protection'    # application (code)
behaviour = 'faulty_data'    # normal, faulty_data
thread_typ = 'single'        # single, multi
version = 2.2                # format of data collection
sub_len = 50                 # subtrace length, used in the subtrace directory name

# Root directory of the logs; can be replaced with 'csv', 'exe_plot', 'histogram'.
base_dir = '../trace_data'
log_path = f'{base_dir}/{code}/{thread_typ}_thread/version_{version}/{behaviour}'

# Directory that holds the pre-cut subtraces.
subtrace_path = f"../data-subtraces/version_{version}/{behaviour}/subtraces/{sub_len}"

print(log_path)

In [None]:
### get files from subtraces
# os.listdir() returns entries in arbitrary order, so both listings are sorted;
# without a stable input order the seeded shuffle applied to these paths later
# in the notebook would not be reproducible from one machine/run to the next.
# macOS '.DS_Store' metadata files are not subtraces and are filtered out.
anomalies_files = sorted(
    name for name in os.listdir(subtrace_path + '/anomalies') if name != '.DS_Store'
)
normal_files = sorted(
    name for name in os.listdir(subtrace_path + '/normal') if name != '.DS_Store'
)

# Full paths, in the same order as the corresponding file-name lists.
anomalies_path = [subtrace_path + '/anomalies/' + name for name in anomalies_files]
normal_path = [subtrace_path + '/normal/' + name for name in normal_files]


In [None]:
# Notebook display: show the list of anomaly subtrace paths built in the previous cell.
anomalies_path

## k-means clustering

_Feature Extraction for K-Means Clustering:_

__Execution Intervals:__

- Calculate the mean, standard deviation, and other statistical measures of the time differences between consecutive executions for each variable within a subtrace.

__Event Frequency:__

- Count the frequency of each variable within the subtrace.

__Sequence Patterns:__

- Convert the subtrace into a sequence of events and use techniques like sequence embedding to represent these sequences numerically.

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def extract_features(subtrace, variables=None):
    """Build the K-Means feature vector of one subtrace.

    Parameters
    ----------
    subtrace : array-like of shape (n_events, 2)
        Rows of ``(variable, timestamp)``. Timestamps must be convertible
        with ``int()``. Nested lists are accepted as well as ndarrays.
    variables : sequence, optional
        Fixed variable vocabulary. When given, exactly one count per entry is
        emitted, in this order, with 0 for variables absent from the subtrace,
        so every subtrace yields a vector of the same length. When omitted,
        only the variables present are counted, in sorted order (the original
        behaviour), so the vector length can differ between subtraces.

    Returns
    -------
    list
        ``[mean_interval, std_interval, count_0, count_1, ...]``.
    """
    events = np.asarray(subtrace)

    timestamps = [int(timestamp) for timestamp in events[:, 1]]
    execution_intervals = np.diff(timestamps)

    if execution_intervals.size:
        mean_execution_interval = np.mean(execution_intervals)
        std_execution_interval = np.std(execution_intervals)
    else:
        # Fewer than two events: there is no interval, and np.mean/np.std of
        # an empty array would produce NaN (plus a RuntimeWarning).
        mean_execution_interval = std_execution_interval = 0.0

    # Count occurrences of unique variables in the subtrace
    unique_variables, variable_counts = np.unique(events[:, 0], return_counts=True)
    event_frequency = dict(zip(unique_variables.tolist(), variable_counts.tolist()))

    if variables is None:
        counts = list(event_frequency.values())
    else:
        counts = [event_frequency.get(variable, 0) for variable in variables]

    # Additional features can be added based on your specific requirements

    return [mean_execution_interval, std_execution_interval] + counts


In [None]:
# Ground-truth labels, one per file: 0 for normal subtraces, 1 for anomalous ones.
normal_labels = [0 for _ in normal_files]
anomalies_labels = [1 for _ in anomalies_files]

# Alternative protocol, kept for reference (currently disabled):
# #### split the normal data in 80:20 ratio
# X_train, X_test, y_train, y_test = train_test_split(normal_files, normal_labels, test_size=0.2, random_state=42)

# #### combine the train and test data
# X_test += anomalies_files
# y_test += anomalies_labels

# #### shuffle test files
# X_test, y_test = shuffle(X_test, y_test, random_state=42)

# Pool both classes; paths and labels are concatenated in the same order.
all_files = [*normal_path, *anomalies_path]
all_labels = [*normal_labels, *anomalies_labels]

# Shuffle paths and labels together with a fixed seed.
all_files, all_labels = shuffle(all_files, all_labels, random_state=42)

In [None]:


# Extract features for each subtrace
# Every path in all_files is loaded and converted into one feature list by
# extract_features(); the lists are appended in the same order as all_files.
all_features = []
for sub_path in all_files:
    # allow_pickle=False: refuse pickled object arrays when reading the file
    subtrace = np.load(sub_path, allow_pickle=False)
    # print(subtrace)

    # NOTE: `features` stays bound after the loop and then only holds the
    # vector of the LAST subtrace; the complete collection is `all_features`.
    features = extract_features(subtrace)
    all_features.append(features)


In [None]:
# Notebook display: show the per-subtrace feature lists collected so far.
all_features

In [None]:
# Convert the feature matrix to a numpy array.
# Use the accumulated `all_features` (one row per subtrace). The loop variable
# `features` only holds the vector of the last subtrace, which is 1-D and is
# rejected by StandardScaler / KMeans (they expect shape (n_samples, n_features)).
# dtype=float makes NumPy raise immediately if the rows have unequal lengths.
X = np.array(all_features, dtype=float)

# Normalize features (optional but can be beneficial for K-Means)
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Fit K-Means model
num_clusters = 2  # Adjust based on your understanding of the data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_normalized)

# Get cluster assignments (one cluster index per row of X, in all_files order)
cluster_assignments = kmeans.labels_

# Assume anomalous cluster is the one with fewer instances
anomalous_cluster = np.argmin(np.bincount(cluster_assignments))

# Label subtraces based on cluster assignments
# If a subtrace is assigned to the anomalous cluster, label it as anomalous
labels = ['Anomalous' if label == anomalous_cluster else 'Normal' for label in cluster_assignments]

# Print or use the labels as needed
for i, label in enumerate(labels):
    print(f"Subtrace {i+1}: {label}")

## hierarchical clustering

_Feature Extraction for Hierarchical Clustering:_

__Temporal Patterns:__

- Use the timestamps of events within the subtrace to capture temporal patterns. Features could include mean, standard deviation, and other statistical measures of timestamps.

__Event Co-occurrence:__

- Create a matrix indicating the co-occurrence of events within the subtrace.

__Time Series Characteristics:__

- Extract basic statistical features such as mean, variance, skewness, and kurtosis for the entire subtrace.

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform
import numpy as np

def extract_features_for_hierarchical(subtrace):
    """Build the basic feature vector of one subtrace for hierarchical clustering.

    Parameters
    ----------
    subtrace : iterable of (variable, timestamp) pairs
        Nested lists or a 2-column ndarray. Timestamps must be convertible
        with ``int()`` (this also covers timestamps stored as strings).

    Returns
    -------
    list
        ``[mean_interval, std_interval, count_0, count_1, ...]`` where the
        counts are the number of events per variable, ordered by sorted
        variable name so the columns line up across subtraces that contain
        the same variables.
    """
    variables = [var for var, _ in subtrace]
    timestamps = [int(timestamp) for _, timestamp in subtrace]
    execution_intervals = np.diff(timestamps)

    if execution_intervals.size:
        mean_execution_interval = np.mean(execution_intervals)
        std_execution_interval = np.std(execution_intervals)
    else:
        # Fewer than two events: no interval exists; avoid NaN from empty mean/std.
        mean_execution_interval = std_execution_interval = 0.0

    # Frequency of each variable. The previous implementation counted exact
    # [variable, timestamp] pairs, which is 1 for every distinct event rather
    # than the number of times the variable occurs.
    _, variable_counts = np.unique(variables, return_counts=True)

    # Additional features can be added based on your specific requirements

    return [mean_execution_interval, std_execution_interval] + variable_counts.tolist()

def calculate_event_co_occurrence_matrix(subtrace):
    """Count same-timestamp co-occurrences between the variables of a subtrace.

    Parameters
    ----------
    subtrace : iterable of (variable, timestamp) pairs
        Nested lists or a 2-column ndarray.

    Returns
    -------
    numpy.ndarray of shape (n_vars, n_vars)
        Rows and columns follow the sorted unique variable names, so the
        layout is deterministic. Entry ``[i, j]`` is the number of events of
        variable ``i`` whose timestamp also carries an event of variable
        ``j``. The diagonal therefore holds each variable's event count.
    """
    events = [(var, timestamp) for var, timestamp in subtrace]

    # Sorted instead of raw set order: set iteration order of strings changes
    # between interpreter runs, which would permute the matrix.
    unique_vars = sorted({var for var, _ in events})
    position = {var: idx for idx, var in enumerate(unique_vars)}

    # Index the variables seen at each timestamp once, instead of rescanning
    # the whole subtrace for every (event, variable) combination.
    vars_at_timestamp = {}
    for var, timestamp in events:
        vars_at_timestamp.setdefault(timestamp, set()).add(var)

    co_occurrence_matrix = np.zeros((len(unique_vars), len(unique_vars)))
    for var, timestamp in events:
        for other in vars_at_timestamp[timestamp]:
            co_occurrence_matrix[position[var], position[other]] += 1

    return co_occurrence_matrix

def hierarchical_clustering_features(subtrace):
    """Return one flat feature vector for a subtrace.

    The vector is the output of ``extract_features_for_hierarchical``
    followed by the strict upper triangle (diagonal excluded) of the matrix
    returned by ``calculate_event_co_occurrence_matrix``, flattened row by row.

    Parameters
    ----------
    subtrace : iterable of (variable, timestamp) pairs

    Returns
    -------
    list
        Flat list of numbers.
    """
    features = extract_features_for_hierarchical(subtrace)
    co_occurrence_matrix = calculate_event_co_occurrence_matrix(subtrace)

    # Flatten the upper triangular part of the co-occurrence matrix (excluding
    # the diagonal). The previous squareform(pdist(...)) call returned a 2-D
    # distance matrix, so list() produced row arrays and the resulting
    # "vector" was ragged instead of flat.
    upper_triangle = np.triu_indices_from(co_occurrence_matrix, k=1)
    flattened_co_occurrence = co_occurrence_matrix[upper_triangle]

    # Concatenate features with flattened co-occurrence matrix
    hierarchical_features = list(features) + flattened_co_occurrence.tolist()

    return hierarchical_features


In [None]:
# Toy subtraces for a quick check (replace with your own data);
# every event is a [variable, timestamp] pair.
subtraces = [
    [['var1', 100], ['var2', 110], ['var1', 120]],
    [['var2', 110], ['var1', 120], ['var2', 130]],
    [['var1', 140], ['var2', 150], ['var1', 160]],
    # ... more subtraces
]

# One feature vector per example subtrace, printed for inspection.
features_hierarchical = list(map(extract_features_for_hierarchical, subtraces))
print("Hierarchical Features:", features_hierarchical)

In [None]:
# Extract features for each subtrace
features_hierarchical = [extract_features_for_hierarchical(subtrace) for subtrace in subtraces]

# Calculate event co-occurrence matrix
co_occurrence_matrices = [calculate_event_co_occurrence_matrix(subtrace) for subtrace in subtraces]

# Flatten the upper triangular part of the co-occurrence matrices (excluding the diagonal).
# Each matrix becomes a 1-D vector; the previous squareform(pdist(...)) kept it
# 2-D, so the axis=1 concatenation below failed on mismatched dimensions.
flattened_co_occurrence = [matrix[np.triu_indices_from(matrix, k=1)] for matrix in co_occurrence_matrices]

# Concatenate features with flattened co-occurrence matrices
# (dtype=float makes NumPy raise right here if the per-subtrace vectors differ
# in length, e.g. when subtraces contain different numbers of variables)
hierarchical_features = np.concatenate(
    [
        np.array(features_hierarchical, dtype=float),
        np.array(flattened_co_occurrence, dtype=float),
    ],
    axis=1,
)

# Normalize features (optional but can be beneficial for Hierarchical Clustering)
scaler = StandardScaler()
hierarchical_features_normalized = scaler.fit_transform(hierarchical_features)

# Fit Hierarchical Clustering model
num_clusters = 2  # Adjust based on your understanding of the data
hierarchical = AgglomerativeClustering(n_clusters=num_clusters)
hierarchical_assignments = hierarchical.fit_predict(hierarchical_features_normalized)

# Assume anomalous cluster is the one with fewer instances
anomalous_cluster_hierarchical = np.argmin(np.bincount(hierarchical_assignments))

# Label subtraces based on cluster assignments
# If a subtrace is assigned to the anomalous cluster, label it as anomalous
labels_hierarchical = ['Anomalous' if label == anomalous_cluster_hierarchical else 'Normal' for label in hierarchical_assignments]

# Print or use the labels as needed
for i, label in enumerate(labels_hierarchical):
    print(f"Subtrace {i+1}: {label}")

## Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_clustering(labels_true, labels_pred):
    """Score predicted labels against the ground truth.

    Both arguments are iterables of labels. A label equal to the string
    'Anomalous' is mapped to 1 and every other label to 0 before scoring.

    Returns the tuple ``(precision, recall, f1)`` as computed by
    ``precision_score``, ``recall_score`` and ``f1_score``.
    """
    def _to_binary(labels):
        # 1 for anomalous, 0 for everything else
        return np.array([1 if label == 'Anomalous' else 0 for label in labels])

    y_true = _to_binary(labels_true)
    y_pred = _to_binary(labels_pred)

    return (
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )

In [None]:
# Example data: placeholder ground truth plus placeholder predictions that
# happen to match it exactly.
labels_true = ['Normal', 'Anomalous', 'Normal', 'Normal', 'Anomalous']
# Replace with your predicted labels for K-Means
labels_pred_kmeans = list(labels_true)
# Replace with your predicted labels for Hierarchical
labels_pred_hierarchical = list(labels_true)



In [None]:
# Score the K-Means predictions against the ground truth and print a short report
# (metrics rounded to two decimals, followed by one blank line).
precision_kmeans, recall_kmeans, f1_kmeans = evaluate_clustering(labels_true, labels_pred_kmeans)
print(
    "K-Means Model:",
    f"Precision: {precision_kmeans:.2f}",
    f"Recall: {recall_kmeans:.2f}",
    f"F1-score: {f1_kmeans:.2f}",
    "",
    sep="\n",
)



In [None]:
# Score the hierarchical-clustering predictions against the ground truth and
# print a short report (metrics rounded to two decimals).
precision_hierarchical, recall_hierarchical, f1_hierarchical = evaluate_clustering(labels_true, labels_pred_hierarchical)
print(
    "Hierarchical Clustering Model:",
    f"Precision: {precision_hierarchical:.2f}",
    f"Recall: {recall_hierarchical:.2f}",
    f"F1-score: {f1_hierarchical:.2f}",
    sep="\n",
)