In [1]:
# Dustminer
# Training phase
# 1. Loading the normal data -> train data
# 2. Labelling each event as 'good'
# 3. Segment them into sliding windows (segment_log defaults: window_size=5, step=1)
# 4. Then finding frequent patterns in the data.

In [2]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [3]:
# Configuration: selects which recorded data set the notebook works on
CODE = 'theft_protection'               # application (code): theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'        # behaviour folder: normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'             # behaviour folder: normal, faulty_data
THREAD = 'single'                       # single, multi
VER = 3                                 # format of data collection

base_dir = './trace_data'               # author's note: can be replaced with 'csv', 'exe_plot', 'histogram'

# Both data folders share the layout <base>/<code>/<thread>_thread/version_<ver>/<behaviour>
normalbase_path = '/'.join((base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_NORMAL))
faultybase_path = '/'.join((base_dir, CODE, f'{THREAD}_thread', f'version_{VER}', BEHAVIOUR_FAULTY))

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

Normal base path: ./trace_data/theft_protection/single_thread/version_3/normal
Faulty base path: ./trace_data/theft_protection/single_thread/version_3/faulty_data


In [4]:
# Folder holding the normal-behaviour traces used for training
train_base_path = os.path.join(normalbase_path, 'train_data')
print("Train base path:", train_base_path)

print("Current working directory:", os.getcwd())
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

# Drop macOS '.DS_Store' entries from every path list
train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

# os.listdir() returns entries in arbitrary order, so the training lists are
# sorted as well; this keeps runs reproducible and consistent with the lists below.
train_data_path.sort()
train_varlist_path.sort()
paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

# Test traces and their label files are paired positionally (via zip) during
# evaluation, so both lists rely on the sorted order above.
test_data_path = paths_traces
test_label_path = paths_label

Train base path: ./trace_data/theft_protection/single_thread/version_3/normal\train_data
Current working directory: c:\Uni Bremen\Job\Comnets\Anomaly Detection\Anomaly_Detection\dustminer


In [5]:
# Display the training trace files found under train_base_path
train_data_path

['./trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_0_110.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_1250_2000.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_150_900.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_2050_2300.json']

In [6]:
# Display the label files that accompany the faulty test traces
test_label_path

['./trace_data/theft_protection/single_thread/version_3/faulty_data\\labels\\trace_trial1_labels.json',
 './trace_data/theft_protection/single_thread/version_3/faulty_data\\labels\\trace_trial2_labels.json',
 './trace_data/theft_protection/single_thread/version_3/faulty_data\\labels\\trace_trial3_labels.json']

In [7]:
# Load training data
def load_data(file_paths):
    """Read trace files and return one event-id sequence per file.

    Each file is read with ``read_traces``. When the result is a list, the
    first field of every entry is converted to ``int`` and the resulting
    sequence is collected. Files whose content is not a list are skipped.

    Args:
        file_paths: iterable of trace file paths.

    Returns:
        List of lists of ints, in the order the files were given.
    """
    id_sequences = []
    for path in file_paths:
        loaded = read_traces(path)
        if not isinstance(loaded, list):
            # anything other than a list of entries is skipped
            continue
        id_sequences.append([int(entry[0]) for entry in loaded])
    return id_sequences

# Load every training file as a sequence of event ids (one list per file)
train_data = load_data(train_data_path)
# NOTE(review): prints every loaded sequence in full; a summary (number of traces
# and their lengths) may be easier to read
print(train_data)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15], [13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13, 14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 

In [8]:
def label_event(event, anomaly_set):
    """Label an event by looking up its first field in ``anomaly_set``.

    Args:
        event: indexable whose element 0 is the value to look up.
        anomaly_set: container supporting ``in``.

    Returns:
        'bad' when ``event[0]`` is contained in ``anomaly_set``, else 'good'.
    """
    if event[0] in anomaly_set:
        return 'bad'
    return 'good'

In [9]:
def extract_id_from_ground_truth(ground_truth):
    """Flatten the first two fields of every ground-truth entry into one list.

    Args:
        ground_truth: iterable of indexable entries with at least two fields.

    Returns:
        Flat list ``[e0[0], e0[1], e1[0], e1[1], ...]`` in input order.
    """
    return [
        field
        for entry in ground_truth
        for field in (entry[0], entry[1])
    ]

In [10]:
# Collect the ground-truth ids of every test label file.
# The result is a list of lists: one inner list per label file.
all_ground_truth_id = []
for test_label in test_label_path:
    ground_truth_raw = read_traces(test_label)                                               # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                                                # extract labels from dictionary from ground truth data

    # Only the first key under 'labels' is used
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]
    # Keep the first two fields of each ground-truth entry
    ground_truths = extract_id_from_ground_truth(ground_truth)
    all_ground_truth_id.append(ground_truths)

In [11]:
# def segment_log(log, window_size=20):
#     return [log[i:i+window_size] for i in range(0, len(log), window_size)]

def segment_log(logs, window_size=5, step=1):
    """Cut every trace into sliding windows.

    Args:
        logs: iterable of traces (sliceable sequences).
        window_size: number of events per window.
        step: offset between the starts of consecutive windows.

    Returns:
        One flat list with the windows of all traces, in trace order.
        A trace shorter than ``window_size`` contributes no window.
    """
    return [
        trace[start:start + window_size]
        for trace in logs
        for start in range(0, len(trace) - window_size + 1, step)
    ]


In [12]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import matplotlib.pyplot as plt


# Training phase


# Tag every training trace and keep the ones tagged 'good'.
# NOTE(review): label_event receives a whole trace (list of ints) and a list of
# lists here, so its membership test compares an int against lists and every
# trace comes out 'good' -- confirm this is the intended training behaviour.
labeled_data = [(trace, label_event(trace, all_ground_truth_id)) for trace in train_data]
good_logs = [trace for trace, tag in labeled_data if tag == 'good']
# bad_logs = [e for e, label in labeled_data if label == 'bad']   # for training data everything will be good


In [13]:
# No 'bad' traces exist in the training phase, so bad_logs stays empty
bad_logs = []
# Sliding windows over the traces (segment_log defaults are used)
good_segments = segment_log(good_logs)
bad_segments = segment_log(bad_logs)

def mine_patterns_from_traces(log_segments, min_support=0.1):
    """Mine frequent itemsets from log segments with apriori.

    Every non-empty segment becomes one transaction whose items are the
    string form of its events. The transactions are one-hot encoded with
    ``TransactionEncoder`` and handed to ``apriori``.

    Args:
        log_segments: iterable of segments (sequences of events).
        min_support: minimum support forwarded to ``apriori``.

    Returns:
        Whatever ``apriori`` returns for the encoded transactions
        (called with ``use_colnames=True``).
    """
    transactions = []
    for segment in log_segments:
        if len(segment) > 0:
            transactions.append([str(event) for event in segment])

    encoder = TransactionEncoder()
    encoder.fit(transactions)
    onehot = encoder.transform(transactions)
    onehot_df = pd.DataFrame(onehot, columns=encoder.columns_)
    return apriori(onehot_df, min_support=min_support, use_colnames=True)

# Frequent patterns of the normal-behaviour windows
fp_good = mine_patterns_from_traces(good_segments)
# bad_segments is empty in the training phase, so this mines an empty transaction list
fp_bad = mine_patterns_from_traces(bad_segments)


In [14]:
# Display the frequent itemsets mined from the normal-behaviour windows
fp_good

Unnamed: 0,support,itemsets
0,0.199566,(10)
1,0.199566,(11)
2,0.200108,(12)
3,0.198482,(13)
4,0.197939,(14)
...,...,...
56,0.680043,"(7, 8, 6)"
57,0.599783,"(7, 9, 6)"
58,0.599783,"(9, 8, 6)"
59,0.679501,"(7, 9, 8)"


In [28]:
def score_test_segments(segments, normal_patterns):
    """Score each segment by how many known patterns it fully contains.

    Args:
        segments: iterable of segments (sequences of events).
        normal_patterns: iterable of itemsets whose items are strings.

    Returns:
        List with one integer per segment: the number of patterns that are a
        subset of the segment's events (events compared in string form).
    """
    known_patterns = [frozenset(pattern) for pattern in normal_patterns]

    def _count_matches(segment):
        # events are stringified to match the string items of the patterns
        present = {str(event) for event in segment}
        return sum(1 for pattern in known_patterns if pattern.issubset(present))

    return [_count_matches(segment) for segment in segments]


In [29]:
def predict_anomalies(scores, threshold):
    """Flag every score that lies strictly below ``threshold``.

    Args:
        scores: iterable of comparable scores.
        threshold: cut-off value.

    Returns:
        List of 1 (score < threshold) or 0, one entry per score.
    """
    flags = []
    for value in scores:
        if value < threshold:
            flags.append(1)
        else:
            flags.append(0)
    return flags

In [45]:
def test_single_dustminer(file_path, fp_good, threshold):
    anomalies = []
    file_name = os.path.basename(file_path)
    if file_path.find('.npy') != -1:
        test_data = np.load(file_path)
    else:
        test_data = read_traces(file_path)

    print("test 1 ",test_data)
    X_test = []
    sequence_length = 5
    for i in range(0, len(test_data) - sequence_length, sequence_length):
        id_value = [int(trace[0]) for trace in test_data[i:i + sequence_length]]
        X_test.append(id_value)

    print("X_test",X_test)
    #test_segments = segment_log(test_data)
    normal_patterns = fp_good['itemsets']

    print("Normal patterns", normal_patterns)

    test_scores = score_test_segments(X_test, normal_patterns)
    print("test scores", test_scores)
    threshold = np.percentile(test_scores, 90)   

    errors = predict_anomalies(test_scores, threshold)
    print("pred", errors)

    for i in range(len(errors)):
        if errors[i] > 0:
            anomaly_seq_end_ind = (i * sequence_length) + sequence_length
            anomaly_seq_start_index = 0 if  i == 0  else anomaly_seq_end_ind - sequence_length + 1

            if anomaly_seq_end_ind < len(test_data):
                anomalies.append([
                    (test_data[anomaly_seq_start_index][0], test_data[anomaly_seq_end_ind][0]),
                    (test_data[anomaly_seq_start_index][1], test_data[anomaly_seq_end_ind][1]),
                    os.path.basename(file_path)
                ])

    return anomalies

In [46]:
from libraries.anomaly_detection import merge_detections, get_correct_detections

# Run the detector on every (trace, label) pair and collect the outcomes
DIFF_VAL = 0                # passed to merge_detections as diff_val
all_detections = []         # To store detections for each file
y_pred_all = []             # To store the predicted labels
y_true_all = []             # To store the ground truth labels
all_tp = []                 # To store all true positives
all_fp = []                 # To store all false positives
all_fn = []                 # To store all false negatives
all_gt = []                 # To store the ground truth
threshold = 0               # handed to test_single_dustminer


for test_data, test_label in zip(test_data_path, test_label_path):
    print("test data", test_data)
    detection = test_single_dustminer(test_data, fp_good, threshold)

    print("Detection : ", detection)
    print("len(detection) : ", len(detection))

    all_detections.append((test_data, detection, test_label))

    merge_detection, agg_ts = merge_detections(detection, diff_val=DIFF_VAL)

    print("Merge detection : ", merge_detection)

    ground_truth_raw = read_traces(test_label)                                               # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                                                # extract labels from dictionary from ground truth data

    # Only the first key under 'labels' is used
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]

    correct_pred, rest_pred, y_pred, y_true, false_neg = get_correct_detections(merge_detection, ground_truth)  # Comparing detected anomaly with ground truth

    y_pred_all.extend(y_pred)          # predicted labels
    y_true_all.extend(y_true)          # actual ground truth labels
    all_tp.append((test_data, correct_pred, test_label))
    all_fp.append((test_data, rest_pred, test_label))
    all_fn.append((test_data, false_neg, test_label))
    all_gt.append((test_data, ground_truth, test_label))

test data ./trace_data/theft_protection/single_thread/version_3/faulty_data\trace_trial1.json
test 1  [[0, 1307], [1, 1335], [2, 1356], [3, 1361], [4, 1869], [5, 1874], [6, 1877], [7, 1903], [8, 2919], [9, 2924], [6, 2928], [7, 2953], [8, 3970], [9, 3975], [6, 3979], [7, 4004], [8, 5021], [9, 5025], [6, 5030], [7, 5055], [8, 6071], [9, 6076], [6, 6080], [7, 6105], [8, 7122], [9, 7127], [10, 7130], [11, 7135], [12, 7140], [6, 7145], [7, 7170], [8, 8187], [9, 8191], [13, 8195], [14, 8200], [15, 8204], [6, 8209], [7, 8234], [8, 9250], [9, 9255], [6, 9259], [7, 9284], [8, 10301], [9, 10305], [6, 10310], [7, 10335], [8, 11352], [9, 11356], [6, 11361], [7, 11386], [8, 12403], [9, 12408], [10, 12411], [11, 12416], [12, 12421], [6, 12425], [7, 12450], [8, 13467], [9, 13472], [13, 13476], [14, 13480], [15, 13484], [6, 13488], [7, 13514], [8, 14530], [9, 14534], [6, 14539], [7, 14564], [8, 15580], [9, 15585], [6, 15589], [7, 15614], [8, 16631], [9, 16636], [6, 16640], [7, 16665], [8, 17682], [9,

In [47]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Convert the label lists accumulated over all test traces to arrays
y_pred_all = np.array(y_pred_all)
y_true_all = np.array(y_true_all)

# Calculate evaluation metrics
precision = precision_score(y_true_all, y_pred_all)
recall = recall_score(y_true_all, y_pred_all)
f1 = f1_score(y_true_all, y_pred_all)

# Report the three metrics with four decimals
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.0996
Recall: 1.0000
F1 Score: 0.1812
