In [None]:
# Dustminer Implementation
# Loading the normal data as Good pile and Faulty data as Bad pile

In [None]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [None]:
# Experiment configuration: which application / collection the notebook works on.
CODE = 'theft_protection'               # application (code): theft_protection, mamba2, lora_ducy
THREAD = 'single'                       # single, multi
VER = 4                                 # format of data collection
BEHAVIOUR_NORMAL = 'normal/'            # sub-directory holding the normal traces
BEHAVIOUR_FAULTY = 'faulty_data/diag_subseq/subseq/'        # sub-directory holding the faulty traces

base_dir = '../trace_data'              # can be replaced with 'csv', 'exe_plot', 'histogram'
_version_root = f'{base_dir}/{CODE}/{THREAD}_thread/version_{VER}'
normalbase_path = f'{_version_root}/{BEHAVIOUR_NORMAL}'
faultybase_path = f'{_version_root}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

# Mining / evaluation parameters used by the cells further down.
MAX_PATTERN_LEN = 15    # passed as max_len to generate_cartesian_event
DELTA = 0.05            # passed as delta (support threshold) to generate_cartesian_event
TOP_K = 10              # number of top discriminative patterns kept per faulty file (10, 1000)
MIN_COVERAGE = 0.4      # passed as MIN_COVERAGE to check_correct_detections

In [None]:
# Collect the file paths for the "good" (normal, training) pile and the "bad" (faulty) pile.
train_base_path = os.path.join(normalbase_path, 'train_data') # alternative folder: 'diag_refsamples500'
print("Train base path:", train_base_path)

# The paths above are relative, so the working directory matters.
print("Current working directory:", os.getcwd())
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
# get_paths is a project helper; here it yields the log, trace, varlist and label paths of the faulty data.
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

# Drop macOS '.DS_Store' entries from every path list.
train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

print("Number of training data files:", len(train_data_path))
print("Number of training varlist files:", len(train_varlist_path))
print("Number of faulty log files:", len(paths_log))
print("Number of faulty trace files:", len(paths_traces))
print("Number of faulty varlist files:", len(varlist_path))
print("Number of faulty label files:", len(paths_label))

# Sort the faulty-data lists in place. NOTE(review): train_data_path keeps os.listdir order (unsorted).
paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

# The faulty traces and their labels serve as the test set.
test_data_path = paths_traces
test_label_path = paths_label

In [None]:
# Inspect the (sorted) faulty trace file paths.
paths_traces

In [None]:
# Inspect the training file paths (the full slice displays a copy of the list).
train_data_path[::]

In [None]:
# Inspect the label file paths of the faulty data.
test_label_path

In [None]:
def load_data(file_paths, reader=None):
    """
    Load one event-ID sequence per trace file.

    parameters :
        file_paths - iterable of paths handed to the reader one by one
        reader - optional callable(path) returning the parsed trace; defaults to the
                 project helper read_traces (mainly useful for testing)
    Returns:
        data - list with exactly one entry per path, in the same order as file_paths.
            * a parsed list of at most 2 items: its first item is used as the sequence as-is
              (presumably a pre-extracted ID list - TODO confirm against read_traces)
            * a longer list: the int of the first field of every list entry that has at
              least 2 fields
    Raises:
        ValueError - if a file does not parse to a non-empty list. Previously such a file
            either crashed with an unrelated error or silently re-used the sequence of the
            previous file, which misaligned sequences and file names downstream.
    """
    if reader is None:
        reader = read_traces

    data = []
    for file in file_paths:
        traces = reader(file)
        if not isinstance(traces, list) or not traces:
            raise ValueError(f"Unsupported or empty trace content in {file!r}")

        if len(traces) <= 2:
            id_sequence = traces[0]
            print("id_sequence:", id_sequence)
        else:
            id_sequence = [int(trace[0]) for trace in traces if isinstance(trace, list) and len(trace) >= 2]

        data.append(id_sequence)

    return data


In [None]:
# Aliases used below: training files form the "good" pile, faulty test files the "bad" pile.
good_log_directory = train_data_path
bad_log_directory = test_data_path

In [None]:
# Inspect the good-pile file paths.
good_log_directory

In [None]:
# Inspect the bad-pile file paths.
bad_log_directory

In [None]:
#### Load data ####
# One event-ID sequence per file; list order follows the path lists.
good_sequences = load_data(good_log_directory)
bad_sequences = load_data(bad_log_directory)


In [None]:
import re
from pathlib import Path

def get_ground_truth_file(path, ground_truth_files):
    """
    Find the ground-truth file whose name (without extension) contains the stem of `path`.

    A stem that starts or ends with a digit is only accepted when the neighbouring
    character in the candidate name is not a digit as well, so that 'trace_trial1'
    no longer matches 'trace_trial10_labels'.

    parameters :
        path - str, trace name or path; only its stem is used
        ground_truth_files - iterable of str, candidate ground-truth file paths
    Returns:
        the last matching entry of ground_truth_files, or an empty list if none matches
        (callers test the result for truthiness).
    """
    stem = Path(path).stem
    # Numeric boundaries: 'trial1' must not match inside 'trial10' or '2trial1'.
    leading = r"(?<!\d)" if stem[:1].isdigit() else ""
    trailing = r"(?!\d)" if stem[-1:].isdigit() else ""
    matcher = re.compile(leading + re.escape(stem) + trailing)

    filename = []
    for file in ground_truth_files:
        if matcher.search(Path(file).stem):
            print("from function - matched ground truth file is :", file)
            filename = file
    return filename

def get_trace_info(path):
    """
    Parse a trace file name of the form '<trace_trial[_]N>_<start>-<end>'.

    parameters :
        path - str, file path or file name; only the stem is inspected
    Returns:
        (name, start, end, test_data_name) where start/end are ints and
        test_data_name is '<name>_<start>-<end>.json' rebuilt from the parsed values.
    Raises:
        ValueError - if the stem does not contain the expected pattern.
    """
    stem = Path(path).stem
    found = re.search(r"(trace_trial_?\d+)_(\d+)-(\d+)", stem)
    if found is None:
        raise ValueError("Filename format not recognized")

    name, start_text, end_text = found.groups()
    start, end = int(start_text), int(end_text)
    return name, start, end, name + '_' + str(start) + '-' + str(end) + '.json'

def find_sequence_ground_truth(test_data_path, ground_truth):
    """
    Read a test trace and pair it with the start/end indices of its ground truth.

    parameters :
        test_data_path - str, path of the trace file (its name must satisfy get_trace_info)
        ground_truth - iterable of entries whose first two items are start and end
    Returns:
        sequence - list(int), first field of every list entry of the trace with >= 2 fields
        gt_start_end_pair - list of [start, end] lists
    """
    events = read_traces(test_data_path)
    # The parsed name/indices are not used here; the call still rejects unexpected file names.
    get_trace_info(test_data_path)

    sequence = []
    for ev in events:
        if isinstance(ev, list) and len(ev) >= 2:
            sequence.append(int(ev[0]))

    gt_start_end_pair = [[entry[0], entry[1]] for entry in ground_truth]
    return sequence, gt_start_end_pair


def create_labels(sequence, gt_start_end_pair, test_data_start_index, test_data_end_index):
    """
    Cut the labelled event-ID segments out of a test sequence.

    The sequence covers the global event indices
    [test_data_start_index, test_data_end_index); for every (start, end) ground-truth
    pair the events whose global index lies in [start, end] are collected.

    parameters :
        sequence - list(int), event IDs of the test file
        gt_start_end_pair - list of [start, end] global index pairs (inclusive)
        test_data_start_index - int, global index of sequence[0]
        test_data_end_index - int, exclusive upper bound of the scanned indices
    Returns:
        list of list(int), one non-empty segment per ground-truth pair that overlaps the file.
    """
    labelled_segments = []
    for start, end in gt_start_end_pair:
        segment = []
        for event_id in range(test_data_start_index, test_data_end_index):
            if not (start <= event_id <= end):
                continue
            print("Event ID {} is in ground truth range ({}, {})".format(event_id, start, end))
            print("event_id - start_index:", event_id , test_data_start_index)
            # translate the global index into a position inside this file
            segment.append(sequence[event_id - test_data_start_index])
        if segment:
            labelled_segments.append(segment)

    return labelled_segments

In [None]:
# Inspect the test (faulty) trace file paths.
test_data_path

In [None]:
from libraries.anomaly_detection import discover_test_files, load_ground_truth_dir, build_labels
import json 
 
# Build, for every test trace, the labelled event-ID segments and store them as JSON.
gt_path   = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/labels" # example
ground_truth_path = [os.path.join(gt_path, x) for x in os.listdir(gt_path)]

print("ground truth path:", ground_truth_path)

# Maps '<name>_<start>-<end>.json' to the list of labelled segments of that file.
new_label = {}

for test_data in test_data_path:
    print("Test data file:", test_data)


    print("---------------------------------------------------")
    test_data_name_1, test_data_start_index, test_data_end_index, test_data_name= get_trace_info(test_data)
    print("Test data name is : ", test_data_name_1)
    print("Test data start index is : ", test_data_start_index)
    print("Test data end index is : ", test_data_end_index)


    ground_truth_filename = get_ground_truth_file(test_data_name_1, ground_truth_path)
    if not ground_truth_filename:
        # Files without a label file get no entry in new_label.
        print("No matching ground truth file found for test data:", test_data)
        continue

    print("Ground truth file name is : ", ground_truth_filename)

    ground_truth_raw = read_traces(ground_truth_filename)   # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']               # extract labels from dictionary from ground truth data

    # Only the first entry under 'labels' is used.
    # NOTE(review): presumably each label file describes a single trace - confirm.
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]

    print("ground truth:", ground_truth)

    # (The contradictory "is not corresponding" message that used to be printed here was removed.)
    print("The test data file ", test_data, " is corresponding to ground truth file : ", ground_truth_filename)
    sequence, gt_start_end_pair = find_sequence_ground_truth(test_data, ground_truth)
    print("Event ID sequence is : ", sequence)
    print("Ground truth start-end pairs are : ", gt_start_end_pair)
    labels = create_labels(sequence, gt_start_end_pair, test_data_start_index, test_data_end_index)
    print("Labels are : ", labels)
    new_label[test_data_name] = labels
    print("New label dictionary is : ", new_label)

output_dir = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "gt_test_data_labels.json")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(new_label, f, indent=4)

print(f"\n Saved to file: {output_path}")


In [None]:
import math
from collections import defaultdict

def get_max_length(seq, event_id):
    """
    Largest distance between two consecutive occurrences of event_id in seq.

    parameters :
        seq - list, the event sequence
        event_id - the event to look for
    Returns:
        int, the maximum index gap between neighbouring occurrences (0 if it occurs once).
    Raises:
        ValueError - if event_id does not occur in seq (from list.index).
    """
    previous = seq.index(event_id)
    widest = 0
    for position in range(previous + 1, len(seq)):
        if seq[position] != event_id:
            continue
        widest = max(widest, position - previous)
        previous = position
    return widest


def max_gap_two_lists(sequences):
    """
    Largest distance between two consecutive occurrences of the same event over all sequences.

    Each item of `sequences` is either a flat event sequence or a list of such
    sequences (detected by its first element being a list). Gaps are measured
    inside one flat sequence only.

    The gap is tracked in a single pass with a last-seen index per event, instead of
    re-scanning the whole sequence for every element.

    parameters :
        sequences - list of list(int) or list of list of list(int)
    Returns:
        int, the maximum gap found (0 if no event repeats).
    """
    max_length = 0
    for seq in sequences:
        if seq and isinstance(seq[0], list):
            inner_sequences = seq
        else:
            inner_sequences = [seq]

        for inner_seq in inner_sequences:
            last_seen = {}  # event -> index of its most recent occurrence
            for index, event in enumerate(inner_seq):
                if event in last_seen:
                    gap = index - last_seen[event]
                    if gap > max_length:
                        max_length = gap
                last_seen[event] = index
    return max_length

In [None]:
# all_sequences_1 = good_sequences

# MAX_PATTERN_LEN = max_gap_two_lists(all_sequences_1)

In [None]:
def next_same_index(seq):
    """
    Compute, for every position, where the same element occurs next.

    parameters :
        seq - list(int)
    Returns:
        list(int) of the same length as seq; entry i is the index of the next
        occurrence of seq[i], or len(seq) if seq[i] does not occur again.
    """
    total = len(seq)
    upcoming = {}              # value -> nearest index to the right seen so far
    result = [total] * total
    # Walk right-to-left so that `upcoming` always holds the closest later occurrence.
    for pos in reversed(range(total)):
        value = seq[pos]
        result[pos] = upcoming.get(value, total)
        upcoming[value] = pos
    return result


def dynamic_window_sequence(sequences):
    """
    Generate one dynamic window per position of every sequence.

    The window opened at position i runs from i up to (not including) the next
    occurrence of the same event, or to the end of the sequence if there is none.
    For example, [1, 2, 1] yields the windows [1, 2], [2, 1] and [1].

    parameters :
        sequences - list of list(int); empty sequences are skipped
    Returns:
        final_windows - list of list(int), the windows of all sequences in order.
    """
    final_windows = []
    for seq in sequences:
        if not seq:
            continue
        for begin, stop in enumerate(next_same_index(seq)):
            if stop > begin:
                final_windows.append(seq[begin:stop])
    return final_windows

def has_substring(pattern, sequence):
    """
    Check whether pattern occurs as a contiguous run inside sequence.

    parameters :
        pattern - list(int) or tuple(int)
        sequence - list(int) or tuple(int)
    Returns:
        True if pattern is empty or appears as a continuous match in sequence,
        False otherwise.
    """
    needle = tuple(pattern)
    haystack = tuple(sequence)
    width = len(needle)
    if width == 0:
        return True
    if width > len(haystack):
        return False
    last_start = len(haystack) - width
    return any(haystack[start:start + width] == needle for start in range(last_start + 1))


# def compress_patterns(patterns):
#     """
#     Compress frequent patterns by removing redundant shorter ones.
#     For each pattern P, if there exists a longer pattern Q such that:
#         - P appears as a CONTIGUOUS substring of Q, and has support(P) == support(Q), then P is dropped (only Q is kept).

#     Parameters:
#         patterns (dict[tuple[int, ...], int]): Dictionary mapping each pattern tuple to its support count.

#     Returns:
#         dict[tuple[int, ...], int]: Compressed dictionary containing only longest unique-support patterns.

#     For example:
#         Input  : {(6,7):3, (6,7,8):3, (6,8):2}
#         Output : {(6,7,8):3, (6,8):2} Because (6,7) is a substring of (6,7,8) and both have same support=3.
#     """
#     items = list(patterns.items())
#     items.sort(key=lambda x: (-len(x[0]), x[0]))
#     print(items)
#     compressed_patterns = {}
#     for pattern, support in items:
#         drop = False
#         for other in compressed_patterns.keys():
#             if patterns[other] == support and has_substring(pattern, other):
#                 drop = True
#                 break
#         if not drop:
#             compressed_patterns[pattern] = support
#     return compressed_patterns


def compress_patterns(current, next, tolerance=0.3):
    """
    Drop patterns of the current length that are made redundant by a longer pattern.

    A pattern P from `current` is dropped when `next` contains a pattern Q that is
    P plus exactly one trailing event (P == Q[:-1]) and whose support differs from
    the support of P by at most `tolerance`.

    Parameters:
        current (dict[tuple[int, ...], float]): patterns of length k with their support.
        next (dict[tuple[int, ...], float]): patterns of length k+1 with their support.
        tolerance (float): maximum absolute support difference for P to count as
            redundant. Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        dict[tuple[int, ...], float]: the entries of `current` that were kept,
        in their original order.

    For example:
        current = {(6,7): 0.9, (6,8): 0.5}, next = {(6,7,8): 0.8, (6,8,9): 0.1}
        Output  : {(6,8): 0.5}  -- (6,7) is a prefix of (6,7,8) with a similar support.
    """
    # Index the supports of the longer patterns by their prefix so that every
    # current pattern needs one lookup instead of a scan over all of `next`.
    supports_by_prefix = {}
    for longer, longer_support in next.items():
        supports_by_prefix.setdefault(longer[:-1], []).append(longer_support)

    compressed_patterns = {}
    for pattern, support in current.items():
        extensions = supports_by_prefix.get(pattern, ())
        if any(abs(support - extended) <= tolerance for extended in extensions):
            continue
        compressed_patterns[pattern] = support
    return compressed_patterns

def check_ordered_support(pattern, windows):
    """
    Fraction of windows that contain `pattern` as a contiguous run of events.

    Matching is done on comma-joined text for speed. Both the pattern and the
    window are wrapped in leading/trailing commas so that a match can only start
    and end on event boundaries; without them (4, 13) would wrongly match the
    window [14, 13] because "4,13" is a substring of "14,13".

    parameters :
        pattern - tuple(int) or list(int)
        windows - list of list(int)
    Returns:
        float, matches / len(windows) rounded to 4 decimals; 0.0 if there are no windows.
        An empty pattern matches every window.
    """
    if not windows:
        return 0.0
    if len(pattern) == 0:
        return 1.0

    needle = ',' + ','.join(map(str, pattern)) + ','
    match_count = 0
    for window in windows:
        haystack = ',' + ','.join(map(str, window)) + ','
        if needle in haystack:
            match_count += 1

    return round(match_count / len(windows), 4)

In [None]:
from collections import defaultdict

def generate_cartesian_event(sequence, delta=0.1, max_len=10):
    """
    Mine frequent event patterns of length 2..max_len from one event sequence.

    Steps:
        1. Build the dynamic windows of the sequence and group them by their first event.
        2. Compute the relative frequency of every event, normalise it by the largest
           frequency and keep the events whose normalised value is above `delta`.
        3. Form all ordered pairs of distinct kept events; keep those whose support
           (check_ordered_support over the windows starting with the pair's first
           event) is at least `delta`.
        4. Repeatedly extend the current patterns by one kept event that is not yet
           in the pattern, keep the candidates with support >= delta, and replace the
           previous level by its compressed version (compress_patterns).
           Stops when no candidate survives or `max_len` is reached.

    parameters :
        sequence - list(int), event IDs of one file (must not be empty: max() is taken
                   over its event frequencies)
        delta - float, threshold for both the normalised event frequency and the support
        max_len - int, largest pattern length to generate
    Returns:
        all_patterns - dict[int, dict[tuple, float]], pattern length -> {pattern: support}
        sorted_dynamic_windows - defaultdict(list), first event -> windows starting with it
    """
    seq = sequence
    print(f"Sequence: {seq}")
    print("Length:", len(seq))

    dynamic_windows = dynamic_window_sequence([seq]) 
    # Group the windows by their first event so support is only evaluated on relevant windows.
    sorted_dynamic_windows = defaultdict(list)
    for _win in dynamic_windows:
        key = _win[0]
        sorted_dynamic_windows[key].append(_win)

    # Relative frequency of every distinct event in the sequence.
    seq_length = len(seq)
    S1 = list(set(seq))
    infile_support = {e: seq.count(e)/seq_length for e in S1}

    # Normalise against the maximum only (the minimum is pinned to 0):
    # min_val = min(infile_support.values())
    min_val = 0
    max_val = max(infile_support.values())
    
    normalized_infile_value = {}
    if min_val == max_val:
        normalized_infile_value = {e: 1.0 for e in infile_support}
    else:
        for e, v in infile_support.items():
            normalized_infile_value[e] = round((v - min_val) / (max_val - min_val), 4)
    
    # Frequent single events (strictly above delta).
    s1_new = {k: v for k, v in normalized_infile_value.items() if v > delta}
    event_id_s1 = list(s1_new.keys())

    # All ordered pairs of two different frequent events.
    S2_candidates = set()
    for a in range(len(event_id_s1)):
        for b in range(len(event_id_s1)):
            if a != b: 
                S2_candidates.add((event_id_s1[a], event_id_s1[b]))

    all_patterns = {}
    s2_with_support = {}
    
    for pair in S2_candidates:
        # Support is measured only on the windows that start with the pair's first event.
        support = check_ordered_support(pair, sorted_dynamic_windows[pair[0]])
        if support >= delta:
            s2_with_support[pair] = support

    all_patterns[2] = s2_with_support

    current_patterns = s2_with_support
    S1_items = list(s1_new.keys())
    k = 3

    while k<=max_len:
        candidates = set()
        
        # Extend every pattern by one frequent event it does not contain yet.
        for pattern in current_patterns.keys(): 
            for item in S1_items:
                if item not in pattern:
                    new_cand = tuple(list(pattern) + [item])
                    candidates.add(new_cand)
                            
        next_patterns = {}
        for m in candidates:
            # support = check_ordered_support(m, dynamic_windows)
            support = check_ordered_support(m, sorted_dynamic_windows[m[0]])
            if support >= delta:
                next_patterns[m] = support
        
        # No pattern of length k survived: the previous level stays uncompressed.
        if not next_patterns:
            break
        compressed_pattern = compress_patterns(current_patterns, next_patterns)

        ### storing compressed pattern of previous length
        all_patterns[k-1] = compressed_pattern
        all_patterns[k] = next_patterns
        # The uncompressed level is what gets extended in the next round.
        current_patterns = next_patterns
        k += 1
    print("\n")

    return all_patterns, sorted_dynamic_windows

In [None]:

# Mine the patterns of every good (normal) sequence; entry i belongs to good_sequences[i].
good_sequence_patterns = []
for i in range(0, len(good_sequences)):
    print(f"length of {i} good file is : {len(good_sequences[i])}")
    # The grouped windows returned as second value are not needed here.
    _patterns, _ = generate_cartesian_event(good_sequences[i], delta=DELTA, max_len=MAX_PATTERN_LEN)
    good_sequence_patterns.append(_patterns)
    # break


In [None]:
# Scratch check of the comma-joined string matching used by check_ordered_support:
# 14 and 13 both occur in y but not next to each other, so the containment test prints False.
x = (14, 13)
y = [14, 15, 6, 7, 8, 9, 6, 7, 8, 9, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 13]

x_str = ','.join(map(str, x))
y_str = ','.join(map(str, y))
print(x_str, y_str)
print(x_str in y_str)

In [None]:
def merge_sequence_patterns(pattern_list_of_dicts):
    """
    Merge the per-sequence pattern dictionaries into one dictionary of averaged supports.

    parameters :
        pattern_list_of_dicts - list of dict[int, dict[tuple, float]], one
            "length -> {pattern: support}" mapping per sequence
    Returns:
        defaultdict(dict) mapping pattern length -> {pattern: mean support}, where the
        mean is taken over the sequences in which the pattern occurs (rounded to 4 decimals).
    """
    # pattern -> every support value observed for it
    collected = {}
    for per_sequence in pattern_list_of_dicts:
        for patterns_of_length in per_sequence.values():
            for pattern, support in patterns_of_length.items():
                collected.setdefault(pattern, []).append(support)

    averaged = {
        pattern: round(sum(values) / len(values), 4)
        for pattern, values in collected.items()
    }

    # Regroup by pattern length, shortest patterns first.
    grouped = defaultdict(dict)
    for pattern in sorted(averaged, key=len):
        grouped[len(pattern)][pattern] = averaged[pattern]

    return grouped


def check_subsequence(discriminative_pat, gt_seq, MIN_COVERAGE):
    """
    Unfinished draft of the coverage check between a pattern and a ground-truth sequence.

    Returns False for an empty pattern; every other input reaches the `pass`
    branch and therefore returns None. The Evaluation section of this notebook
    defines check_subsequence again, and that later definition replaces this one
    once its cell has run.
    """
    pt = len(discriminative_pat)
    gt = len(gt_seq)

    if pt == 0:
        return False
    # Earlier attempt (exact contiguous match, coverage = pattern length / GT length):
    # elif pt < gt:
    #     for i in range(gt - pt + 1):
    #         if gt_seq[i:i+pt] == list(discriminative_pat):
    #             coverage = pt / gt
    #             print("Coverage is :", coverage)
    #             return coverage >= MIN_COVERAGE         
    else:
        #### slide the shorter seq over longer seq and check how much portion of gt is matching
        # Not implemented: falls through and returns None.
        pass

In [None]:
merged_good_patterns = merge_sequence_patterns(good_sequence_patterns)
# merged_good_patterns is keyed by pattern length, so len() of it would only count the
# distinct lengths; sum the sizes of the per-length groups to get the number of patterns.
print("total Good Patterns : ", sum(len(group) for group in merged_good_patterns.values()))
print("Good Patterns : ", merged_good_patterns)

### Save Good Patterns

In [None]:
### save good patterns as using pandas dataframe
import os
from genericpath import isdir
import json
# Write the merged good patterns to <normal>/dustminer_train/good_patterns_<max_len>_<delta>.json
output_dir = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/normal/"
# NOTE(review): the result of this call is discarded, so it has no effect.
isdir(output_dir)

output_path = os.path.join(output_dir, "dustminer_train")
os.makedirs(output_path, exist_ok=True)

# output_path is re-used: from here on it is the JSON file path, not the directory.
output_path = os.path.join(output_path, f"good_patterns_{MAX_PATTERN_LEN}_{DELTA}.json")

### convert keys from tuple to string for json serialization
# _merged_good_patterns = {str(k): v for k, v in merged_good_patterns.items()}

# serialize_2: pattern length -> {str(pattern tuple): support}
serialize_2 = {}
for length, patterns in merged_good_patterns.items():
    serialize_1 = {}
    for pattern, support in patterns.items():
        serialize_1[str(pattern)] = support
    serialize_2[length] = serialize_1

# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(_merged_good_patterns, f, indent=4)
# print(f"\n Saved to file: {output_path}")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(serialize_2, f, indent=4)
print(f"\n Saved to file: {output_path}")

def load_good_patterns(file_path):
    """
    Load good patterns that were saved as JSON by the cell above.

    The file maps pattern length (stored as a string key) to
    {str(pattern tuple): support}. Keys are converted back to int lengths and
    tuple patterns.

    The pattern strings are parsed with ast.literal_eval instead of eval, so a
    tampered file cannot execute code; anything that is not a Python literal
    raises ValueError (or SyntaxError for malformed text).

    parameters :
        file_path - str, path of the JSON file
    Returns:
        defaultdict(dict) mapping int length -> {pattern tuple: support}
    """
    import ast  # local import: only needed here

    with open(file_path, "r", encoding="utf-8") as f:
        patterns_str = json.load(f)

    patterns = defaultdict(dict)
    for length, pattern_dict in patterns_str.items():
        length_key = int(length)
        for pattern_str, support in pattern_dict.items():
            patterns[length_key][ast.literal_eval(pattern_str)] = support

    return patterns

In [None]:
# _patterns = load_good_patterns(output_path)
# print("Loaded patterns:", _patterns)

### Evaluation

In [None]:
def longest_common_contiguous_length(a, b):
    """Return the length of the longest run of elements that appears contiguously in both a and b."""
    if not a or not b:
        return 0

    width = len(b)
    best = 0
    # previous_row[j] = length of the common run ending at the previous element of a and b[j-1]
    previous_row = [0] * (width + 1)
    for item_a in a:
        row = [0] * (width + 1)
        for col, item_b in enumerate(b, start=1):
            if item_a == item_b:
                row[col] = previous_row[col - 1] + 1
                best = max(best, row[col])
        previous_row = row
    return best


def continuous_coverage(gt_seq, detection_seq):
    """
    Fraction of the ground-truth sequence covered by its best contiguous overlap with a detection.

    The overlap is the longest run of events that occurs contiguously in both
    sequences; the result is that length divided by len(gt_seq).
    Returns 0.0 if either argument is not a list/tuple or if gt_seq is empty.
    """
    sequence_types = (list, tuple)
    if not (isinstance(gt_seq, sequence_types) and isinstance(detection_seq, sequence_types)):
        return 0.0

    gt_length = len(gt_seq)
    if gt_length == 0:
        return 0.0

    return longest_common_contiguous_length(list(gt_seq), list(detection_seq)) / gt_length


def check_subsequence(discriminative_pat, gt_seq, MIN_COVERAGE=0.6):
    """
    Tell whether a discriminative pattern covers enough of a ground-truth sequence.

    Returns True when the best contiguous overlap between discriminative_pat and
    gt_seq amounts to at least MIN_COVERAGE of the ground-truth length.
    """
    return continuous_coverage(gt_seq, discriminative_pat) >= MIN_COVERAGE

def check_correct_detections(detections, gt_seq, MIN_COVERAGE=0.6):
    """
    Match detections against ground-truth sequences and derive TP / FP / FN.

    For every ground-truth sequence the detection with the highest continuous
    coverage is looked up; if that coverage reaches MIN_COVERAGE the ground truth
    counts as detected (true positive), otherwise as missed (false negative).
    Detections that were not the best match of any ground truth are false positives.

    The caller's `detections` list is not modified, and the same 5-tuple is
    returned in every case, including when there are no detections (then every
    ground truth is reported as a false negative).

    parameters :
        detections - list of patterns (tuples), may be empty or None
        gt_seq - list of ground-truth sequences (list(int))
        MIN_COVERAGE - float, minimum covered fraction of a ground truth for a match
    Returns:
        correct_pred - set of detections that matched a ground truth
        rest_pred - list of unmatched detections (false positives), in input order
        false_neg - list of ground-truth sequences without a match
        y_true, y_pred - parallel 0/1 lists: one entry per ground truth followed by
                         one entry per false positive
    """
    y_true = []
    y_pred = []
    correct_pred = []
    false_neg = []
    matched_indices = set()

    # Snapshot of the input so the caller's list stays untouched.
    candidates = list(detections) if detections else []

    for gt in gt_seq:
        gt_coverage = [continuous_coverage(gt, det) for det in candidates]
        print("gt_coverage:", gt_coverage)

        matched = False
        if gt_coverage:
            # index of the first detection with the highest coverage
            max_ind = max(range(len(gt_coverage)), key=gt_coverage.__getitem__)
            if gt_coverage[max_ind] >= MIN_COVERAGE:
                matched = True
                correct_pred.append(candidates[max_ind])
                matched_indices.add(max_ind)

        y_true.append(1)
        if matched:
            ### true positive
            y_pred.append(1)
        else:
            ### false negative
            y_pred.append(0)
            false_neg.append(gt)

    print('detections:', candidates)
    print("correct_det_ind:", sorted(matched_indices, reverse=True))

    # Every detection that was not the best match of some ground truth is a false positive.
    rest_pred = [det for ind, det in enumerate(candidates) if ind not in matched_indices]
    for _ in rest_pred:
        y_true.append(0)
        y_pred.append(1)

    return set(correct_pred), rest_pred, false_neg, y_true, y_pred


# # Example usage with provided sequences
# # gt = [12, 6, 7, 8, 9, 6]
# s1 = [12, 6, 7, 8, 9, 13, 14]
# s2 = [11, 12, 6, 7, 8, 9, 13, 14]
# s3 = [10, 11, 12, 6, 7, 8, 9, 13, 14]

# gt = [6,7,10, 11, 12, 6, 7, 8, 9, 6]
# for s in (s1, s2, s3):
#     cov = continuous_coverage(gt, s)
#     ok = check_subsequence(s, gt, MIN_COVERAGE=0.5)
#     print(f"detection={s}  coverage_of_gt={cov:.4f}  passes_0.5={ok}")

In [None]:

# Reload the faulty sequences and mine the patterns of each file.
bad_sequences = load_data(bad_log_directory)

# Entry i belongs to bad_log_directory[i]; the evaluation loop below relies on this order.
bad_sequence_patterns = []
for i in range(0, len(bad_sequences)):
    print(f"length of {i} bad file is : {len(bad_sequences[i])}")
    _patterns, _ = generate_cartesian_event(bad_sequences[i], delta=DELTA, max_len=MAX_PATTERN_LEN)
    bad_sequence_patterns.append(_patterns)
    print('')
    # break

In [None]:
# Inspect the mined patterns of all faulty files.
bad_sequence_patterns

In [None]:
#### Get Discriminative Patterns
# good paterns in merged_good_patterns

# Ground-truth labels per test file, as written by the label-building cell above.
LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/gt_test_data_labels.json"


# ### get all good patterns without support
# good_key_patterns = list(merged_good_patterns.keys())
# ### sort good patterns based on length
# good_key_patterns.sort(key=lambda x: len(x), reverse=True)
# print("Total Good Patterns before comparison: ", len(good_key_patterns))

# Per-file results; every entry is a (file name, items, label file) tuple.
all_tp = []
all_fp = []
all_fn = []
all_gt = []
all_detections = []  # (file name, detected patterns, label file) per file, used to plot detections
all_features = []  ### collection of features (corresponding events for anomaly from reference samples)
# Flat 0/1 lists over all files, fed to the evaluation metrics.
y_pred_all = []
y_true_all = []
for i in range(0, len(bad_sequence_patterns)):
    # bad_sequence_patterns = {}
    # print(f"length of {i} bad file is : {len(bad_sequences[i])}")
    # _patterns, _ = generate_cartesian_event(bad_sequences[i], delta=DELTA, max_len=MAX_PATTERN_LEN)
    _patterns = bad_sequence_patterns[i]
    # File name = last '/'-separated component of the path; it is the key into the label JSON.
    _file_name = bad_log_directory[i].split('/')[-1]
    print("Processing bad file:", _file_name)

    _keys = _patterns.keys()
    # print("Keys in bad sequence patterns:", _keys)

    #### accumulate all patterns with different lengths
    discriminative_patterns_good = []
    discriminative_patterns_bad = []
    discriminative_support_good = []
    discriminative_support_bad = []
    for key in _keys:
        # print("Processing patterns of length:", key)
        bad_patterns_dict = _patterns[key]
        # merged_good_patterns is a defaultdict, so a length without good patterns yields {}.
        good_patterns_dict = merged_good_patterns[key]
        good_patterns_keys = list(good_patterns_dict.keys())

        # print('B', bad_patterns_dict, 'G', good_patterns_dict)
        for bad_pattern, bad_support in bad_patterns_dict.items():
            if bad_pattern not in good_patterns_dict:
                # Pattern only seen in the faulty file.
                discriminative_patterns_bad.append(bad_pattern)
                discriminative_support_bad.append(bad_support)
            else:
                good_support = good_patterns_dict[bad_pattern]
                # Shared pattern: discriminative only if the supports differ by more than 0.3.
                if abs(bad_support - good_support) > 0.3:
                    discriminative_patterns_bad.append(bad_pattern)
                    discriminative_support_bad.append(bad_support)
                else:
                    pass
                good_patterns_keys.remove(bad_pattern)
        # What is left in good_patterns_keys occurs in the good pile only.
        for good_pattern in good_patterns_keys:
            good_support = good_patterns_dict[good_pattern]
            # print("Discriminative Pattern found in good patterns:", good_pattern)
            discriminative_patterns_good.append(good_pattern)
            discriminative_support_good.append(good_support)
        # break

    ### sort discriminative patterns based on respective support
    discriminative_patterns_good_sorted = sorted(zip(discriminative_patterns_good, discriminative_support_good), key=lambda x: x[1], reverse=True)
    discriminative_patterns_bad_sorted = sorted(zip(discriminative_patterns_bad, discriminative_support_bad), key=lambda x: x[1], reverse=True)
    print('Disc pattern count:', len(discriminative_patterns_bad_sorted)) 
    print('Disc patterns sorted', discriminative_patterns_bad_sorted )

    with open(LABEL_PATH, "r") as f:
        label_map = json.load(f)

    gt_seq = label_map[_file_name]  # all labelled segments of this file (KeyError if the file has no entry)
    # The TOP_K discriminative bad patterns with the highest support are the detections.
    detection = [p for p, _ in discriminative_patterns_bad_sorted[:TOP_K]]

    # print("Label map loaded:", label_map)
    print('gt_seq:', gt_seq, 'pd_seq:', detection)

    correct_pred, rest_pred, false_neg, y_true, y_pred = check_correct_detections(detection, gt_seq, MIN_COVERAGE=MIN_COVERAGE)

    print('correct_pred:', correct_pred)
    print('rest_pred:', rest_pred)

    y_pred_all.extend(y_pred)
    y_true_all.extend(y_true)
    
    label_file = LABEL_PATH
    all_detections += [(_file_name, detection, label_file)]  ### used to plot detections
    # all_features += [feature]
    all_tp += [(_file_name, correct_pred, label_file)]
    all_fp += [(_file_name, rest_pred, label_file)]
    all_fn += [(_file_name, false_neg, label_file)]
    all_gt += [(_file_name, gt_seq, label_file)]
    
    print('')
    # break

In [None]:
# print("Discriminative Good Patterns:", discriminative_patterns_good_sorted)
# print("Discriminative Good Supports:", discriminative_support_good)
# The two "Discriminative ..." variables are loop-local, so they show the last processed file only.
print("Discriminative Bad Patterns:", discriminative_patterns_bad_sorted)
print("Discriminative Bad Supports:", discriminative_support_bad)

# Aggregated results over all faulty files.
print("All True Positives:", all_tp)
print("All False Positives:", all_fp)
print("FP count:", sum(len(x[1]) for x in all_fp))


In [None]:
### Evaluation metrics
# Computes precision, recall and F1 over the labels accumulated in
# y_true_all / y_pred_all, then prints and plots the confusion matrix.

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, average_precision_score, ConfusionMatrixDisplay, adjusted_rand_score, normalized_mutual_info_score


# Calculate precision
precision = precision_score(y_true_all, y_pred_all)
print(f'Precision: {precision:.4f}')

# Calculate recall
recall = recall_score(y_true_all, y_pred_all)
print(f'Recall: {recall:.4f}')

# # Calculate average precision
# average_precision = average_precision_score(y_true_all, y_pred_all)
# print(f'Average Precision: {average_precision:.4f}')

# Calculate F1 score
f1 = f1_score(y_true_all, y_pred_all)
print(f"F1 Score: {f1:.4f}")
print('')

# ### instance length mean and std
# mean_inst_len = np.mean(inst_len_all)
# std_inst_len = np.std(inst_len_all)
# print('avg_inst_len:', mean_inst_len)
# print('std_inst_len:', std_inst_len)
# print('')

# ### avg number of anomalies in each detection 
# mean_gt_in_inst = np.mean(gt_in_inst_all)
# std_gt_in_inst = np.std(gt_in_inst_all)
# print('avg_gt_in_inst:', mean_gt_in_inst)
# print('std_gt_in_inst:', std_gt_in_inst)
# print('')

# ### avg number of detections per GT
# mean_inst_in_gt = np.mean(num_anomaly_per_gt_all)
# std_inst_in_gt = np.std(num_anomaly_per_gt_all)
# print('mean_inst_in_gt:', mean_inst_in_gt)
# print('std_inst_in_gt:', std_inst_in_gt)
# print('')


# Calculate confusion matrix
# Pin the label order to [0, 1] so the matrix is always 2x2 and lines up with
# display_labels below. Without `labels`, a run in which only one class occurs
# yields a 1x1 matrix, and the old patch-up always booked that count as
# anomaly/anomaly -- wrong when every sample was normal/normal.
# NOTE(review): assumes 0 = normal and 1 = anomaly in y_true_all / y_pred_all -- TODO confirm.
conf_matrix = confusion_matrix(y_true_all, y_pred_all, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['normal', 'anomaly'])
disp.plot()

In [None]:
# Inspect the accumulated false negatives: a list of
# (file name, false_neg, label file) tuples, one appended per `_file_name`.
all_fn

## Classwise Detections

In [None]:
# Class-wise recall.
# Every ground-truth entry of every file is booked under its class id (taken
# from subseq_class.json) as either "missed" (it appears in that file's
# false-negative list) or "detected" (it does not). Recall per class is then
# detected / (detected + missed).
CLASS_LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/diag_subseq/subseq/subseq_labels/subseq_class.json"

with open(CLASS_LABEL_PATH, 'r') as f:
    class_mapping = json.load(f)

# print('class mapping:', class_mapping)
classwise_fn = defaultdict(list)
classwise_tp = defaultdict(list)
gt_len = 0
# all_fn and all_gt get one record per file in the same loop iteration, so
# zipping them pairs the false negatives and ground truth of the same file.
for file_fn, file_gt in zip(all_fn, all_gt):
    fn = file_fn[1]
    gt = file_gt[1]
    # print('file fn:', file_fn)
    # print('file gt:', file_gt)

    # class_mapping is keyed by the file name up to its first '.'
    file_name = file_gt[0].split('.')[0]
    # print('file name:', file_name)
    classes = class_mapping[file_name]

    # zip(gt, classes) stops at the shorter sequence. Too few class ids would
    # drop ground-truth entries and only surface as the generic total check at
    # the end of this cell, so fail here and name the offending file instead.
    assert len(classes) >= len(gt), (
        f'{file_name}: {len(gt)} ground-truth entries but only {len(classes)} class ids'
    )
    if len(classes) > len(gt):
        # Not fatal (the surplus ids are simply unused), but worth surfacing.
        print(f'Warning: {file_name} has {len(classes)} class ids for {len(gt)} ground-truth entries; extra ids ignored')

    # print('classes:', classes)
    # print('gt:', gt)
    # print('fn:', fn)
    for label, cl in zip(gt, classes):
        # print('label:', label)
        if label in fn:
            classwise_fn[cl].append(label)
        else:
            classwise_tp[cl].append(label)
            # print('tp:', label)

    gt_len += len(gt)
    # print('file gt:', len(gt))
    # print('file fn:', len(fn))
    # print('\n')
    # break

total_fn = 0
total_tp = 0
keys = set(list(classwise_fn.keys()) + list(classwise_tp.keys()))
# print('keys:', keys)

# Iterate the classes in a fixed order: set iteration order is arbitrary, which
# made the unlabeled class_recall list impossible to map back to classes.
try:
    ordered_keys = sorted(keys)
except TypeError:
    # class ids of mixed types cannot be compared; order by their string form
    ordered_keys = sorted(keys, key=str)

class_recall = []
for key in ordered_keys:
    print('class:', key)
    total_fn += len(classwise_fn[key])
    total_tp += len(classwise_tp[key])

    # A key only exists once a label was appended for it, so the denominator is >= 1.
    crecall = len(classwise_tp[key])/(len(classwise_fn[key])+len(classwise_tp[key]))
    crecall = round(crecall, 4)

    # print('not detected:', len(classwise_fn[key]))
    print('detected:', len(classwise_tp[key]))
    print('total anomalies:', len(classwise_fn[key])+len(classwise_tp[key]))
    print('Recall (classwise):', crecall)
    print('\n')

    class_recall.append(crecall)


# print('total fn+tp:', total_fn+total_tp)
# print('total gt:', gt_len)
assert total_fn+total_tp == gt_len, 'total fn+tp not equal to total gt'
print('Class order:', ordered_keys)
print('All class recalls:', class_recall)

In [None]:
# Inspect the class mapping loaded from CLASS_LABEL_PATH; it is indexed by
# file name (extension stripped) and zipped against that file's ground truth.
class_mapping

In [None]:
# discriminative_score = {}

# #Here we take all the distinct event patterns from both bad and good logs and store in all_patterns.

# if isinstance(merged_good_patterns, list):
#     good_seqs = merged_good_patterns[0] 
# else:
#     good_seqs = merged_good_patterns

# all_patterns = set(good_seqs.keys())

# for d in bad_sequence_patterns:
#     all_patterns.update(d.keys())

# print("all_patterns :", len(all_patterns))

# for pattern in all_patterns:
#     support_good = good_seqs.get(pattern, 0.0)
#     bad_seq_count = [d[pattern] for d in bad_sequence_patterns if pattern in d]
    
#     if bad_seq_count:
#         support_bad = sum(bad_seq_count) / len(bad_seq_count)
#     else:
#         support_bad = 0.0

#     score = support_good - support_bad
#     discriminative_score[pattern] = score

# sorted_patterns = sorted(discriminative_score.items(), key=lambda x: abs(x[1]), reverse=True)
# threshold = 0.30

# discriminative_patterns = {}
# for pat, score in sorted_patterns:
#     if abs(score) >= threshold:
#         discriminative_patterns[pat] = {"support": score}

In [None]:

# def normalize_patterns(discrinimative_obj):
#     if discrinimative_obj is None:
#         return []
#     if isinstance(discrinimative_obj, dict):
#         seq = discrinimative_obj.keys()
#     else:
#         seq = discrinimative_obj

#     out = []
#     for p in seq:
#         if isinstance(p, (list, tuple, set)):
#             out.append(tuple(p))
#     return out

# def normalize_gt_sequences(gt_seq_list):
#     if not gt_seq_list:
#         return []
#     if isinstance(gt_seq_list[0], int):
#         return [gt_seq_list]
#     else:
#         return [seq for seq in gt_seq_list if isinstance(seq, list) and len(seq) > 0]

In [None]:
# import json
# import os
# import psutil
# import time
# from collections import defaultdict

# process = psutil.Process(os.getpid())
# start_mem = process.memory_info().rss / (1024 * 1024)  # in MB
# start_time = time.perf_counter() 

# LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/gt_test_data_labels.json"

# MIN_COVERAGE = 0.60 

# discriminative_patterns_seq = normalize_patterns(discriminative_patterns)

# with open(LABEL_PATH, "r") as f:
#     label_map = json.load(f)

# tp = fp = fn = tn = 0
# matched_patterns_global = set()

# all_tp = []         
# all_fp = []        
# all_fn = []        
# all_gt = []        
# y_true_all = []     
# y_pred_all = []     

# label_file_path = LABEL_PATH 

# for test_file_name, raw_gt in label_map.items():
#     gt_seqs_list = normalize_gt_sequences(raw_gt)

#     all_gt.append((test_file_name, gt_seqs_list, label_file_path))

#     correct_pred_file = [] # To store that GT_seq that matched discriminative patterns  
#     rest_pred_file = [] # stores which patterns did not match gt sequence     
#     false_neg_file = [] # for fn sequence

#     total_patterns = len(discriminative_patterns_seq)  # no of patterns mined by dustminer

#     for gt_seq in gt_seqs_list:
#         y_true_all.append(1)   # appending 1 as gt label for every sequence is 1 fault present

#         matched_for_gt = []
#         for discriminative_patterns_1 in discriminative_patterns_seq:
#             if check_subsequence(discriminative_patterns_1, gt_seq, MIN_COVERAGE): # check if there is a complete match or atleast 60% match 
#                 matched_for_gt.append(discriminative_patterns_1)
#                 matched_patterns_global.add(discriminative_patterns_1) # set of all discriminative_patterns that match at least one gt sequence

#         matched_count = len(matched_for_gt) # no of patterns that matched this gt sequence

#         if matched_count == 0: # when no patterns match for this particular gt then its FN
#             fn += 1
#             fp += total_patterns   
#             false_neg_file.append(gt_seq)
#             rest_pred_file.append((gt_seq, list(discriminative_patterns_seq))) # appending this gt_seq and all patterns as its FN
#             y_pred_all.append(0)  # no detection so appending 0

#         else: # when atleast one pattern match for this particular gt then its TP
#             tp += 1
#             fp += (total_patterns - matched_count)

#             fp_for_gt = [
#                 p for p in discriminative_patterns_seq
#                 if p not in matched_for_gt
#             ]

#             correct_pred_file.append((gt_seq, matched_for_gt))
#             rest_pred_file.append((gt_seq, fp_for_gt)) # appending gt and remaining unmatched patterns
#             y_pred_all.append(1)  # fault detected so appending 1

#     all_tp.append((test_file_name, correct_pred_file, label_file_path))
#     all_fp.append((test_file_name, rest_pred_file, label_file_path))
#     all_fn.append((test_file_name, false_neg_file, label_file_path))

# if tp + fp > 0:
#     precision = tp / (tp + fp)
# else:
#     precision = 0.0

# if tp + fn > 0:
#     recall = tp / (tp + fn)
# else:
#     recall = 0.0

# if (precision + recall) > 0:
#     f1 = (2 * precision * recall) / (precision + recall)
# else:
#     f1 = 0.0

# print(f"TP={tp} FP={fp} FN={fn} TN={tn}")
# print(f"precision={precision:.3f}  recall={recall:.3f}  f1={f1:.3f}")

# matched_patterns = sorted(matched_patterns_global, key=lambda x: (len(x), x))
# print(f"\nMatched {len(matched_patterns)} discriminative patterns with GT sequences:")

# avg_value_length_tp_discriminative = 0
# for p in matched_patterns:
#     print(p)
#     avg_value_length_tp_discriminative += len(p)

# avg_value_length_tp_discriminative = (
#     avg_value_length_tp_discriminative / len(matched_patterns)
#     if matched_patterns else 0
# )

# end_mem = process.memory_info().rss / (1024 * 1024)
# print(f"Memory used: {end_mem - start_mem:.2f} MB")

# end_time = time.perf_counter()
# elapsed_ms = (end_time - start_time) * 1000
# print(f"\nTime taken: {elapsed_ms:.2f} ms")


In [None]:
# classwise_fn = defaultdict(list)
# classwise_tp = defaultdict(list)
# gt_len = 0

# CLASS_LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/diag_subseq/subseq/subseq_labels/subseq_class.json"

# with open(CLASS_LABEL_PATH, 'r') as f:
#     class_mapping = json.load(f)

# for file_fn, file_gt in zip(all_fn, all_gt):
#     test_filename = file_gt[0].replace('.json', '')
    
#     fn = file_fn[1] 
#     gt = file_gt[1] 
    
#     if test_filename in class_mapping:
#         class_ids = class_mapping[test_filename]
        
#         if len(gt) != len(class_ids):
#             print(f"Mismatch in {test_filename}. GT len: {len(gt)}, Class ID len: {len(class_ids)}")
#             continue

#         for i, label in enumerate(gt):
#             current_class_id = class_ids[i] 
#             if label in fn:
#                 classwise_fn[current_class_id].append(label)
#             else:
#                 classwise_tp[current_class_id].append(label)
#     else:
#         print(f"{test_filename} not found in class mapping JSON.")

#     gt_len += len(gt)
#     # print('file gt:', len(gt))
#     # print('file fn:', len(fn))
#     # print('\n')
#     # break

# total_fn = 0
# total_tp = 0
# keys = set(list(classwise_fn.keys()) + list(classwise_tp.keys()))
# # print('keys:', keys)
# class_recall = []
# for key in keys:
#     print('class:', key)
#     total_fn += len(classwise_fn[key])
#     total_tp += len(classwise_tp[key])

#     crecall = len(classwise_tp[key])/(len(classwise_fn[key])+len(classwise_tp[key]))

#     # print('not detected:', len(classwise_fn[key]))
#     print('detected:', len(classwise_tp[key]))
#     print('total anomalies:', len(classwise_fn[key])+len(classwise_tp[key]))
#     print('Recall (classwise):', crecall)
#     print('\n')

#     class_recall.append(crecall)


# # print('total fn+tp:', total_fn+total_tp)
# # print('total gt:', gt_len)
# # assert total_fn+total_tp == gt_len, 'total fn+tp not equal to total gt'
# print('All class recalls:', class_recall)