In [None]:
# Dustminer Implementation
# Loading the normal data as Good pile and Faulty data as Bad pile

In [None]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [None]:
# Configuration: selects which application, thread mode and data-format version
# the notebook reads its traces from.
CODE = 'theft_protection'               ### application under analysis: theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data/diag_subseq/subseq/'        ### sub-directory used for the faulty ("bad") traces
BEHAVIOUR_NORMAL = 'normal/'             ### sub-directory used for the normal ("good") traces
THREAD = 'single'                       ### single, multi
VER = 4                                 ### format of data collection

base_dir = './trace_data'              ### can be replaced with 'csv', 'exe_plot', 'histogram'
# Both paths follow the layout <base_dir>/<CODE>/<THREAD>_thread/version_<VER>/<behaviour>.
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

In [None]:
# Normal ("good") traces live under <normalbase_path>/train_data.
train_base_path = os.path.join(normalbase_path, 'train_data') #'diag_refsamples500')
print("Train base path:", train_base_path)

print("Current working directory:", os.getcwd())
# os.listdir() returns entries in arbitrary order. Sort them so the index of
# every good sequence (e.g. good_sequences[0]) is stable across runs and machines.
train_data_path = sorted(os.path.join(train_base_path, x) for x in os.listdir(train_base_path))
train_varlist_path = sorted(
    os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x
)

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

# Drop macOS Finder metadata files that may sit next to the traces.
train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

print("Number of training data files:", len(train_data_path))
print("Number of training varlist files:", len(train_varlist_path))
print("Number of faulty log files:", len(paths_log))
print("Number of faulty trace files:", len(paths_traces))
print("Number of faulty varlist files:", len(varlist_path))
print("Number of faulty label files:", len(paths_label))

paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

# The faulty traces and their labels form the test ("bad") pile.
test_data_path = paths_traces
test_label_path = paths_label

In [None]:
paths_traces

In [None]:
train_data_path

In [None]:
test_label_path

In [None]:
def load_data(file_paths):
    """
    Load one event-ID sequence per trace file.

    Parameters:
        file_paths - iterable of paths accepted by read_traces.

    Returns:
        list with one entry per input file, in the same order:
          - if the file holds a list with more than two entries, the entry is
            [int(trace[0]) for every list-typed trace with at least 2 elements];
          - if the file holds a list with one or two entries, the entry is its
            first element (presumably an already-extracted ID sequence - TODO confirm);
          - if the file holds an empty list, the entry is [].

    Raises:
        ValueError - if read_traces returns something that is not a list.
            Previously this case either raised NameError (first file) or
            silently re-appended the previous file's sequence.
    """
    data = []
    for file in file_paths:
        traces = read_traces(file)
        if not isinstance(traces, list):
            raise ValueError(
                f"Unsupported trace format in {file}: expected a list, got {type(traces).__name__}"
            )

        if not traces:
            # An empty trace used to crash on traces[0]; keep the slot so that
            # data[i] still corresponds to file_paths[i].
            print("Warning: empty trace file:", file)
            id_sequence = []
        elif len(traces) <= 2:
            id_sequence = traces[0]
            print("id_sequence:", id_sequence)
        else:
            id_sequence = [int(trace[0]) for trace in traces if isinstance(trace, list) and len(trace) >= 2]

        data.append(id_sequence)

    return data


In [None]:
# Aliases: despite the "directory" names, these are the lists of trace file
# paths for the good (normal/train) and bad (faulty/test) piles.
good_log_directory = train_data_path
bad_log_directory = test_data_path

In [None]:
good_log_directory

In [None]:
bad_log_directory

In [None]:
good_sequences = load_data(good_log_directory)


In [None]:
len(good_sequences)

In [None]:
# Report the number of events in every good trace, then the total over all traces.
count = 0
for i, good_sequence in enumerate(good_sequences):
    sequence_length = len(good_sequence)
    print(f"Good sequence {i} length: {sequence_length}")
    count += sequence_length
print("Total good sequences length:", count)

In [None]:
bad_sequences = load_data(bad_log_directory)

In [None]:
len(bad_sequences)

In [None]:
import re
from pathlib import Path

def get_ground_truth_file(path, ground_truth_files):
    """
    Find the ground-truth file whose stem contains the stem of `path`.

    Matching is a plain substring test on the file stems. When several files
    match, the last one in `ground_truth_files` wins.

    Returns:
        the matching file path, or an empty list when nothing matches.
    """
    wanted_stem = Path(path).stem
    matched = []
    for candidate in ground_truth_files:
        if wanted_stem not in Path(candidate).stem:
            continue
        print("from function - matched ground truth file is :", candidate)
        matched = candidate
    return matched

def get_trace_info(path):
    """
    Parse a test-trace file name of the form `trace_trial<N>_<start>-<end>`.

    Returns:
        (name, start, end, test_data_name) where name is the `trace_trial<N>`
        part, start/end are ints, and test_data_name is
        "<name>_<start>-<end>.json" rebuilt from the parsed integers.

    Raises:
        ValueError - if the file stem does not contain the expected pattern.
    """
    stem = Path(path).stem
    found = re.search(r"(trace_trial_?\d+)_(\d+)-(\d+)", stem)
    if found is None:
        raise ValueError("Filename format not recognized")

    name = found.group(1)
    start, end = int(found.group(2)), int(found.group(3))
    test_data_name = f"{name}_{start}-{end}.json"
    return name, start, end, test_data_name

def find_sequence_ground_truth(test_data_path, ground_truth):
    """
    Read a test trace and pair it with its ground-truth intervals.

    Returns:
        (sequence, gt_start_end_pair) where sequence holds int(ev[0]) for every
        list-typed event with at least two elements, and gt_start_end_pair
        holds the first two items of every ground-truth entry.

    Raises:
        ValueError - via get_trace_info, if the file name has an unexpected format.
    """
    events = read_traces(test_data_path)
    # The parsed values are not used here; the call only validates the file name.
    get_trace_info(test_data_path)

    sequence = []
    for ev in events:
        if isinstance(ev, list) and len(ev) >= 2:
            sequence.append(int(ev[0]))

    gt_start_end_pair = []
    for interval in ground_truth:
        gt_start_end_pair.append([interval[0], interval[1]])
    return sequence, gt_start_end_pair


def create_labels(sequence, gt_start_end_pair, test_data_start_index, test_data_end_index):
    """
    Extract, for every ground-truth interval, the events of `sequence` that fall inside it.

    `sequence[k]` is treated as the event with global index
    `test_data_start_index + k`; global indices in
    [test_data_start_index, test_data_end_index) are scanned.

    Returns:
        list of lists, one per ground-truth (start, end) pair (both bounds
        inclusive) that overlaps the scanned range; pairs with no overlap
        contribute nothing.
    """
    offset = test_data_start_index
    labelled_segments = []
    for start, end in gt_start_end_pair:
        segment = []
        for event_id in range(offset, test_data_end_index):
            if not (start <= event_id <= end):
                continue
            print("Event ID {} is in ground truth range ({}, {})".format(event_id, start, end))
            print("event_id - start_index:", event_id , offset)
            segment.append(sequence[event_id - offset])
        if segment:
            labelled_segments.append(segment)

    return labelled_segments

In [None]:
test_data_path

In [None]:
from libraries.anomaly_detection import discover_test_files, load_ground_truth_dir, build_labels
import json 
 
gt_path   = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/labels" # example
# os.listdir() order is arbitrary and get_ground_truth_file keeps the LAST match,
# so sort the candidates to make the chosen label file reproducible.
ground_truth_path = sorted(os.path.join(gt_path, x) for x in os.listdir(gt_path))

print("ground truth path:", ground_truth_path)

# Maps "<trace name>_<start>-<end>.json" -> list of labelled event-ID segments.
new_label = {}

for test_data in test_data_path:
    print("Test data file:", test_data)


    print("---------------------------------------------------")
    test_data_name_1, test_data_start_index, test_data_end_index, test_data_name= get_trace_info(test_data)
    print("Test data name is : ", test_data_name_1)
    print("Test data start index is : ", test_data_start_index)
    print("Test data end index is : ", test_data_end_index)


    ground_truth_filename = get_ground_truth_file(test_data_name_1, ground_truth_path)
    if not ground_truth_filename:
        print("No matching ground truth file found for test data:", test_data)
        continue

    print("Ground truth file name is : ", ground_truth_filename)

    ground_truth_raw = read_traces(ground_truth_filename)   # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                # extract labels from dictionary from ground truth data

    # Only the first trace entry of the label file is used.
    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]

    print("ground truth:", ground_truth)

    # The contradictory "is not corresponding" message that used to be printed
    # unconditionally here has been removed: this point is reached only on a match.
    print("The test data file ", test_data, " is corresponding to ground truth file : ", ground_truth_filename)
    sequence, gt_start_end_pair = find_sequence_ground_truth(test_data, ground_truth)
    print("Event ID sequence is : ", sequence)
    print("Ground truth start-end pairs are : ", gt_start_end_pair)
    labels = create_labels(sequence, gt_start_end_pair, test_data_start_index, test_data_end_index)
    print("Labels are : ", labels)
    new_label[test_data_name] = labels
    print("New label dictionary is : ", new_label)

output_dir = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "gt_test_data_labels.json")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(new_label, f, indent=4)

print(f"\n Saved to file: {output_path}")


In [None]:
good_sequences

In [None]:
len(good_sequences)

In [None]:
import math
from collections import defaultdict

def get_max_length(seq, event_id):
    """
    Return the largest distance between two consecutive occurrences of `event_id` in `seq`.

    Returns 0 when the event occurs only once.

    Raises:
        ValueError - if `event_id` does not occur in `seq` (from list.index).
    """
    first = seq.index(event_id)
    positions = [first]
    positions.extend(idx for idx in range(first + 1, len(seq)) if seq[idx] == event_id)
    return max(
        (later - earlier for earlier, later in zip(positions, positions[1:])),
        default=0,
    )


def max_gap_two_lists(sequences):
    """
    Return the largest distance between two consecutive occurrences of the
    same event, taken over all events and all sequences.

    Each element of `sequences` may be a flat sequence of event IDs or a list
    of such sequences (detected by its first element being a list).

    The gap is found in a single pass per sequence using a last-seen-position
    map, instead of rescanning the whole sequence for every element.

    Parameters:
        sequences - list of list(int), or list of list of list(int)

    Returns:
        int - the maximum gap, or 0 if no event repeats (or input is empty).
    """
    max_length = 0
    for seq in sequences:
        if seq and isinstance(seq[0], list):
            inner_sequences = seq
        else:
            inner_sequences = [seq]
        for inner_seq in inner_sequences:
            last_seen = {}  # event ID -> index of its most recent occurrence
            for position, event in enumerate(inner_seq):
                if event in last_seen:
                    gap = position - last_seen[event]
                    if gap > max_length:
                        max_length = gap
                last_seen[event] = position
    return max_length

In [None]:
# The upper bound on mined pattern length is derived from the good traces only:
# it is the largest gap between two consecutive occurrences of any event.
all_sequences_1 = good_sequences

# Read as a module-level global by generate_cartesian_event.
MAX_PATTERN_LEN = max_gap_two_lists(all_sequences_1)

In [None]:
MAX_PATTERN_LEN

In [None]:
def next_same_index(seq):
    """
    Compute, for every position, the index of the next occurrence of the same element.

    Parameters:
        seq - list(int)
    Returns:
        nxt - list(int) where nxt[i] is the index of the next occurrence of
              seq[i] in seq, or len(seq) if there is none.
    """
    n = len(seq)
    upcoming = {}  # value -> nearest index to the right seen so far
    nxt = []
    # Walk right-to-left so `upcoming` always holds the closest later position.
    for idx in reversed(range(n)):
        value = seq[idx]
        nxt.append(upcoming.get(value, n))
        upcoming[value] = idx
    nxt.reverse()
    return nxt


def dynamic_window_sequence(sequences):
    """
    Generate the dynamic windows of every sequence.

    For each position i, one window is produced: it starts at i and runs up to
    (not including) the next occurrence of seq[i], or to the end of the
    sequence when seq[i] does not occur again.
    For example, [1,2,3,1,4,2] yields:
        [1,2,3], [2,3,1,4], [3,1,4,2], [1,4,2], [4,2], [2]

    Parameters:
        sequences - list of list(int)
    Returns:
        final_windows - list of list(int) with the windows of all sequences,
                        in sequence order and then position order.
    """
    final_windows = []
    for seq in sequences:
        if not seq:
            continue
        window_ends = next_same_index(seq)
        final_windows.extend(
            seq[begin:stop] for begin, stop in enumerate(window_ends) if stop > begin
        )
    return final_windows

def has_substring(pattern, sequence):
    """
    Check whether `pattern` occurs as a contiguous run inside `sequence`.

    Parameters:
        pattern - list(int)
        sequence - list(int)
    Returns:
        True if the pattern appears as a contiguous match in sequence
        (an empty pattern always matches), False otherwise.
    """
    needle = tuple(pattern)
    haystack = tuple(sequence)
    width = len(needle)
    if width == 0:
        return True
    if width > len(haystack):
        return False
    last_start = len(haystack) - width
    return any(haystack[s:s + width] == needle for s in range(last_start + 1))


def compress_patterns(patterns):
    """
    Compress frequent patterns by dropping redundant shorter ones.

    Patterns are visited longest first (ties broken by the tuple itself).
    A pattern P is dropped when an already-kept pattern Q has exactly the same
    support and contains P as a CONTIGUOUS substring; otherwise P is kept.

    Parameters:
        patterns (dict[tuple[int, ...], number]): pattern -> support.

    Returns:
        dict[tuple[int, ...], number]: the kept patterns with their support.

    For example:
        Input  : {(6,7):3, (6,7,8):3, (6,8):2}
        Output : {(6,7,8):3, (6,8):2}
        (6,7) is a contiguous substring of (6,7,8) and both have support 3.
    """
    ordered = sorted(patterns.items(), key=lambda entry: (-len(entry[0]), entry[0]))
    kept = {}
    for candidate, support in ordered:
        absorbed = any(
            patterns[longer] == support and has_substring(candidate, longer)
            for longer in kept
        )
        if not absorbed:
            kept[candidate] = support
    return kept

def check_ordered_support(pattern, windows):
    """
    Fraction of `windows` that contain pattern[1:] as an ordered (not
    necessarily contiguous) subsequence of window[1:].

    The first element of the pattern and of each window is not compared;
    callers are expected to pass windows already anchored at pattern[0].

    Returns:
        float in [0, 1]; 0.0 when `windows` is empty.
    """
    if not windows:
        return 0.0

    print("pattern from check is : ", pattern)
    print("pattern from check is : ", pattern[0])

    tail = pattern[1:]
    hits = 0
    for window in windows:
        # Consuming a single iterator enforces left-to-right order: each
        # membership test resumes where the previous match stopped.
        remaining = iter(window[1:])
        if all(event in remaining for event in tail):
            hits += 1

    return hits / len(windows)


In [None]:
from collections import defaultdict

def generate_cartesian_event(sequence, delta=0.0):
    """
    Mine ordered event patterns of increasing length from a single sequence.

    Steps:
      1. Build the dynamic windows of the sequence and group them by their
         first event (the "anchor").
      2. S1: compute each event's relative frequency, min-max normalise it and
         keep the events whose normalised value is >= delta.
      3. S2: form every ordered pair of distinct kept events and keep the
         pairs whose check_ordered_support (against the windows anchored at
         the pair's first event) is >= delta.
      4. S3..: extend every surviving pattern with each kept event it does not
         contain yet, keep candidates with support >= delta, then compress
         them with compress_patterns. Stops when nothing survives or the
         length exceeds the module-level MAX_PATTERN_LEN.

    Parameters:
        sequence - list of event IDs.
        delta - threshold applied both to the normalised S1 value and to the
                pattern support.

    Returns:
        (all_patterns, sorted_dynamic_windows)
          all_patterns - dict: pattern length k (starting at 2) -> {pattern tuple: support}
          sorted_dynamic_windows - defaultdict(list): anchor event -> its windows

    Raises:
        ValueError - for an empty sequence (min() over an empty collection).

    Note: reads the global MAX_PATTERN_LEN and prints verbose progress output.
    """
    seq = sequence
    print(f"Sequence: {seq}")
    print("Length:", len(seq))

    dynamic_windows = dynamic_window_sequence([seq]) 
    # print("Dynamic windows generated:", dynamic_windows)
    # Group the windows by anchor so a pattern is only tested against the
    # windows that start with its first event.
    sorted_dynamic_windows = defaultdict(list)
    for _win in dynamic_windows:
        key = _win[0]
        sorted_dynamic_windows[key].append(_win)

    print("sorted_dynamic_windows:", sorted_dynamic_windows)
    seq_length = len(seq)
    S1 = list(set(seq))
    # Relative frequency of every distinct event within this sequence.
    infile_support = {e: seq.count(e)/seq_length for e in S1}

    print("S1 is ", infile_support)
    
    min_val = min(infile_support.values())
    max_val = max(infile_support.values())
    
    # Min-max normalise to [0, 1]; if all frequencies are equal, every event gets 1.0.
    normalized_infile_value = {}
    if min_val == max_val:
        normalized_infile_value = {e: 1.0 for e in infile_support}
    else:
        for e, v in infile_support.items():
            normalized_infile_value[e] = (v - min_val) / (max_val - min_val)
    
    print("Normalized S1 is ", normalized_infile_value)
    s1_new = {k: v for k, v in normalized_infile_value.items() if v >= delta}
    event_id_s1 = list(s1_new.keys())
    
    print(f"S1 Generated: {len(s1_new)} items")
    print("S1", s1_new)
    print('event_id_s1:', event_id_s1)

    # S2 candidates: every ordered pair (a, b) of distinct kept events.
    S2_candidates = set()
    for a in range(len(event_id_s1)):
        for b in range(len(event_id_s1)):
            if a != b: 
                S2_candidates.add((event_id_s1[a], event_id_s1[b]))

    all_patterns = {}
    s2_with_support = {}
    
    print(f"Total S2 candidates: {S2_candidates}")
    for pair in S2_candidates:
        print("Evaluating pair:", pair)
        print("Evaluating pair:", pair[0])
        support = check_ordered_support(pair, sorted_dynamic_windows[pair[0]])
        if support >= delta:
            s2_with_support[pair] = support

    print("S2 count is :", len(s2_with_support))
    print("S2: ", s2_with_support)
    # Note: S2 is stored uncompressed; compression is applied from length 3 on.
    all_patterns[2] = s2_with_support

    current_patterns = s2_with_support
    S1_items = list(s1_new.keys())
    k = 3

    # Level-wise growth: extend the surviving (k-1)-patterns by one event each.
    while k<=MAX_PATTERN_LEN:
        candidates = set()
        
        for pattern in current_patterns.keys(): 
            for item in S1_items:
                # An event may appear at most once in a pattern.
                if item not in pattern:
                    new_cand = tuple(list(pattern) + [item])
                    candidates.add(new_cand)
                            
        next_patterns = {}
        for m in candidates:
            # support = check_ordered_support(m, dynamic_windows)
            support = check_ordered_support(m, sorted_dynamic_windows[m[0]])
            if support >= delta:
                next_patterns[m] = support
        
        if not next_patterns:
            break

        compressed_pattern = compress_patterns(next_patterns)

        print(f"S{k}: ", len(compressed_pattern))
        print(f"S{k}: ", compressed_pattern)

        all_patterns[k] = compressed_pattern
        # Only the compressed set seeds the next level.
        current_patterns = compressed_pattern   
        k += 1
    print("\n")

    return all_patterns, sorted_dynamic_windows

In [None]:

def support_patterns(sequences, patterns):
    """
    Compute per-pattern support statistics over a group of sequences.

    For every sequence, its dynamic windows are grouped by anchor (window[0]).
    For a pattern p, only the windows anchored at p[0] are inspected and
    count_substring(p, window) is summed over them, giving one count per
    sequence. From these counts:
      - across: number of sequences whose count is > 0.
      - in_file_avg: mean count over exactly those sequences (0.0 if none).

    Parameters:
        sequences (list[list[int]]): traces as sequences of event IDs.
        patterns (collection of tuple[int, ...]): patterns to measure; it is
            iterated more than once, so it must not be a one-shot iterator.

    Returns:
        dict[tuple[int, ...], dict]: pattern -> {"across": int, "in_file_avg": float};
        empty dict when `patterns` is empty.

    Note: relies on count_substring, which is defined outside this cell.
    """
    support = {}
    if not patterns:
        return support

    # One {anchor: [windows]} mapping per sequence, in sequence order.
    windows_by_anchor_per_sequence = []
    for seq in sequences:
        grouped = {}
        for window in dynamic_window_sequence([seq]):
            if not window:
                continue
            anchor = window[0]
            if anchor not in grouped:
                grouped[anchor] = []
            grouped[anchor].append(window)
        windows_by_anchor_per_sequence.append(grouped)

    sequence_total = len(sequences)
    per_sequence_counts = {p: [0] * sequence_total for p in patterns}

    for seq_idx, grouped in enumerate(windows_by_anchor_per_sequence):
        for p in patterns:
            if not p:
                continue
            anchored_windows = grouped.get(p[0], [])
            per_sequence_counts[p][seq_idx] = sum(
                count_substring(p, window) for window in anchored_windows
            )

    for p, counts in per_sequence_counts.items():
        positive = [c for c in counts if c > 0]
        present = len(positive)
        total = sum(positive)
        average = (float(total) / float(present)) if present > 0 else 0.0
        support[p] = {"across": present, "in_file_avg": average}
    return support

def FindDiscriminative(A, B, num_A, num_B, theta=0.8, delta=0.8):
    """
    Select the patterns of group A that are discriminative with respect to group B.

    A pattern p from A is discriminative if any of the following holds:
      1. p is absent from B, or B has zero evidence of it
         (B's across-rate is 0 or B[p]["in_file_avg"] is 0).
      2. The ratio of across-sequence rates is at least theta:
         (across_A / num_A) / (across_B / num_B) >= theta
      3. The ratio of average per-sequence counts is at least delta:
         in_file_avg_A / in_file_avg_B >= delta

    Parameters:
        A : dict[tuple[int, ...], dict] - support statistics of group A; each
            entry has {"across": int, "in_file_avg": float}.
        B : dict[tuple[int, ...], dict] - same statistics for group B.
        num_A : int - number of sequences in group A (a rate of 0.0 is used if <= 0).
        num_B : int - number of sequences in group B (a rate of 0.0 is used if <= 0).
        theta : float, default=0.8 - threshold for the across-rate ratio.
        delta : float, default=0.8 - threshold for the in-file average ratio.

    Returns:
        set[tuple[int, ...]] - patterns of A that are discriminative against B.

    For example:
        A_stats = { (6,7,8): {"across": 5, "in_file_avg": 2.3} }
        B_stats = { (6,7,8): {"across": 1, "in_file_avg": 0.4} }
        FindDiscriminative(A_stats, B_stats, 10, 10, theta=0.8, delta=0.8)
        output - {(6,7,8)}
    """
    def _is_discriminative(pattern):
        if pattern not in B:
            return True
        stats_a = A[pattern]
        stats_b = B[pattern]

        rate_a = float(stats_a["across"]) / float(num_A) if num_A > 0 else 0.0
        rate_b = float(stats_b["across"]) / float(num_B) if num_B > 0 else 0.0
        avg_a = stats_a["in_file_avg"]
        avg_b = stats_b["in_file_avg"]

        # No evidence at all in B: the pattern is exclusive to A.
        if rate_b == 0.0 or avg_b == 0.0:
            return True

        across_ok = rate_b > 0.0 and (rate_a / rate_b) >= theta
        if across_ok:
            return True
        return avg_b > 0.0 and (avg_a / avg_b) >= delta

    return {pattern for pattern in A if _is_discriminative(pattern)}

def mine_discriminative_patterns_progressive(good_seqs,bad_seqs,min_sup,theta=0.8,delta=0.8,k_start=1,k_max=10,stop_when_found=True,normalize=True,repeated_event_id=False):
    """
    Mine and score discriminative patterns between two groups of sequences (good vs bad) using a two-stage DustMiner-style pipeline.

    Stage 1 - Progressive discriminative mining
      - For K = k_start..k_max:
          - Generate frequent K-length patterns separately from good and bad sequences via generateFrequentSubSequences.
          - Compute per-pattern support stats (across-sequence and in-file) for each group via support_patterns.
          - Mark patterns discriminative via FindDiscriminative using theta (across ratio) and delta (in-file ratio),
            once for good-vs-bad and once for bad-vs-good.
          - Update Scommon = findCommon(good patterns, bad patterns) for the next K.
      - Mining stops when K exceeds k_max, when neither group yields patterns, or (with stop_when_found)
        as soon as either discriminative set is non-empty.
      - Only the patterns discriminative for the bad group are carried into Stage 2.

    Stage 2 - Quantitative scoring
      - Determine the longest discriminative pattern length (max_len_found).
      - Build the frequent patterns up to max_len_found for each group via generate_frequent_patterns.
      - With normalize=True, compute P_bad(p) and P_good(p) with a chain rule over one-step extensions;
        with normalize=False, use the across-sequence rate from support_patterns instead.
      - Score each pattern with delta = P_bad - P_good and keep those with positive delta.

    Parameters:
        good_seqs : list[list[int]] - Traces labeled as good (normal data).
        bad_seqs : list[list[int]] - Traces labeled as bad (faulty).
        min_sup : minimum support threshold, forwarded unchanged to the mining helpers.
        theta : float, default=0.8 - Across-sequence presence ratio threshold (A/B) in FindDiscriminative.
        delta : float, default=0.8 - In-file average count ratio threshold (A/B) in FindDiscriminative.
        k_start : int, default=1 - Initial pattern length for progressive mining.
        k_max : int, default=10 - Maximum pattern length to mine during Stage 1.
        stop_when_found : bool, default=True - If True, stop mining as soon as either the good or the bad discriminative set is non-empty.
        normalize : bool, default=True - If True, compute Stage-2 chain probabilities; if False, fall back to across-sequence rates.
        repeated_event_id : bool, default=False - forwarded unchanged to the mining helpers.

    Returns:
        dict[tuple[int, ...], dict] - Mapping each kept pattern p to {"bad": float, "good": float, "delta": float},
        ordered by decreasing delta, then decreasing pattern length, then the pattern itself.
        An empty dict is returned when no pattern is discriminative for the bad group.

    Note: generateFrequentSubSequences, findCommon and generate_frequent_patterns are defined outside this cell.
    """
    Scommon = set()
    good_discriminative = set()
    bad_discriminative = set()
    K = k_start
    length_good_seq = len(good_seqs)
    length_bad_seq = len(bad_seqs)

    # ---- Stage 1: progressive mining, one pattern length K per iteration ----
    while True:
        if K > k_max:
            break

        k_frequent_good_patterns = generateFrequentSubSequences(good_seqs, K, Scommon, min_sup, repeated_event_id=repeated_event_id)
        k_frequent_bad_patterns = generateFrequentSubSequences(bad_seqs,  K, Scommon, min_sup, repeated_event_id=repeated_event_id)
        print("Good sequence mine patterns at stage : ",K," is : ", k_frequent_good_patterns)
        print("Bad sequence mine patterns at stage :",K, " is :", k_frequent_bad_patterns)

        # Nothing frequent in either group at this length: longer patterns are not attempted.
        if not k_frequent_good_patterns and not k_frequent_bad_patterns:
            break

        good_seq_support = support_patterns(good_seqs, set(k_frequent_good_patterns.keys()) if k_frequent_good_patterns else set())
        bad_seq_support = support_patterns(bad_seqs,  set(k_frequent_bad_patterns.keys())  if k_frequent_bad_patterns else set())

        # Good-vs-bad: patterns that characterise the good group.
        if k_frequent_good_patterns:
            discriminative_good = FindDiscriminative(good_seq_support, bad_seq_support, length_good_seq, length_bad_seq, theta=theta, delta=delta)
            for p in discriminative_good:
                if p in k_frequent_good_patterns:
                    good_discriminative.add(p)

        # Bad-vs-good: patterns that characterise the bad group.
        if k_frequent_bad_patterns:
            discriminative_bad = FindDiscriminative(bad_seq_support, good_seq_support, length_bad_seq, length_good_seq, theta=theta, delta=delta)
            for p in discriminative_bad:
                if p in k_frequent_bad_patterns:
                    bad_discriminative.add(p)

        Scommon = findCommon(k_frequent_good_patterns, k_frequent_bad_patterns)
        K += 1
        # NOTE(review): mining also stops when only good-discriminative patterns
        # were found, in which case the function returns {} below - confirm intended.
        if stop_when_found:
            if len(good_discriminative) > 0 or len(bad_discriminative) > 0:
                break

    # ---- Stage 2: score the bad-discriminative patterns ----
    patterns = set(bad_discriminative)
    if not patterns:
        return {}

    max_len_found = 1
    for p in patterns:
        if len(p) > max_len_found:
            max_len_found = len(p)

    # presumably each maps pattern tuple -> count; verify against generate_frequent_patterns
    full_bad  = generate_frequent_patterns(bad_seqs,  min_sup, max_len=max_len_found, repeated_event_id=repeated_event_id)
    full_good = generate_frequent_patterns(good_seqs, min_sup, max_len=max_len_found, repeated_event_id=repeated_event_id)

    def prefix_out_sum(full):
        # For every prefix, sum the values of all patterns that extend it by exactly one event.
        i = defaultdict(int)
        for q, c in full.items():
            if len(q) >= 2:
                i[q[:-1]] += c
        return i

    bad_out  = prefix_out_sum(full_bad)
    good_out = prefix_out_sum(full_good)

    def prob_chain(pattern, full, out, total_events):
        # Chain rule: value(first event) / total_events, multiplied for each
        # further position by value(prefix + next) / sum over all one-step
        # extensions of that prefix. Any missing factor yields 0.0.
        if total_events <= 0:
            return 0.0
        if len(pattern) == 1:
            return float(full.get(pattern, 0)) / float(total_events)
        val = float(full.get((pattern[0],), 0)) / float(total_events)
        if val == 0.0:
            return 0.0
        for i in range(1, len(pattern)):
            pref = pattern[:i]
            n = float(full.get(pattern[:i+1], 0))
            d = float(out.get(pref, 0))
            if n == 0.0 or d == 0.0:
                return 0.0
            val *= (n / d)
        return val

    # Total number of events per group, used as the denominator of the first factor.
    total_bad  = sum(len(s) for s in bad_seqs)
    total_good = sum(len(s) for s in good_seqs)

    discriminative = {}
    for p in patterns:
        if normalize:
            b = prob_chain(p, full_bad,  bad_out,  total_bad)
            g = prob_chain(p, full_good, good_out, total_good)
        else:
            # Fallback: fraction of sequences in each group that contain the pattern.
            bad_support = support_patterns(bad_seqs,  [p])
            good_support = support_patterns(good_seqs, [p])
            b = float(bad_support.get(p, {"across":0})["across"]) / float(max(1, length_bad_seq))
            g = float(good_support.get(p, {"across":0})["across"]) / float(max(1, length_good_seq))
        d = b - g
        # Keep only patterns that are more likely in the bad group.
        if d > 0:
            discriminative[p] = {"bad": b, "good": g, "delta": d} 

    # Largest delta first; ties broken by longer pattern, then by the pattern tuple.
    discriminative_sorted = dict(sorted(discriminative.items(), key=lambda x: (-x[1]["delta"], -len(x[0]), x[0])))
    return discriminative_sorted

In [None]:
# Mine patterns for every good (normal) and bad (faulty) trace.
#
# generate_cartesian_event returns (patterns_by_length, windows_by_anchor),
# where patterns_by_length is {k: {pattern: support}}. The merge and scoring
# cells below expect one flat {pattern: support} dict per trace, so the
# per-length dicts are flattened here. Previously the loop result was
# overwritten by the raw tuple for good_sequences[0] only, and
# bad_sequence_patterns was never defined although later cells read it.

def _flatten_pattern_levels(patterns_by_length):
    """Merge the {k: {pattern: support}} levels into one {pattern: support} dict."""
    flat = {}
    for level_patterns in patterns_by_length.values():
        flat.update(level_patterns)
    return flat


good_sequence_patterns = []
for i, good_sequence in enumerate(good_sequences):
    print(f"length of {i} good file is : {len(good_sequence)}")
    good_patterns_by_length, _ = generate_cartesian_event(good_sequence)
    good_sequence_patterns.append(_flatten_pattern_levels(good_patterns_by_length))


bad_sequence_patterns = []
for i, bad_sequence in enumerate(bad_sequences):
    print(f"length of {i} bad file is : {len(bad_sequence)}")
    bad_patterns_by_length, _ = generate_cartesian_event(bad_sequence)
    bad_sequence_patterns.append(_flatten_pattern_levels(bad_patterns_by_length))

In [None]:
def merge_sequence_patterns(pattern_list_of_dicts):
    """
    Average the support of every pattern over the sequences in which it occurs.

    Parameters:
        pattern_list_of_dicts - iterable of {pattern: support} dicts, one per sequence.

    Returns:
        dict {pattern: mean support}, where the mean is taken only over the
        dicts that contain the pattern. Patterns keep first-seen order.
    """
    collected = {}
    for per_sequence in pattern_list_of_dicts:
        for pattern, support in per_sequence.items():
            collected.setdefault(pattern, []).append(support)

    return {
        pattern: sum(values) / len(values)
        for pattern, values in collected.items()
    }


In [None]:
# Collapse the per-trace good patterns into one {pattern: mean support} dict.
merged_good_patterns = merge_sequence_patterns(good_sequence_patterns)
print("total Good Patterns : ", len(merged_good_patterns))

In [None]:
# NOTE(review): this cell is not idempotent - every re-run wraps the value in
# one more list level, while the scoring cell unwraps a single level only.
merged_good_patterns = [merged_good_patterns]

In [None]:
bad_sequence_patterns

In [None]:
# Score every pattern by the difference between its good and bad support.
discriminative_score = {}

# Here we take all the distinct event patterns from both bad and good logs and store them in all_patterns.

# merged_good_patterns may have been wrapped in a one-element list; unwrap one level.
if isinstance(merged_good_patterns, list):
    good_seqs = merged_good_patterns[0] 
else:
    good_seqs = merged_good_patterns

all_patterns = set(good_seqs.keys())

# bad_sequence_patterns is iterated as a collection of {pattern: support} dicts.
for d in bad_sequence_patterns:
    all_patterns.update(d.keys())

print("all_patterns :", len(all_patterns))

for pattern in all_patterns:
    # A pattern missing from the good pile counts as support 0.0.
    support_good = good_seqs.get(pattern, 0.0)
    bad_seq_count = [d[pattern] for d in bad_sequence_patterns if pattern in d]
    
    # Bad support is averaged only over the bad traces that contain the pattern.
    if bad_seq_count:
        support_bad = sum(bad_seq_count) / len(bad_seq_count)
    else:
        support_bad = 0.0

    # Positive score: stronger in good traces; negative score: stronger in bad traces.
    score = support_good - support_bad
    discriminative_score[pattern] = score

# Rank by magnitude of the difference, regardless of its sign.
sorted_patterns = sorted(discriminative_score.items(), key=lambda x: abs(x[1]), reverse=True)
threshold = 0.30

discriminative_patterns = {}
for pat, score in sorted_patterns:
    if abs(score) >= threshold:
        # The signed score difference is stored under the key "support".
        discriminative_patterns[pat] = {"support": score}

In [None]:
discriminative_patterns

In [None]:
import json
import os
import psutil
import time
from collections import defaultdict

# Baseline resident memory and start time, captured before the evaluation begins.
process = psutil.Process(os.getpid())
start_mem = process.memory_info().rss / (1024 * 1024)  # in MB
start_time = time.perf_counter() 

# Label file written by the ground-truth labelling cell (gt_test_data_labels.json).
LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/gt_test_data_labels.json"

# Minimum fraction of a ground-truth sequence that a matching pattern must cover.
MIN_COVERAGE = 0.60 

def normalize_patterns(discrinimative_obj):
    """
    Turn a collection of patterns into a list of tuples.

    Accepts None (returns []), a dict (its keys are used) or any iterable of
    patterns. Entries that are not a list, tuple or set are skipped.
    """
    if discrinimative_obj is None:
        return []

    if isinstance(discrinimative_obj, dict):
        candidates = discrinimative_obj.keys()
    else:
        candidates = discrinimative_obj

    return [tuple(p) for p in candidates if isinstance(p, (list, tuple, set))]

def normalize_gt_sequences(gt_seq_list):
    """
    Return the ground-truth data as a list of non-empty sequences.

    A flat list of ints is wrapped into a one-element list; otherwise only the
    non-empty list entries are kept. Empty or missing input gives [].
    """
    if not gt_seq_list:
        return []
    if isinstance(gt_seq_list[0], int):
        return [gt_seq_list]

    kept = []
    for candidate in gt_seq_list:
        if isinstance(candidate, list) and len(candidate) > 0:
            kept.append(candidate)
    return kept

# Load the per-test-file ground-truth sequences from LABEL_PATH.
with open(LABEL_PATH, "r") as f:
    label_map = json.load(f)

# Mined discriminative patterns as a list of tuples, ready for matching.
discriminative_patterns_seq = normalize_patterns(discriminative_patterns)

def check_subsequence(discriminative_pat, gt_seq, MIN_COVERAGE):
    """
    Check whether a mined pattern occurs as a contiguous run inside a GT sequence
    and is long enough relative to that sequence.
    Parameters: discriminative_pat -> sequence of event ids (tuple or list), e.g. (6, 7, 8)
                gt_seq -> sequence of event ids (list or tuple), e.g. [6, 7, 8, 9]
                MIN_COVERAGE -> float, minimum len(discriminative_pat) / len(gt_seq) ratio
    Returns: bool -> True only when the pattern is non-empty, not longer than gt_seq,
             reaches the coverage ratio, and appears contiguously in gt_seq
    """
    # Normalise both sides to lists: a slice of a tuple is a tuple and would never
    # compare equal to a list, which made every tuple gt_seq a silent mismatch.
    pattern = list(discriminative_pat)
    target = list(gt_seq)
    m = len(pattern)
    n = len(target)

    if m == 0 or m > n:
        return False

    # Coverage depends only on the two lengths, so decide it once before scanning.
    if not (m / n >= MIN_COVERAGE):
        return False

    return any(target[i:i + m] == pattern for i in range(n - m + 1))

# --- Evaluate the mined patterns against the ground-truth (GT) fault sequences ---
# tp / fn are counted once per GT sequence; tn is never incremented in this cell.
# NOTE(review): fp is accumulated once per (GT sequence, non-matching pattern) pair,
# so precision mixes per-sequence and per-pattern counts - confirm this is intended.
tp = fp = fn = tn = 0
matched_patterns_global = set()  # every pattern that matched at least one GT sequence

# Per-file records, one tuple per test file: (test file name, payload, label file path)
all_tp = []      # payload: [(gt_seq, patterns that matched it), ...]
all_fp = []      # payload: [(gt_seq, patterns that did not match it), ...]
all_fn = []      # payload: [gt_seq, ...] that no pattern matched
all_gt = []      # payload: normalized GT sequences of the file
y_true_all = []  # one entry per GT sequence, always 1 (every GT sequence is a fault)
y_pred_all = []  # one entry per GT sequence, 1 if some pattern matched it, else 0

label_file_path = LABEL_PATH

# Number of patterns mined by dustminer; it does not change per file, so compute it once.
total_patterns = len(discriminative_patterns_seq)

for test_file_name, raw_gt in label_map.items():
    gt_seqs_list = normalize_gt_sequences(raw_gt)

    all_gt.append((test_file_name, gt_seqs_list, label_file_path))

    correct_pred_file = []  # GT sequences of this file with the patterns that matched them
    rest_pred_file = []     # GT sequences of this file with the patterns that did not match
    false_neg_file = []     # GT sequences of this file that no pattern matched

    for gt_seq in gt_seqs_list:
        y_true_all.append(1)

        # Patterns that occur contiguously in gt_seq and reach MIN_COVERAGE of its length.
        matched_for_gt = [
            pattern for pattern in discriminative_patterns_seq
            if check_subsequence(pattern, gt_seq, MIN_COVERAGE)
        ]
        matched_patterns_global.update(matched_for_gt)
        matched_count = len(matched_for_gt)

        if matched_count == 0:
            # No pattern matched this GT sequence: a miss, and every pattern counts as FP.
            fn += 1
            fp += total_patterns
            false_neg_file.append(gt_seq)
            rest_pred_file.append((gt_seq, list(discriminative_patterns_seq)))
            y_pred_all.append(0)
        else:
            # At least one pattern matched: a hit; the remaining patterns count as FP.
            tp += 1
            fp += total_patterns - matched_count

            # Set lookup keeps this filter linear in the number of patterns.
            matched_lookup = set(matched_for_gt)
            fp_for_gt = [p for p in discriminative_patterns_seq if p not in matched_lookup]

            correct_pred_file.append((gt_seq, matched_for_gt))
            rest_pred_file.append((gt_seq, fp_for_gt))
            y_pred_all.append(1)

    all_tp.append((test_file_name, correct_pred_file, label_file_path))
    all_fp.append((test_file_name, rest_pred_file, label_file_path))
    all_fn.append((test_file_name, false_neg_file, label_file_path))

# Precision / recall / F1 from the counters above; each is 0.0 when its denominator is empty.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"TP={tp} FP={fp} FN={fn} TN={tn}")
print(f"precision={precision:.3f}  recall={recall:.3f}  f1={f1:.3f}")

# Matched patterns ordered by length first, then by the tuples' own ordering.
matched_patterns = sorted(matched_patterns_global, key=lambda x: (len(x), x))
print(f"\nMatched {len(matched_patterns)} discriminative patterns with GT sequences:")

for p in matched_patterns:
    print(p)

# Mean length of the matched patterns (0 when nothing matched).
avg_value_length_tp_discriminative = (
    sum(len(p) for p in matched_patterns) / len(matched_patterns)
    if matched_patterns else 0
)

# Resource report relative to the baseline taken at the top of this cell.
end_mem = process.memory_info().rss / (1024 * 1024)
print(f"Memory used: {end_mem - start_mem:.2f} MB")

end_time = time.perf_counter()
elapsed_ms = (end_time - start_time) * 1000
print(f"\nTime taken: {elapsed_ms:.2f} ms")


In [None]:
# --- Per-class recall: split every GT sequence into detected / missed, grouped by class id ---
classwise_fn = defaultdict(list)  # class id -> GT sequences listed as false negatives
classwise_tp = defaultdict(list)  # class id -> all other GT sequences (treated as detected)
gt_len = 0

# JSON mapping: test file name without '.json' -> list of class ids, one per GT sequence.
CLASS_LABEL_PATH = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/diag_subseq/subseq/subseq_labels/subseq_class.json"

with open(CLASS_LABEL_PATH, 'r') as f:
    class_mapping = json.load(f)

# all_fn and all_gt are appended once per test file in the same order by the
# evaluation cell, so zip pairs the two records of the same file.
for file_fn, file_gt in zip(all_fn, all_gt):
    test_filename = file_gt[0].replace('.json', '')

    # Deliberately NOT named `fn`: that name is the false-negative counter of the
    # evaluation cell, and rebinding it to a list here would silently corrupt it.
    missed_seqs = file_fn[1]
    gt = file_gt[1]

    if test_filename in class_mapping:
        class_ids = class_mapping[test_filename]

        if len(gt) != len(class_ids):
            print(f"Mismatch in {test_filename}. GT len: {len(gt)}, Class ID len: {len(class_ids)}")
            continue

        # GT sequences and class ids are aligned by position.
        for label, current_class_id in zip(gt, class_ids):
            if label in missed_seqs:
                classwise_fn[current_class_id].append(label)
            else:
                classwise_tp[current_class_id].append(label)
    else:
        print(f"{test_filename} not found in class mapping JSON.")

    gt_len += len(gt)

total_fn = 0
total_tp = 0
keys = set(classwise_fn) | set(classwise_tp)
class_recall = []
for key in keys:
    print('class:', key)
    missed_count = len(classwise_fn[key])
    detected_count = len(classwise_tp[key])
    total_fn += missed_count
    total_tp += detected_count

    # Every key has at least one entry in one of the two dicts, so the sum is never zero.
    crecall = detected_count / (missed_count + detected_count)

    print('detected:', detected_count)
    print('total anomalies:', missed_count + detected_count)
    print('Recall (classwise):', crecall)
    print('\n')

    class_recall.append(crecall)


# print('total fn+tp:', total_fn+total_tp)
# print('total gt:', gt_len)
# assert total_fn+total_tp == gt_len, 'total fn+tp not equal to total gt'
print('All class recalls:', class_recall)

In [None]:
# Inspect the collected ground truth: one (test file name, GT sequences, label file path) tuple per test file.
all_gt

In [None]:
# Directory with the per-file detection traces; the file names are the keys of the GT label map.
detection_seq_path = f"{base_dir}/{CODE}/{THREAD}_thread/version_{VER}/faulty_data/diag_subseq/subseq"

def load_detection_seq(json_path):
    """
    Read one test data file and keep the first field of every row.
    Rows are expected to look like [event_id, timestamp]; only the event_id is returned.
    Parameters: json_path -> string
    Returns: event_ids -> [6,7,8,6,7]; [] when the JSON content is not a non-empty list
             whose first item is itself a list
    """
    with open(json_path, "r") as handle:
        rows = json.load(handle)

    # Only the first row is inspected to decide whether the file has the expected layout.
    has_row_layout = isinstance(rows, list) and len(rows) > 0 and isinstance(rows[0], list)
    if not has_row_layout:
        return []

    return [row[0] for row in rows]

# test file name -> list of event ids read from that file's detection trace.
# A file listed in the GT labels but missing on disk is stored as an empty list.
detection_seq_map = {}

for test_file_name in label_map:  # label_map holds the GT labels, keyed by test file name
    detection_path = os.path.join(detection_seq_path, test_file_name)
    detection_seq_map[test_file_name] = (
        load_detection_seq(detection_path)  # event ids only, the second field of each row is dropped
        if os.path.exists(detection_path)
        else []
    )

def count_occurrences(subseq, pattern):
    """
    Count how many times pattern appears as a contiguous run in the larger event_id sequence.
    Overlapping appearances are each counted. Here pattern is the gt_seq.
    Parameters: subseq -> list[int] [6, 7, 8, 6, 7, 8, 9], or None
                pattern -> list[int] [6, 7, 8]
    Returns: int -> count of pattern in subseq, or None when subseq is None
    """
    if subseq is None:
        return None

    width = len(pattern)
    last_start = len(subseq) - width
    return sum(
        1
        for start in range(last_start + 1)
        if subseq[start:start + width] == pattern
    )


def analyze_fault(gt_seqs, detection_subseq, matched_patterns, min_coverage=None):
    """
    Label every GT sequence of one test file as detected ("TP") or missed ("FN") and
    report how often it occurs in the file's detection sequence.

    A GT sequence is "TP" when at least one of matched_patterns matches it under
    check_subsequence (contiguous occurrence plus coverage ratio). This is the same rule
    the evaluation cell uses for its tp / fn counters; comparing the GT tuple for exact
    equality with a pattern would label every partial-coverage detection as "FN".

    Parameters: gt_seqs -> list of GT sequences, e.g. [[12, 6, 7, 8, 9, 6]]
                detection_subseq -> list[int] of event ids for the file, or None
                matched_patterns -> iterable of patterns (tuples or lists)
                min_coverage -> float or None; None falls back to the notebook-level MIN_COVERAGE
    Returns: list[dict] -> one dict per GT sequence with keys
             "gt_seq", "occurrences_in_detection" and "status"
    """
    coverage = MIN_COVERAGE if min_coverage is None else min_coverage
    patterns = [tuple(p) for p in matched_patterns]

    results = []
    for gt in gt_seqs:
        occ = count_occurrences(detection_subseq, gt)

        # list(gt) keeps the comparison inside check_subsequence list-vs-list.
        detected = any(check_subsequence(p, list(gt), coverage) for p in patterns)

        results.append({
            "gt_seq": gt,
            "occurrences_in_detection": occ,
            "status": "TP" if detected else "FN"
        })

    return results

# all_gt holds one tuple per test file, for example
# ('trace_trial1_1090-1140.json', [[12, 6, 7, 8, 9, 6]], './trace_data/theft_protection/single_thread/version_4/faulty_data/gt_test_data_labels.json')
fault_details = []
for file_name, gt_seqs_list, label_file_path in all_gt:
    detection_subseq = detection_seq_map.get(file_name, [])
    # matched_patterns_global: every discriminative pattern that matched at least one GT sequence
    results = analyze_fault(gt_seqs_list, detection_subseq, matched_patterns_global)
    fault_details.append((file_name, results))

# Flatten the per-file results into a single list of per-GT-sequence entries.
all_entries = [entry for _, file_results in fault_details for entry in file_results]

# NOTE(review): count_occurrences returns None only for a None detection sequence, and the
# lookup above defaults to [] - so as written the "is not None" filters never drop an entry.
fn_list = [
    entry for entry in all_entries
    if entry["status"] == "FN" and entry["occurrences_in_detection"] is not None
]
tp_list = [
    entry for entry in all_entries
    if entry["status"] == "TP" and entry["occurrences_in_detection"] is not None
]

# Missed GT sequences that occur exactly once, and detected ones that occur at least twice.
fn_list_with_single_occurence = [
    entry for entry in fn_list if entry["occurrences_in_detection"] == 1
]
tp_list_with_more_occurence = [
    entry for entry in tp_list if entry["occurrences_in_detection"] >= 2
]

print(f"Total GT sequences with detection : {len(all_entries)}")
print(f"TP count : {len(tp_list)}")
print(f"FN count : {len(fn_list)}")
print(f"FN with exactly 1 occurrence : {len(fn_list_with_single_occurence)}")
print(f"TP with more than 1 occurrences : {len(tp_list_with_more_occurence)}")