In [1]:
# Dustminer Implementation
# Loading the normal data as Good pile and Faulty data as Bad pile

In [2]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
from libraries.utils import get_paths, read_traces, read_json, mapint2var, is_consistent

In [3]:
# Configuration
CODE = 'theft_protection'               ### application (code) theft_protection, mamba2, lora_ducy
BEHAVIOUR_FAULTY = 'faulty_data'        ### normal, faulty_data
BEHAVIOUR_NORMAL = 'normal'             ### normal, faulty_data
THREAD = 'single'                       ### single, multi
VER = 3                                 ### format of data collection

base_dir = './trace_data'              ### can be replaced with 'csv', 'exe_plot', 'histogram'
normalbase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_NORMAL}'
faultybase_path = base_dir+f'/{CODE}/{THREAD}_thread/version_{VER}/{BEHAVIOUR_FAULTY}'

print("Normal base path:", normalbase_path)
print("Faulty base path:", faultybase_path)

Normal base path: ./trace_data/theft_protection/single_thread/version_3/normal
Faulty base path: ./trace_data/theft_protection/single_thread/version_3/faulty_data


In [4]:
train_base_path = os.path.join(normalbase_path, 'train_data')
print("Train base path:", train_base_path)

print("Current working directory:", os.getcwd())
train_data_path = [os.path.join(train_base_path, x) for x in os.listdir(train_base_path)]
train_varlist_path = [os.path.join(normalbase_path, x) for x in os.listdir(normalbase_path) if 'varlist' in x]

######### get paths #######################
paths_log, paths_traces, varlist_path, paths_label = get_paths(faultybase_path)

train_data_path = [x for x in train_data_path if '.DS_Store' not in x]
train_varlist_path = [x for x in train_varlist_path if '.DS_Store' not in x]
paths_log = [x for x in paths_log if '.DS_Store' not in x]
paths_traces = [x for x in paths_traces if '.DS_Store' not in x]
varlist_path = [x for x in varlist_path if '.DS_Store' not in x]
paths_label = [x for x in paths_label if '.DS_Store' not in x]

paths_log.sort()
paths_traces.sort()
varlist_path.sort()
paths_label.sort()

test_data_path = paths_traces
test_label_path = paths_label

Train base path: ./trace_data/theft_protection/single_thread/version_3/normal\train_data
Current working directory: c:\Uni Bremen\Job\Comnets\Dustminer_vardiag


In [5]:
train_data_path

['./trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_0_110.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_1250_2000.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_150_900.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_2050_2300.json']

In [6]:
# Check consistency
if VER == 3:
    check_con, _ = is_consistent([train_varlist_path[0]] + varlist_path)
    if check_con:
        to_number = read_json(varlist_path[0])
        from_number = mapint2var(to_number)
    else:
        to_number = read_json(train_varlist_path[0])
        from_number = mapint2var(to_number)

sorted_keys = list(from_number.keys())
sorted_keys.sort()
var_list = [from_number[key] for key in sorted_keys]

varlist 1 is consistent with varlist 0
varlist 2 is consistent with varlist 0
varlist 3 is consistent with varlist 0


In [7]:
# Loading the event sequences
def load_data(file_paths):
    data = []
    for file in file_paths:
        traces = read_traces(file)
        if isinstance(traces, list):
            id_sequence = [int(trace[0]) for trace in traces if isinstance(trace, list) and len(trace) >= 2]
            data.append(id_sequence)
    return data

In [8]:
from collections import defaultdict
MIN_SUP = 2
SEGMENT_WIDTH = 50
TOP_K_SEGMENTS = 5
NORMALIZE_SUPPORT = True

good_log_directory = train_data_path
bad_log_directory = test_data_path

In [9]:
good_log_directory

['./trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_0_110.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_1250_2000.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_150_900.json',
 './trace_data/theft_protection/single_thread/version_3/normal\\train_data\\interval_2050_2300.json']

In [10]:
bad_log_directory

['./trace_data/theft_protection/single_thread/version_3/faulty_data\\trace_trial1.json',
 './trace_data/theft_protection/single_thread/version_3/faulty_data\\trace_trial2.json',
 './trace_data/theft_protection/single_thread/version_3/faulty_data\\trace_trial3.json']

In [11]:
good_sequences = load_data(good_log_directory)
bad_sequences = load_data(bad_log_directory)

In [12]:
import math
from collections import defaultdict

def max_gap_two_lists(sequences):
    """
    Two-list method:
      - events: ordered list of unique event IDs
      - global_max: maximum gap across all events
      - main: event id which has the maximum gap
    Returns (events, main, global_max)
    """
    events = [] 
    idx_of = {}          
    main = []            

    for seq in sequences:
        seq_best = [0] * len(events)   
        last_seen = [-1] * len(events) 

        for i, x in enumerate(seq):
            if x not in idx_of:
                idx_of[x] = len(events)
                events.append(x)
                main.append(0)
                seq_best.append(0)
                last_seen.append(-1)

            k = idx_of[x]
            if last_seen[k] != -1:
                gap = i - last_seen[k]       
                if gap > seq_best[k]:
                    seq_best[k] = gap        
                if seq_best[k] > main[k]:
                    main[k] = seq_best[k]    

            last_seen[k] = i                 

    global_max = max(main) if main else 0
    return events, main, global_max


In [13]:
all_sequences_1 = good_sequences
event_list, corr_event, MAX_PATTERN_LEN = max_gap_two_lists(all_sequences_1)
# MAX_PATTERN_LEN = 27

In [14]:
event_list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [15]:
MAX_PATTERN_LEN

26

In [16]:
corr_event

[0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 26, 26, 26, 26, 26, 26]

In [17]:
good_sequences

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15],
 [13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,

In [18]:
good_sequences = [good_sequences[0][:150]]     
bad_sequences  = [bad_sequences[0][:150]]     

In [19]:
good_sequences

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15]]

In [20]:
bad_sequences

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  6,
  7,
  8,
  9,
  13,
  14,
  15,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7,
  8,
  9,
  6,
  7]]

In [None]:
from collections import defaultdict
from itertools import combinations

def is_subsequence(small, big):
    it = iter(big)
    return all(c in it for c in small)

def windows_dynamic(seq, e1):
    """
    dynamic windows:
    for each occurrence of e1 at index i, window = [i, next_e1_index) or [i, len(seq))
    """
    idxs = [i for i, x in enumerate(seq) if x == e1]
    if not idxs:
        return
    
    yield idxs[0], (idxs[1] if len(idxs) > 1 else len(seq))

    for k in range(1, len(idxs)):
        s = idxs[k] - 1
        e = idxs[k+1] if k+1 < len(idxs) else len(seq)
        if s < 0:
            s = 0
        yield s, e
    
    # for k, i in enumerate(idxs):
    #     j = idxs[k+1] if k+1 < len(idxs) else len(seq)
    #     yield i, j

def pattern_in_window(seq, s, e, pattern):
    """
    Check if the pattern appears as an ordered subsequence inside seq[s:e].
    """
    it = iter(seq[s:e])
    return all(p in it for p in pattern)

def support_dynamic(sequences, pattern):
    """
    Support counted via dynamic windows keyed by the pattern's first item.
    """
    if not pattern:
        return 0
    e1 = pattern[0]
    supp = 0
    for seq in sequences:
        for s, e in windows_dynamic(seq, e1):
            if pattern_in_window(seq, s, e, pattern):
                supp += 1
    return supp

def seq_pattern_mining(p, q):
    return p + (q[-1],) if p[1:] == q[:-1] else None

def subseqs_kminus1(pat):
    k = len(pat)
    for i in range(k):
        yield pat[:i] + pat[i+1:]


from collections import defaultdict

def generate_frequent_patterns(sequences, min_sup, max_len=27):
    """
    Frequent patterns with dynamic windows, printed step-by-step.
    Candidate generation uses combinations-only:
      - k=2: (a,b) with a < b
      - k>=3: Apriori join with increasing constraint (p[1:]==q[:-1] and p[-1] < q[-1])
    Returns: {pattern(tuple): support(int)}
    """

    item_counts = defaultdict(int)
    for seq in sequences:
        for e in seq:
            item_counts[(e,)] += 1

    print("Step 1 length=1: counts")
    print("  " + ", ".join(f"{str(p[0])} - {c}" for p, c in sorted(item_counts.items())))
    fre_pattern = {p for p, c in item_counts.items() if c >= min_sup}
    supports = {p: item_counts[p] for p in fre_pattern}
    print(f"S1 = {{{', '.join(str(p[0]) for p in sorted(fre_pattern))}}}\n")

    k = 2
    while fre_pattern and k <= max_len:
        candidates = set()

        if k == 2:
            singles = sorted(p[0] for p in fre_pattern)
            for i in range(len(singles)):
                for j in range(i + 1, len(singles)):
                    candidates.add((singles[i], singles[j]))
        else:
            prev = sorted(fre_pattern)
            for p in prev:
                for q in prev:
                    if p[1:] == q[:-1] and p[-1] < q[-1]:
                        cand = p + (q[-1],)
                        if all(sub in fre_pattern for sub in subseqs_kminus1(cand)):
                            candidates.add(cand)

        print(f"Step {k} length={k}: sets")  
        if not candidates:
            print("  no sets, stopping.\n")
            break
        print("  " + ", ".join("-".join(map(str, c)) for c in sorted(candidates)))

        counts = {cand: support_dynamic(sequences, cand) for cand in sorted(candidates)}
        print("  support value: " + ", ".join(f"{'-'.join(map(str, c))} – {v}" for c, v in counts.items()))

        fre_pattern = {p for p, c in counts.items() if c >= min_sup}
        if fre_pattern:
            supports.update({p: counts[p] for p in fre_pattern})
            print(f"S{k} = {{{', '.join('-'.join(map(str, p)) for p in sorted(fre_pattern))}}}\n")
        else:
            print(f"no patterns satisfy minSup ({min_sup}) at length {k}\n")
            break

        k += 1

    # printing the final frequent patterns
    print("Final frequent patterns:")
    by_len = defaultdict(list)
    for p, c in supports.items():
        by_len[len(p)].append((p, c))
    for L in sorted(by_len):
        items = ", ".join("-".join(map(str, p)) for p, _ in sorted(by_len[L]))
        print(f"Length {L}: {items}")

    return supports


In [22]:
def is_subsequence(small, big):
    it = iter(big)
    return all(c in it for c in small)

def compress_patterns(patterns):
    """
    Remove redundant subsequences from a dictionary of patterns.
    This function keeps only the longest unique patterns and removes those that are subsequences of another pattern with the same support. 
    This avoids duplication in the frequent pattern set.
    Parameters - patterns : dict - A dictionary where keys are patterns as tuples and values are their support counts (int).
    Returns - dict - A compressed dictionary of patterns, with redundant subsequences removed.
    eg - patterns = {
    (1, 2): 3,
    (1, 2, 3): 3,
    (2, 3): 2
    }
    output - {(1, 2, 3): 3, (2, 3): 2}
    """
    compressed = {}
    # Sort patterns by descending length
    for pattern, support in sorted(patterns.items(), key=lambda x: (-len(x[0]), x[0])):
        is_subseq = False
        # If pattern is a subsequence of an already kept pattern with same support then we skip it
        for other_pattern in compressed:
            if is_subsequence(pattern, other_pattern) and patterns[other_pattern] == support:
                is_subseq = True
                break
        # Keeping the pattern if not redundant    
        if not is_subseq:
            compressed[pattern] = support
    return compressed


In [23]:
def build_full_counts(sequences, max_len):
    return generate_frequent_patterns(sequences, min_sup=1, max_len=max_len)

from collections import defaultdict

def normalize(patterns, sequences, max_len=27):
    """
    Chain-rule normalization:
      len=1: P(e) = count(e) / total_events
      len>1: P(e1..ek) = P(e1) * Π P(e_{i+1} | e1..ei),
                         with P(next|prefix) = count(prefix+next) / Σ_x count(prefix+x)
    Returns {pattern: probability}
    """
    full_counts = build_full_counts(sequences, max_len=max_len)
    total_events = sum(len(s) for s in sequences) or 1

    prefix_out_sum = defaultdict(int)
    for q, c in full_counts.items():
        if len(q) >= 2:
            prefix = q[:-1]
            prefix_out_sum[prefix] += c

    def supp(p): 
        return full_counts.get(p, 0)

    probs = {}
    for pat in patterns:
        k = len(pat)
        if k == 1:
            probs[pat] = supp(pat) / total_events
            continue

        p = supp((pat[0],)) / total_events
        if p == 0.0:
            probs[pat] = 0.0
            continue

        is_success = True
        for i in range(1, k):
            prefix = pat[:i]
            num = supp(pat[:i+1])
            den = prefix_out_sum.get(prefix, 0)
            if num == 0 or den == 0:
                p = 0.0
                is_success = False
                break
            p *= (num / den)
        probs[pat] = p if is_success else 0.0
    return probs
 

In [24]:
NORMALIZE_SUPPORT = True 

def mine_discriminative_patterns(good_seqs, bad_seqs, min_sup):
    """
    Sequential (paper-style) discriminative mining:
      - Mine frequent ordered patterns with dynamic windows in good & bad
      - Sequence-compress (drop redundant subseqs with same support)
      - normalize per pile
      - Return patterns with higher prominence in bad than good
    """
    # Stage-1 frequent patterns
    good_patterns = generate_frequent_patterns(good_seqs, min_sup, max_len=MAX_PATTERN_LEN)
    bad_patterns  = generate_frequent_patterns(bad_seqs,  min_sup, max_len=MAX_PATTERN_LEN)

    # Compression
    good_patterns = compress_patterns(good_patterns)
    bad_patterns  = compress_patterns(bad_patterns)

    # Normalization
    if NORMALIZE_SUPPORT:
        good_patterns = normalize(good_patterns, good_seqs, max_len=MAX_PATTERN_LEN)
        bad_patterns  = normalize(bad_patterns,  bad_seqs,  max_len=MAX_PATTERN_LEN)

    # Discriminative diff
    discriminative = {}
    for pattern, bad_val in bad_patterns.items():
        good_val = good_patterns.get(pattern, 0)
        diff = bad_val - good_val
        if diff > 0:
            discriminative[pattern] = {'bad': bad_val, 'good': good_val, 'delta': diff}

    return discriminative


In [None]:
SEGMENT_WIDTH = 50
TOP_K_SEGMENTS = 5

def segment_log(sequence, width):
    if width <=0:
        return []
    segments_list = []
    i = 0
    while i < len(sequence):
        j = min(i+width, len(sequence))
        segments_list.append(sequence[i:j])
        if j == len(sequence):
            break
        i = j - 1
    return segments_list


    # return [sequence[i:i+width] for i in range(0, len(sequence), width)]

def _score_segment(seg, discr_keys):
    return sum(1 for p in discr_keys if is_subsequence(p, seg))

def segment_and_mine(bad_seqs, discriminative_patterns, seg_width=SEGMENT_WIDTH, top_k=TOP_K_SEGMENTS, max_len=MAX_PATTERN_LEN):
    discr_keys = list(discriminative_patterns.keys()) if isinstance(discriminative_patterns, dict) else list(discriminative_patterns)

    segments, scores = [], []
    for seq in bad_seqs:
        for seg in segment_log(seq, seg_width):
            segments.append(seg)
            scores.append(_score_segment(seg, discr_keys))

    if not segments:
        return {}

    top_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    top_segments = [segments[i] for i in top_idx]

    raw = generate_frequent_patterns(top_segments, min_sup=1, max_len=max_len)
    return compress_patterns(raw)


In [26]:
discriminative_patterns = mine_discriminative_patterns(good_sequences, bad_sequences, MIN_SUP)
#discriminative_patterns = mine_discriminative_patterns(good_20, bad_20, min_sup=2)

Step 1 length=1: counts
  0 - 1, 1 - 1, 2 - 1, 3 - 1, 4 - 1, 5 - 1, 6 - 20, 7 - 20, 8 - 20, 9 - 20, 10 - 4, 11 - 4, 12 - 4, 13 - 4, 14 - 4, 15 - 4
S1 = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

Step 2 length=2: sets
  6-7, 6-8, 6-9, 6-10, 6-11, 6-12, 6-13, 6-14, 6-15, 7-8, 7-9, 7-10, 7-11, 7-12, 7-13, 7-14, 7-15, 8-9, 8-10, 8-11, 8-12, 8-13, 8-14, 8-15, 9-10, 9-11, 9-12, 9-13, 9-14, 9-15, 10-11, 10-12, 10-13, 10-14, 10-15, 11-12, 11-13, 11-14, 11-15, 12-13, 12-14, 12-15, 13-14, 13-15, 14-15
  support value: 6-7 – 20, 6-8 – 20, 6-9 – 20, 6-10 – 4, 6-11 – 4, 6-12 – 4, 6-13 – 4, 6-14 – 4, 6-15 – 4, 7-8 – 20, 7-9 – 20, 7-10 – 4, 7-11 – 4, 7-12 – 4, 7-13 – 4, 7-14 – 4, 7-15 – 4, 8-9 – 20, 8-10 – 4, 8-11 – 4, 8-12 – 4, 8-13 – 4, 8-14 – 4, 8-15 – 4, 9-10 – 4, 9-11 – 4, 9-12 – 4, 9-13 – 4, 9-14 – 4, 9-15 – 4, 10-11 – 4, 10-12 – 4, 10-13 – 4, 10-14 – 4, 10-15 – 4, 11-12 – 4, 11-13 – 4, 11-14 – 4, 11-15 – 4, 12-13 – 4, 12-14 – 4, 12-15 – 4, 13-14 – 4, 13-15 – 4, 14-15 – 4
S2 = {6-7, 6-8, 6-9, 6-10, 

In [27]:
discriminative_patterns

{(6, 7, 8, 9, 10, 11, 12): {'bad': 0.0006385799348387821,
  'good': 0.0005124294897022169,
  'delta': 0.0001261504451365652},
 (6, 7, 8, 9, 13, 14, 15): {'bad': 0.0006385799348387821,
  'good': 0.0005124294897022169,
  'delta': 0.0001261504451365652},
 (6, 7, 8, 9): {'bad': 0.0076629592180653865,
  'good': 0.006149153876426603,
  'delta': 0.0015138053416387832},
 (6, 7): {'bad': 0.048753623188405794,
  'good': 0,
  'delta': 0.048753623188405794}}

In [28]:
stage2_patterns = segment_and_mine(bad_sequences, discriminative_patterns)

Step 1 length=1: counts
  0 - 1, 1 - 1, 2 - 1, 3 - 1, 4 - 1, 5 - 1, 6 - 29, 7 - 29, 8 - 28, 9 - 28, 10 - 5, 11 - 5, 12 - 5, 13 - 5, 14 - 5, 15 - 5
S1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

Step 2 length=2: sets
  0-1, 0-2, 0-3, 0-4, 0-5, 0-6, 0-7, 0-8, 0-9, 0-10, 0-11, 0-12, 0-13, 0-14, 0-15, 1-2, 1-3, 1-4, 1-5, 1-6, 1-7, 1-8, 1-9, 1-10, 1-11, 1-12, 1-13, 1-14, 1-15, 2-3, 2-4, 2-5, 2-6, 2-7, 2-8, 2-9, 2-10, 2-11, 2-12, 2-13, 2-14, 2-15, 3-4, 3-5, 3-6, 3-7, 3-8, 3-9, 3-10, 3-11, 3-12, 3-13, 3-14, 3-15, 4-5, 4-6, 4-7, 4-8, 4-9, 4-10, 4-11, 4-12, 4-13, 4-14, 4-15, 5-6, 5-7, 5-8, 5-9, 5-10, 5-11, 5-12, 5-13, 5-14, 5-15, 6-7, 6-8, 6-9, 6-10, 6-11, 6-12, 6-13, 6-14, 6-15, 7-8, 7-9, 7-10, 7-11, 7-12, 7-13, 7-14, 7-15, 8-9, 8-10, 8-11, 8-12, 8-13, 8-14, 8-15, 9-10, 9-11, 9-12, 9-13, 9-14, 9-15, 10-11, 10-12, 10-13, 10-14, 10-15, 11-12, 11-13, 11-14, 11-15, 12-13, 12-14, 12-15, 13-14, 13-15, 14-15
  support value: 0-1 – 1, 0-2 – 1, 0-3 – 1, 0-4 – 1, 0-5 – 1, 0-6 – 1, 0-7 – 1,

In [29]:
stage2_patterns

{(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12): 1,
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15): 1,
 (0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15): 1,
 (6, 7, 8, 9, 10, 11, 12): 3,
 (6, 7, 8, 9, 13, 14, 15): 5,
 (10, 11, 12, 13, 14, 15): 5,
 (8, 9, 10, 11, 12): 4,
 (6, 7, 8, 9): 27,
 (6, 7): 29,
 (8, 9): 28}

In [30]:
df1 = pd.DataFrame([
    {'Pattern': pattern, 'Bad Support': v['bad'], 'Good Support': v['good'], 'Difference': v['delta']}
    for pattern, v in sorted(discriminative_patterns.items(), key=lambda x: -x[1]['delta'])
])

df2 = pd.DataFrame([
    {'Pattern': pattern, 'Support': support}
    for pattern, support in sorted(stage2_patterns.items(), key=lambda x: -x[1])
])

from IPython.display import display
print("Stage 1: Discriminative Patterns")
display(df1)

print("\nStage 2: Infrequent Root-Cause Patterns (from top-K segments)")
display(df2)

Stage 1: Discriminative Patterns


Unnamed: 0,Pattern,Bad Support,Good Support,Difference
0,"(6, 7)",0.048754,0.0,0.048754
1,"(6, 7, 8, 9)",0.007663,0.006149,0.001514
2,"(6, 7, 8, 9, 10, 11, 12)",0.000639,0.000512,0.000126
3,"(6, 7, 8, 9, 13, 14, 15)",0.000639,0.000512,0.000126



Stage 2: Infrequent Root-Cause Patterns (from top-K segments)


Unnamed: 0,Pattern,Support
0,"(6, 7)",29
1,"(8, 9)",28
2,"(6, 7, 8, 9)",27
3,"(6, 7, 8, 9, 13, 14, 15)",5
4,"(10, 11, 12, 13, 14, 15)",5
5,"(8, 9, 10, 11, 12)",4
6,"(6, 7, 8, 9, 10, 11, 12)",3
7,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)",1
8,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15)",1
9,"(0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15)",1


In [31]:
import os
import time

def test_single(file_path):
    detections = []

    # Read the test trace (list of [event_id, timestamp])
    trace = read_traces(file_path)
    sequence = [int(ev[0]) for ev in trace if isinstance(ev, list) and len(ev) >= 2]
    timestamps = [int(ev[1]) for ev in trace if isinstance(ev, list) and len(ev) >= 2]
    filename = os.path.basename(file_path)

    pattern_list = list(discriminative_patterns.keys()) + list(stage2_patterns.keys())
    start_time = time.time()

    for i in range(len(sequence)):
        for pattern in pattern_list:
            pattern_len = len(pattern)
            if i + pattern_len <= len(sequence) and tuple(sequence[i:i+pattern_len]) == pattern:
                start_idx = i
                end_idx = i + pattern_len - 1
                start_ts = timestamps[start_idx]
                end_ts = timestamps[end_idx]
                detections.append([
                    (start_idx, end_idx),
                    (start_ts, end_ts),
                    filename
                ])

    inference_time = (time.time() - start_time) * 1000 

    return detections, inference_time

In [32]:
from libraries.anomaly_detection import merge_detections, get_correct_detections


## checking the detections against the ground truth
DIFF_VAL = 0 
all_detections = []         # To store detections for each file
y_pred_all = []             # To store the predicted labels
y_true_all = []             # To store the ground truth labels
all_tp = []                 # To store all true positives
all_fp = []                 # To store all false positives
all_fn = []                 # To store all false negatives
all_gt = []                 # To store the ground truth


# Iterating through each test data file and label file
for test_data, test_label in zip(test_data_path, test_label_path):
    detection, inference_time = test_single(test_data)            # Detecting anomalies in the test data
    print("Detection : ", detection)

    all_detections.append((test_data, detection, test_label))
    merge_detection, agg_ts = merge_detections(detection, diff_val=DIFF_VAL)

    print("Merge detection : ", merge_detection)
    
    ground_truth_raw = read_traces(test_label)                                               # read ground truth labels from the label file
    ground_truth = ground_truth_raw['labels']                                                # extract labels from dictionary from ground truth data

    label_trace_name = list(ground_truth.keys())[0]
    ground_truth = ground_truth[label_trace_name]

    correct_pred, rest_pred, y_pred, y_true, false_neg = get_correct_detections(merge_detection, ground_truth)  # Comparing detected anomaly with ground truth

    y_pred_all.extend(y_pred)          # predicted labels
    y_true_all.extend(y_true)          # actual ground truth labels
    all_tp.append((test_data, correct_pred, test_label))
    all_fp.append((test_data, rest_pred, test_label))
    all_fn.append((test_data, false_neg, test_label))
    all_gt.append((test_data, ground_truth, test_label))

    print("Inference time : ", (inference_time/32))

Detection :  [[(6, 9), (1877, 2924), 'trace_trial1.json'], [(6, 7), (1877, 1903), 'trace_trial1.json'], [(6, 9), (1877, 2924), 'trace_trial1.json'], [(6, 7), (1877, 1903), 'trace_trial1.json'], [(8, 9), (2919, 2924), 'trace_trial1.json'], [(10, 13), (2928, 3975), 'trace_trial1.json'], [(10, 11), (2928, 2953), 'trace_trial1.json'], [(10, 13), (2928, 3975), 'trace_trial1.json'], [(10, 11), (2928, 2953), 'trace_trial1.json'], [(12, 13), (3970, 3975), 'trace_trial1.json'], [(14, 17), (3979, 5025), 'trace_trial1.json'], [(14, 15), (3979, 4004), 'trace_trial1.json'], [(14, 17), (3979, 5025), 'trace_trial1.json'], [(14, 15), (3979, 4004), 'trace_trial1.json'], [(16, 17), (5021, 5025), 'trace_trial1.json'], [(18, 21), (5030, 6076), 'trace_trial1.json'], [(18, 19), (5030, 5055), 'trace_trial1.json'], [(18, 21), (5030, 6076), 'trace_trial1.json'], [(18, 19), (5030, 5055), 'trace_trial1.json'], [(20, 21), (6071, 6076), 'trace_trial1.json'], [(22, 28), (6080, 7140), 'trace_trial1.json'], [(22, 25)

In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred_all = np.array(y_pred_all)
y_true_all = np.array(y_true_all)

# Calculate evaluation metrics
precision = precision_score(y_true_all, y_pred_all)
recall = recall_score(y_true_all, y_pred_all)
f1 = f1_score(y_true_all, y_pred_all)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.1011
Recall: 1.0000
F1 Score: 0.1837
