In [1]:
import pandas as pd
import os
import glob
import re
import numpy as np
from pathlib import Path
import itertools

In [2]:
# Locations of the three rule tables, all below the '5_analysis' directory.
all_rules_crm_path = os.path.join('5_analysis', 'random', 'combined_sorted_all.csv')
all_rules_dt_path = os.path.join('5_analysis', 'dt', 'rules_dt.csv')
all_rules_ripperk_path = os.path.join('5_analysis', 'ripperk', 'rules_ripperk.csv')

# Read each comma-separated file into its own DataFrame.
all_rules_crm = pd.read_csv(all_rules_crm_path, sep=',')
all_rules_dt = pd.read_csv(all_rules_dt_path, sep=',')
all_rules_ripperk = pd.read_csv(all_rules_ripperk_path, sep=',')

In [3]:
# Preview the DT rule table as loaded (replace with all_rules_crm to inspect the CRM table instead).
all_rules_dt

Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule
0,BPI15A,declare,baseline,[01_HOOFD_011 = 0] --> Label
1,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0] --> Label
2,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1] --> Label
3,BPI15A,declare,bs_data,[01_HOOFD_011 = 0] --> Label
4,BPI15A,declare,bs_data,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0] --> Label
...,...,...,...,...
2059,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2060,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2061,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2062,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...


In [4]:
# For every 'Labeling' value in the CRM table, count how many distinct
# 'Feature Encoding' values occur with it.
unique_counts = all_rules_crm.groupby("Labeling")["Feature Encoding"].nunique()
unique_counts = unique_counts.reset_index(name="unique_encodings")

display(unique_counts)


Unnamed: 0,Labeling,unique_encodings
0,BPI15A_decl2,1
1,BPI15A_mr_tr,1
2,BPI15A_payload_560925,9
3,sepsis_decl,8
4,sepsis_mr_tr,13
5,sepsis_payload2,4
6,traffic_decl3,15
7,traffic_mr_tr,15
8,traffic_payload_Pay36,15


In [5]:
# --- Normalize 'labeling' values and keep only desired columns ---

def _normalize_labeling_column(df: pd.DataFrame) -> None:
    """In-place normalize the labeling/Labeling column by substring:
       'decl' -> 'declare', 'mr_tr' -> 'sequential', 'payload' -> 'payload'."""
    for col in ("labeling", "Labeling"):
        if col in df.columns:
            lower = df[col].astype(str).str.lower()
            df[col] = np.select(
                [
                    lower.str.contains("decl", na=False),
                    lower.str.contains("mr_tr", na=False),
                    lower.str.contains("payload", na=False),
                ],
                ["declare", "sequential", "payload"],
                default=df[col],
            )

def _canonicalize_and_subset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rename common variants to canonical column names and return only:
    ['Dataset','Labeling','Feature Encoding','Rule'] if present.
    """
    rename_map = {}
    for c in df.columns:
        cl = c.lower()
        if cl == "dataset":
            rename_map[c] = "Dataset"
        elif cl == "labeling":
            rename_map[c] = "Labeling"
        elif cl in {"feature encoding", "encoding"}:
            rename_map[c] = "Feature Encoding"
        elif cl == "rule":
            rename_map[c] = "Rule"
    df2 = df.rename(columns=rename_map)

    keep = [c for c in ["Dataset", "Labeling", "Feature Encoding", "Rule"] if c in df2.columns]
    return df2[keep].copy() if keep else df2.copy()

# Apply to all three rule tables. Each frame is looked up by name in globals()
# so that the module-level variable itself can be rebound to the cleaned copy.
for name in ("all_rules_crm", "all_rules_dt", "all_rules_ripperk"):
    df = globals()[name]
    _normalize_labeling_column(df)  # mutates the loaded frame in place
    df = _canonicalize_and_subset(df)  # new frame holding only the canonical columns
    globals()[name] = df

# Filter DT & RIPPERK to traffic only, then drop 'Dataset'
# for name in ("all_rules_dt", "all_rules_ripperk"):
#     df = globals()[name]
#     if "Dataset" in df.columns:
#         mask = df["Dataset"].astype(str).str.lower().eq("traffic")
#         df = df.loc[mask].drop(columns=["Dataset"])
#     globals()[name] = df

# (Optional) quick peek at the three cleaned tables, rendered one after another
display(all_rules_crm, all_rules_dt, all_rules_ripperk)


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule
0,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...
1,BPI15A,declare,payload,"['length_binned_(44.0, 101.0]'] --> Label"
2,BPI15A,declare,payload,"['length_binned_(1.999, 44.0]'] --> !Label"
3,BPI15A,declare,payload,"['org:resource|first|literal_binned_(560912.0,..."
4,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...
...,...,...,...,...
604,traffic,payload,seq_combined_data,"['amount|first|continuous_binned_(-0.001, 33.6..."
605,traffic,payload,seq_combined_data,"['mr[Create Fine-complete, Send Fine-complete,..."
606,traffic,payload,seq_combined_data,"['mr[Create Fine-complete, Send Fine-complete]..."
607,traffic,payload,seq_combined_data,['mr[Send for Credit Collection-complete]_0.0'...


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule
0,BPI15A,declare,baseline,[01_HOOFD_011 = 0] --> Label
1,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0] --> Label
2,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1] --> Label
3,BPI15A,declare,bs_data,[01_HOOFD_011 = 0] --> Label
4,BPI15A,declare,bs_data,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0] --> Label
...,...,...,...,...
2059,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2060,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2061,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2062,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule
0,BPI15A,declare,baseline,[01_HOOFD_011 = 1.0] --> Label
1,BPI15A,declare,bs_data,[01_HOOFD_011 = 1.0] --> Label
2,BPI15A,declare,bs_dwd,"[alternate_precedence:(01_HOOFD_011,01_HOOFD_0..."
3,BPI15A,declare,dec_data,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1...."
4,BPI15A,declare,dec_dwd,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1...."
...,...,...,...,...
438,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
439,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
440,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
441,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...


## Splitting CRM

In [6]:
# ---------- 1) Extract exact LHS and RHS from all_rules_crm['Rule'] ----------
# Adds two columns: LHS_features (the exact text before '-->') and RHS_label (1 for 'Label', 0 for '!Label')

def extract_lhs_exact(rule_str: str) -> str:
    """Return the text in front of the first '-->' with quotes/brackets untouched.

    Whitespace directly before the arrow is not part of the result; leading
    whitespace is kept. If the pattern does not match (e.g. the rule contains
    no arrow), the full string form of `rule_str` is returned.
    """
    text = str(rule_str)
    found = re.search(r"^(.*?)(?=\s*-->)", text)
    if found is None:
        return text
    return found.group(1)

def parse_rhs_label(rule_str: str):
    """Map the rule's right-hand side to 1 ('Label') or 0 ('!Label').

    Returns None when no '--> Label' / '--> !Label' is found in the rule.
    """
    found = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if found is None:
        return None
    # The group can only be 'Label' or '!Label'.
    return 0 if found.group(1) == "!Label" else 1

# Work on a copy of the CRM table, extended with the parsed rule sides.
crm_df = all_rules_crm.assign(
    LHS_features=all_rules_crm['Rule'].apply(extract_lhs_exact),
    RHS_label=all_rules_crm['Rule'].apply(parse_rhs_label),
)

# ---------- 2) Split LHS into up to 3 features ----------
def _find_outer_brackets_span(text: str):
    """Return (start_idx, end_idx) of the outermost [...] in `text`."""
    s = str(text)
    start = s.find('[')
    if start < 0:
        return None, None

    depth = 0
    in_s = in_d = esc = False
    end = None
    for i, ch in enumerate(s[start:], start):
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue

        if in_s:
            if ch == "'":
                in_s = False
            continue
        if in_d:
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            in_s = True
            continue
        if ch == '"':
            in_d = True
            continue

        if ch == '[':
            depth += 1
            continue
        if ch == ']':
            depth -= 1
            if depth == 0:
                end = i
                break
    return (start, end)

def _split_top_level_commas(content: str):
    """Split `content` on commas that are outside quotes."""
    parts, curr = [], ""
    in_s = in_d = esc = False
    for ch in content:
        if esc:
            curr += ch
            esc = False
            continue
        if ch == '\\':
            curr += ch
            esc = True
            continue

        if in_s:
            curr += ch
            if ch == "'":
                in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            curr += ch
            in_s = True
            continue
        if ch == '"':
            curr += ch
            in_d = True
            continue

        if ch == ',':
            parts.append(curr.strip())
            curr = ""
        else:
            curr += ch
    parts.append(curr.strip())
    return parts

def _strip_one_layer_quotes(s: str):
    """Remove a single layer of outer quotes if present; keep inner brackets intact."""
    s = s.strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

def split_lhs_items(lhs_text: str):
    """
    Turn an LHS such as "['A', 'B', 'C']" (the text before '-->') into the
    list ['A', 'B', 'C'], i.e. the items without the outer brackets/quotes.
    An LHS without a complete [...] yields an empty list.
    """
    text = str(lhs_text)
    open_idx, close_idx = _find_outer_brackets_span(text)
    if open_idx is None or close_idx is None:
        return []

    items = []
    # Everything strictly between the outer brackets, cut at top-level commas.
    for piece in _split_top_level_commas(text[open_idx + 1:close_idx]):
        if piece == "":
            continue
        items.append(_strip_one_layer_quotes(piece).strip())
    return items

def _pad3(items):
    items = items[:3]
    return items + [""] * (3 - len(items))

# Split every LHS into its items and pad/cut the result to exactly three feature columns.
lhs_split = crm_df['LHS_features'].apply(split_lhs_items).apply(_pad3)
lhs_df = pd.DataFrame(lhs_split.tolist(), columns=['feature_1_lhs','feature_2_lhs','feature_3_lhs'])

# ---------- 3) Final expanded table ----------
cols_present = [c for c in ['Dataset','Labeling','Feature Encoding','Rule','LHS_features','RHS_label'] if c in crm_df.columns]

# Both sides are re-indexed 0..n-1 so the column-wise concat lines rows up by position.
all_rules_crm_expanded = pd.concat(
    [crm_df[cols_present].reset_index(drop=True),
     lhs_df.reset_index(drop=True)],
    axis=1
).reset_index(drop=True)

# Optional: sort for readability if keys exist.
# The encoding column is called 'Feature Encoding' in this table (there is no 'Encoding'
# column), so that name must be used for the third sort key to take effect; this also
# matches the ordering applied to the DT / RIPPERk expanded tables.
sort_keys = [c for c in ['Dataset','Labeling','Feature Encoding'] if c in all_rules_crm_expanded.columns]
if sort_keys:
    all_rules_crm_expanded = all_rules_crm_expanded.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)

# Show result
all_rules_crm_expanded

Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs
0,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...,['monitoringResource|first|literal_binned_(560...,1,monitoringResource|first|literal_binned_(56092...,,
1,BPI15A,declare,payload,"['length_binned_(44.0, 101.0]'] --> Label","['length_binned_(44.0, 101.0]']",1,"length_binned_(44.0, 101.0]",,
2,BPI15A,declare,payload,"['length_binned_(1.999, 44.0]'] --> !Label","['length_binned_(1.999, 44.0]']",0,"length_binned_(1.999, 44.0]",,
3,BPI15A,declare,payload,"['org:resource|first|literal_binned_(560912.0,...","['org:resource|first|literal_binned_(560912.0,...",0,"org:resource|first|literal_binned_(560912.0, 1...",,
4,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...,['monitoringResource|first|literal_binned_(560...,0,monitoringResource|first|literal_binned_(56046...,,
...,...,...,...,...,...,...,...,...,...
604,traffic,sequential,seq_combined_data,"['mr[Create Fine-complete, Send Fine-complete,...","['mr[Create Fine-complete, Send Fine-complete,...",0,"mr[Create Fine-complete, Send Fine-complete, I...",,
605,traffic,sequential,seq_combined_data,['mr[Add penalty-complete]_1.0'] --> Label,['mr[Add penalty-complete]_1.0'],1,mr[Add penalty-complete]_1.0,,
606,traffic,sequential,seq_combined_data,['paymentAmount|first|continuous_binned_(-0.00...,['paymentAmount|first|continuous_binned_(-0.00...,0,"paymentAmount|first|continuous_binned_(-0.001,...",,
607,traffic,sequential,seq_combined_data,"['paymentAmount|first|continuous_binned_(35.0,...","['paymentAmount|first|continuous_binned_(35.0,...",1,"paymentAmount|first|continuous_binned_(35.0, 1...",,


In [7]:
# ---------- Expand DT and RIPPERk rules: support up to 15 LHS features ----------

def extract_lhs_exact(rule_str: str) -> str:
    """Return the whitespace-trimmed text in front of the first '-->'.

    If the pattern does not match (e.g. the rule contains no arrow), the
    trimmed string form of `rule_str` is returned instead.

    Note: this redefines the function of the same name declared earlier in
    the notebook; this version additionally strips the result.
    """
    text = str(rule_str)
    found = re.search(r"^(.*?)(?=\s*-->)", text)
    lhs = text if found is None else found.group(1)
    return lhs.strip()

def parse_rhs_label(rule_str: str):
    """Map the rule's right-hand side to 1 ('Label') or 0 ('!Label'); None if absent.

    Note: identical in behavior to the function of the same name declared
    earlier in the notebook, which this definition replaces.
    """
    found = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if found is None:
        return None
    # int(True) == 1 for 'Label', int(False) == 0 for '!Label'.
    return int(found.group(1) == "Label")

# Separator between the conditions of a DT/RIPPERK rule: the logical-and sign '∧' (U+2227),
# with ASCII '&' accepted too. Whitespace around the separator is consumed by the pattern.
_AND_SPLIT_RE = re.compile(r"\s*(?:∧|&)\s*")

def split_lhs_items_dt(lhs_text: str):
    """
    Split a DT/RIPPERK left-hand side of the form
      [feature1 ∧ feature2 ∧ feature3 ∧ ...]
    into its individual, trimmed conditions.

    One pair of enclosing [ ] is removed when present, the remainder is cut at
    every '∧' / '&', and empty pieces are discarded. An empty LHS gives [].
    """
    text = str(lhs_text).strip()
    is_bracketed = len(text) >= 2 and text[0] == '[' and text[-1] == ']'
    if is_bracketed:
        text = text[1:-1]
    if not text:
        return []
    return [piece.strip() for piece in _AND_SPLIT_RE.split(text) if piece.strip()]

def _padN(items, n=15):
    items = items[:n]
    return items + [""] * (n - len(items))

def _expand_df_with_lhs_rhs(df_in: pd.DataFrame, name_hint: str, max_features: int = 15):
    """
    Given a dataframe with at least a 'Rule' column, produce an expanded copy with:
      - LHS_features: exact text before -->
      - RHS_label: {1, 0, None}
      - feature_1_lhs ... feature_{max_features}_lhs (LHS split on ∧ / & for DT/RIPPERK,
        padded with '' when a rule has fewer conditions)
    Keeps any of ['Dataset','Labeling','Feature Encoding','Rule'] that exist and
    sorts the result by whichever of ['Dataset','Labeling','Feature Encoding'] are present.

    Parameters
    ----------
    df_in : input rule table; it is not modified.
    name_hint : label used in error and warning messages.
    max_features : number of feature_*_lhs columns to create.

    Raises
    ------
    KeyError if `df_in` has no 'Rule' column.

    Warns
    -----
    UserWarning when at least one rule has more than `max_features` conditions,
    because the surplus conditions do not fit into the feature_*_lhs columns
    (they remain visible in LHS_features).
    """
    import warnings  # local import keeps this block self-contained

    if "Rule" not in df_in.columns:
        raise KeyError(f"{name_hint}: expected a 'Rule' column.")

    df = df_in.copy()
    df["LHS_features"] = df["Rule"].apply(extract_lhs_exact)
    df["RHS_label"]    = df["Rule"].apply(parse_rhs_label)

    lhs_items = df["LHS_features"].apply(split_lhs_items_dt)

    # Make the truncation visible instead of silently losing conditions.
    n_truncated = int((lhs_items.apply(len) > max_features).sum())
    if n_truncated:
        warnings.warn(
            f"{name_hint}: {n_truncated} rule(s) have more than {max_features} LHS features; "
            f"only the first {max_features} are kept in the feature_*_lhs columns.",
            stacklevel=2,
        )

    lhs_split = lhs_items.apply(lambda xs: _padN(xs, max_features))
    feat_cols = [f"feature_{i}_lhs" for i in range(1, max_features+1)]
    lhs_df = pd.DataFrame(lhs_split.tolist(), columns=feat_cols)

    # LHS_features / RHS_label were just created, so the membership test covers them too.
    keep = [c for c in ["Dataset","Labeling","Feature Encoding","Rule","LHS_features","RHS_label"] if c in df.columns]
    # Re-index both parts 0..n-1 so the column-wise concat pairs rows by position.
    expanded = pd.concat([df[keep].reset_index(drop=True), lhs_df.reset_index(drop=True)], axis=1)

    sort_keys = [c for c in ["Dataset","Labeling","Feature Encoding"] if c in expanded.columns]
    if sort_keys:
        expanded = expanded.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)
    return expanded

# Expand the DT and RIPPERk rule tables, allowing up to 15 LHS feature columns each
all_rules_dt_expanded = _expand_df_with_lhs_rhs(all_rules_dt, "DT", max_features=15)
all_rules_ripperk_expanded = _expand_df_with_lhs_rhs(all_rules_ripperk, "RIPPERk", max_features=15)

# Quick peek at both results, rendered one after another
display(all_rules_dt_expanded, all_rules_ripperk_expanded)


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs,feature_4_lhs,...,feature_6_lhs,feature_7_lhs,feature_8_lhs,feature_9_lhs,feature_10_lhs,feature_11_lhs,feature_12_lhs,feature_13_lhs,feature_14_lhs,feature_15_lhs
0,BPI15A,declare,baseline,[01_HOOFD_011 = 0] --> Label,[01_HOOFD_011 = 0],1,01_HOOFD_011 = 0,,,,...,,,,,,,,,,
1,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0],1,01_HOOFD_011 = 1,01_HOOFD_099 = 0,,,...,,,,,,,,,,
2,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1],1,01_HOOFD_011 = 1,01_HOOFD_099 = 1,,,...,,,,,,,,,,
3,BPI15A,declare,bs_data,[01_HOOFD_011 = 0] --> Label,[01_HOOFD_011 = 0],1,01_HOOFD_011 = 0,,,,...,,,,,,,,,,
4,BPI15A,declare,bs_data,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0],1,01_HOOFD_011 = 1,01_HOOFD_494a = 0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 0,mra[Receive Result Appeal from Prefecture-comp...,"article|first|discrete_binned_(142.0, 157.0] = 1",,,,,,,
2060,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 0,mra[Receive Result Appeal from Prefecture-comp...,,,,,,,,
2061,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 1,,,,,,,,,
2062,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,,,,,,,,,,


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs,feature_4_lhs,...,feature_6_lhs,feature_7_lhs,feature_8_lhs,feature_9_lhs,feature_10_lhs,feature_11_lhs,feature_12_lhs,feature_13_lhs,feature_14_lhs,feature_15_lhs
0,BPI15A,declare,baseline,[01_HOOFD_011 = 1.0] --> Label,[01_HOOFD_011 = 1.0],1,01_HOOFD_011 = 1.0,,,,...,,,,,,,,,,
1,BPI15A,declare,bs_data,[01_HOOFD_011 = 1.0] --> Label,[01_HOOFD_011 = 1.0],1,01_HOOFD_011 = 1.0,,,,...,,,,,,,,,,
2,BPI15A,declare,bs_dwd,"[alternate_precedence:(01_HOOFD_011,01_HOOFD_0...","[alternate_precedence:(01_HOOFD_011,01_HOOFD_0...",1,"alternate_precedence:(01_HOOFD_011,01_HOOFD_01...",,,,...,,,,,,,,,,
3,BPI15A,declare,dec_data,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1....","[choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0]",1,"choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0",,,,...,,,,,,,,,,
4,BPI15A,declare,dec_dwd,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1....","[choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0]",1,"choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,traffic,sequential,seq_combined_data,"[length_binned_(1.999, 5.0] = 0.0 ∧ paymentAmo...","[length_binned_(1.999, 5.0] = 0.0 ∧ paymentAmo...",1,"length_binned_(1.999, 5.0] = 0.0","paymentAmount|first|continuous_binned_(-0.001,...",,,...,,,,,,,,,,
439,traffic,sequential,seq_combined_data,"[mr[Payment-complete]_binned_(-0.001, 1.0] = 0...","[mr[Payment-complete]_binned_(-0.001, 1.0] = 0...",1,"mr[Payment-complete]_binned_(-0.001, 1.0] = 0.0",mr[Add penalty-complete] = 1.0,,,...,,,,,,,,,,
440,traffic,sequential,seq_combined_data,[mr[Receive Result Appeal from Prefecture-comp...,[mr[Receive Result Appeal from Prefecture-comp...,1,mr[Receive Result Appeal from Prefecture-compl...,"paymentAmount|first|continuous_binned_(-0.001,...",,,...,,,,,,,,,,
441,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 1.0",,,,...,,,,,,,,,,


## Direct rule comparison

In [8]:
# Show the three expanded rule tables (DT, RIPPERk, CRM) one after another.
display(all_rules_dt_expanded, all_rules_ripperk_expanded, all_rules_crm_expanded)

Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs,feature_4_lhs,...,feature_6_lhs,feature_7_lhs,feature_8_lhs,feature_9_lhs,feature_10_lhs,feature_11_lhs,feature_12_lhs,feature_13_lhs,feature_14_lhs,feature_15_lhs
0,BPI15A,declare,baseline,[01_HOOFD_011 = 0] --> Label,[01_HOOFD_011 = 0],1,01_HOOFD_011 = 0,,,,...,,,,,,,,,,
1,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0],1,01_HOOFD_011 = 1,01_HOOFD_099 = 0,,,...,,,,,,,,,,
2,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1],1,01_HOOFD_011 = 1,01_HOOFD_099 = 1,,,...,,,,,,,,,,
3,BPI15A,declare,bs_data,[01_HOOFD_011 = 0] --> Label,[01_HOOFD_011 = 0],1,01_HOOFD_011 = 0,,,,...,,,,,,,,,,
4,BPI15A,declare,bs_data,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0] --> Label,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0],1,01_HOOFD_011 = 1,01_HOOFD_494a = 0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 0,mra[Receive Result Appeal from Prefecture-comp...,"article|first|discrete_binned_(142.0, 157.0] = 1",,,,,,,
2060,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 0,mra[Receive Result Appeal from Prefecture-comp...,,,,,,,,
2061,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,mr[Send for Credit Collection-complete] = 1,,,,,,,,,
2062,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 0","length_binned_(5.0, 20.0] = 1","paymentAmount|first|continuous_binned_(-0.001,...","mra[Payment-complete, Create Fine-complete, Se...",...,,,,,,,,,,


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs,feature_4_lhs,...,feature_6_lhs,feature_7_lhs,feature_8_lhs,feature_9_lhs,feature_10_lhs,feature_11_lhs,feature_12_lhs,feature_13_lhs,feature_14_lhs,feature_15_lhs
0,BPI15A,declare,baseline,[01_HOOFD_011 = 1.0] --> Label,[01_HOOFD_011 = 1.0],1,01_HOOFD_011 = 1.0,,,,...,,,,,,,,,,
1,BPI15A,declare,bs_data,[01_HOOFD_011 = 1.0] --> Label,[01_HOOFD_011 = 1.0],1,01_HOOFD_011 = 1.0,,,,...,,,,,,,,,,
2,BPI15A,declare,bs_dwd,"[alternate_precedence:(01_HOOFD_011,01_HOOFD_0...","[alternate_precedence:(01_HOOFD_011,01_HOOFD_0...",1,"alternate_precedence:(01_HOOFD_011,01_HOOFD_01...",,,,...,,,,,,,,,,
3,BPI15A,declare,dec_data,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1....","[choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0]",1,"choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0",,,,...,,,,,,,,,,
4,BPI15A,declare,dec_dwd,"[choice:('01_HOOFD_010', '01_HOOFD_011') = -1....","[choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0]",1,"choice:('01_HOOFD_010', '01_HOOFD_011') = -1.0",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,traffic,sequential,seq_combined_data,"[length_binned_(1.999, 5.0] = 0.0 ∧ paymentAmo...","[length_binned_(1.999, 5.0] = 0.0 ∧ paymentAmo...",1,"length_binned_(1.999, 5.0] = 0.0","paymentAmount|first|continuous_binned_(-0.001,...",,,...,,,,,,,,,,
439,traffic,sequential,seq_combined_data,"[mr[Payment-complete]_binned_(-0.001, 1.0] = 0...","[mr[Payment-complete]_binned_(-0.001, 1.0] = 0...",1,"mr[Payment-complete]_binned_(-0.001, 1.0] = 0.0",mr[Add penalty-complete] = 1.0,,,...,,,,,,,,,,
440,traffic,sequential,seq_combined_data,[mr[Receive Result Appeal from Prefecture-comp...,[mr[Receive Result Appeal from Prefecture-comp...,1,mr[Receive Result Appeal from Prefecture-compl...,"paymentAmount|first|continuous_binned_(-0.001,...",,,...,,,,,,,,,,
441,traffic,sequential,seq_combined_data,"[mra[Payment-complete, Add penalty-complete] =...","[mra[Payment-complete, Add penalty-complete] =...",1,"mra[Payment-complete, Add penalty-complete] = 1.0",,,,...,,,,,,,,,,


Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs
0,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...,['monitoringResource|first|literal_binned_(560...,1,monitoringResource|first|literal_binned_(56092...,,
1,BPI15A,declare,payload,"['length_binned_(44.0, 101.0]'] --> Label","['length_binned_(44.0, 101.0]']",1,"length_binned_(44.0, 101.0]",,
2,BPI15A,declare,payload,"['length_binned_(1.999, 44.0]'] --> !Label","['length_binned_(1.999, 44.0]']",0,"length_binned_(1.999, 44.0]",,
3,BPI15A,declare,payload,"['org:resource|first|literal_binned_(560912.0,...","['org:resource|first|literal_binned_(560912.0,...",0,"org:resource|first|literal_binned_(560912.0, 1...",,
4,BPI15A,declare,payload,['monitoringResource|first|literal_binned_(560...,['monitoringResource|first|literal_binned_(560...,0,monitoringResource|first|literal_binned_(56046...,,
...,...,...,...,...,...,...,...,...,...
604,traffic,sequential,seq_combined_data,"['mr[Create Fine-complete, Send Fine-complete,...","['mr[Create Fine-complete, Send Fine-complete,...",0,"mr[Create Fine-complete, Send Fine-complete, I...",,
605,traffic,sequential,seq_combined_data,['mr[Add penalty-complete]_1.0'] --> Label,['mr[Add penalty-complete]_1.0'],1,mr[Add penalty-complete]_1.0,,
606,traffic,sequential,seq_combined_data,['paymentAmount|first|continuous_binned_(-0.00...,['paymentAmount|first|continuous_binned_(-0.00...,0,"paymentAmount|first|continuous_binned_(-0.001,...",,
607,traffic,sequential,seq_combined_data,"['paymentAmount|first|continuous_binned_(35.0,...","['paymentAmount|first|continuous_binned_(35.0,...",1,"paymentAmount|first|continuous_binned_(35.0, 1...",,


In [9]:
import re
import pandas as pd

# --- identify feature columns ------------------------------------------------
def feature_cols(df, prefix='feature_', suffix='_lhs'):
    """List the columns of `df` named <prefix><digits><suffix>, in column order.

    `prefix` and `suffix` are matched literally (regex metacharacters are escaped).
    """
    is_feature = re.compile(rf'^{re.escape(prefix)}\d+{re.escape(suffix)}$').match
    return [name for name in df.columns if is_feature(name)]

# --- normalizers -------------------------------------------------------------
# For DT & RIPPERk: drop trailing " <op> <something>" at the end of the string
# Examples: "… Data <= 0.0" -> "… Data", "… X = yes" -> "… X"
_OP_TAIL = re.compile(r'\s*(?:<=|>=|==|=|<|>)\s*.+\s*$', flags=re.IGNORECASE)

def _norm_dt_ripperk_token(x):
    if pd.isna(x):
        return x
    s = str(x).strip()
    s = _OP_TAIL.sub('', s)
    return s

# For CRM: drop trailing "_<scalar>" (number or boolean) at EOL; keep binned "(...)" or "(...]" intact
# Examples: "…')_1.0" -> "…')", "…')_True" -> "…')"
# Will NOT match strings ending with "…(a,b]" because they end in "]", not in "_<scalar>"
_CRM_SCALAR_SUFFIX = re.compile(r'_(?:[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?|true|false)$', flags=re.IGNORECASE)

def _norm_crm_token(x):
    if pd.isna(x):
        return x
    s = str(x).strip()
    s = _CRM_SCALAR_SUFFIX.sub('', s)
    return s

def normalize_df_features_inplace(df, mode: str):
    """
    Normalize every feature_<n>_lhs column of ``df`` in place.

    mode in {"dt", "ripperk", "crm"} (case-insensitive):
      - "dt" / "ripperk" apply ``_norm_dt_ripperk_token`` to each cell,
      - "crm" applies ``_norm_crm_token``.
    Cells that normalize to the empty string are replaced by ``pd.NA``.

    Returns ``(changed, total)`` where ``total`` is the number of non-missing
    tokens before normalization and ``changed`` is how many of THOSE tokens
    were altered (so ``changed <= total``). Returns ``(0, 0)`` when ``df`` has
    no feature_<n>_lhs columns.

    Raises ValueError when ``mode`` is not one of the supported values.
    """
    cols = feature_cols(df)
    if not cols:
        return 0, 0

    mode_key = mode.lower()
    if mode_key in {"dt", "ripperk"}:
        fn = _norm_dt_ripperk_token
    elif mode_key == "crm":
        fn = _norm_crm_token
    else:
        raise ValueError("mode must be one of: 'dt', 'ripperk', 'crm'")

    changed = 0
    total = 0
    for c in cols:
        orig = df[c].astype("object")
        norm = orig.map(fn)
        # treat empty strings as missing (optional but handy)
        norm = norm.mask(norm == '', pd.NA)

        present = orig.notna()
        # Elementwise `!=` yields True whenever either side is missing, so a cell
        # that was missing before AND after would be reported as "changed".
        # Only tokens that actually existed can have been normalized.
        changed += int(((orig != norm) & present).sum())
        total += int(present.sum())
        df[c] = norm
    return changed, total

# --- normalize the feature tokens of the three rule frames (in place) --------
dt_changed, dt_total = normalize_df_features_inplace(all_rules_dt_expanded, mode="dt")
rip_changed, rip_total = normalize_df_features_inplace(all_rules_ripperk_expanded, mode="ripperk")
crm_changed, crm_total = normalize_df_features_inplace(all_rules_crm_expanded, mode="crm")

# One report line per rule source: "<changed> / <total>" tokens.
print(
    f"DT:    normalized {dt_changed} / {dt_total} feature tokens",
    f"RIPPERk: normalized {rip_changed} / {rip_total} feature tokens",
    f"CRM:   normalized {crm_changed} / {crm_total} feature tokens",
    sep="\n",
)


DT:    normalized 30960 / 30960 feature tokens
RIPPERk: normalized 6645 / 6645 feature tokens
CRM:   normalized 1594 / 1827 feature tokens


### Subsetting

In [10]:
# import re
# import pandas as pd

# KEY = ['Dataset', 'Labeling', 'Feature Encoding']

# # --- helpers ---------------------------------------------------------------
# def _feature_cols(df, prefix='feature_', suffix='_lhs'):
#     pat = re.compile(rf'^{re.escape(prefix)}\d+{re.escape(suffix)}$')
#     return [c for c in df.columns if pat.match(c)]

# def _lhs_set_from_row(row, feat_cols, lower=True, strip=True):
#     vals = []
#     for c in feat_cols:
#         v = row.get(c)
#         if pd.isna(v):
#             continue
#         s = str(v)
#         if strip:
#             s = s.strip()
#         if s == '':
#             continue
#         if lower:
#             s = s.lower()
#         vals.append(s)
#     return frozenset(vals)

# def df_to_rule_sets(df, key_cols=KEY, lower=True, dedup=True):
#     feat_cols = _feature_cols(df)
#     if not feat_cols:
#         raise ValueError("No feature_*_lhs columns found in df.")
#     tmp = df.copy()
#     tmp['__lhs_set__'] = tmp.apply(lambda r: _lhs_set_from_row(r, feat_cols, lower=lower), axis=1)
#     tmp = tmp[tmp['__lhs_set__'].map(len) > 0]  # keep non-empty rules
#     groups = {}
#     for key, g in tmp.groupby(key_cols, dropna=False):
#         sets = list(g['__lhs_set__'])
#         if dedup:
#             sets = list(set(sets))  # avoid counting identical rules twice
#         groups[key] = sets
#     return groups

# def count_subset_matches(A_sets, B_sets):
#     """# of rules in A whose LHS-set is a subset of at least one rule in B."""
#     if not A_sets or not B_sets:
#         return 0
#     B_list = list(B_sets)  # re-use for scans
#     return sum(1 for a in A_sets if any(a.issubset(b) for b in B_list))

# # --- main summary ----------------------------------------------------------
# def build_match_summary(dt_df, ripperk_df, crm_df):
#     dt_map  = df_to_rule_sets(dt_df)
#     rip_map = df_to_rule_sets(ripperk_df)
#     crm_map = df_to_rule_sets(crm_df)

#     all_keys = set(dt_map) | set(rip_map) | set(crm_map)
#     rows = []

#     for key in sorted(all_keys):
#         dt_sets  = dt_map.get(key, [])
#         rip_sets = rip_map.get(key, [])
#         crm_sets = crm_map.get(key, [])

#         n_dt, n_rip, n_crm = len(dt_sets), len(rip_sets), len(crm_sets)

#         # Pairwise (directional) subset matches
#         dt_in_rip  = count_subset_matches(dt_sets,  rip_sets)
#         dt_in_crm  = count_subset_matches(dt_sets,  crm_sets)
#         rip_in_dt  = count_subset_matches(rip_sets, dt_sets)
#         rip_in_crm = count_subset_matches(rip_sets, crm_sets)
#         crm_in_dt  = count_subset_matches(crm_sets, dt_sets)
#         crm_in_rip = count_subset_matches(crm_sets, rip_sets)

#         # In both of the other models
#         dt_in_both  = sum(1 for s in dt_sets  if any(s.issubset(b) for b in rip_sets) and any(s.issubset(c) for c in crm_sets))
#         rip_in_both = sum(1 for s in rip_sets if any(s.issubset(b) for b in dt_sets)  and any(s.issubset(c) for c in crm_sets))
#         crm_in_both = sum(1 for s in crm_sets if any(s.issubset(b) for b in dt_sets)  and any(s.issubset(c) for c in rip_sets))

#         # Unique to a model (not matched by either of the other two)
#         dt_unique  = sum(1 for s in dt_sets  if not any(s.issubset(b) for b in rip_sets) and not any(s.issubset(c) for c in crm_sets))
#         rip_unique = sum(1 for s in rip_sets if not any(s.issubset(b) for b in dt_sets)  and not any(s.issubset(c) for c in crm_sets))
#         crm_unique = sum(1 for s in crm_sets if not any(s.issubset(b) for b in dt_sets)  and not any(s.issubset(c) for c in rip_sets))

#         row = {
#             'Dataset': key[0], 'Labeling': key[1], 'Feature Encoding': key[2],
#             'total_dt': n_dt, 'total_ripperk': n_rip, 'total_crm': n_crm,

#             'dt→ripperk': dt_in_rip,     'dt→ripperk_%':  (dt_in_rip / n_dt * 100) if n_dt else 0.0,
#             'dt→crm':     dt_in_crm,     'dt→crm_%':      (dt_in_crm / n_dt * 100) if n_dt else 0.0,

#             'ripperk→dt': rip_in_dt,     'ripperk→dt_%':  (rip_in_dt / n_rip * 100) if n_rip else 0.0,
#             'ripperk→crm':rip_in_crm,    'ripperk→crm_%': (rip_in_crm / n_rip * 100) if n_rip else 0.0,

#             'crm→dt':     crm_in_dt,     'crm→dt_%':      (crm_in_dt / n_crm * 100) if n_crm else 0.0,
#             'crm→ripperk':crm_in_rip,    'crm→ripperk_%': (crm_in_rip / n_crm * 100) if n_crm else 0.0,

#             'dt→both(ripperk,crm)':  dt_in_both,  'dt→both_%':  (dt_in_both  / n_dt  * 100) if n_dt  else 0.0,
#             'ripperk→both(dt,crm)':  rip_in_both, 'ripperk→both_%': (rip_in_both / n_rip * 100) if n_rip else 0.0,
#             'crm→both(dt,ripperk)':  crm_in_both, 'crm→both_%': (crm_in_both / n_crm * 100) if n_crm else 0.0,

#             'unique_dt': dt_unique, 'unique_ripperk': rip_unique, 'unique_crm': crm_unique,
#         }
#         rows.append(row)

#     summary = pd.DataFrame(rows).sort_values(KEY).reset_index(drop=True)
#     pct_cols = [c for c in summary.columns if c.endswith('_%')]
#     summary[pct_cols] = summary[pct_cols].round(1)
#     display(summary)
#     return summary

# # Run:
# summary = build_match_summary(all_rules_dt_expanded, all_rules_ripperk_expanded, all_rules_crm_expanded)


### Symmetric

In [11]:
# import re
# import pandas as pd

# KEY = ['Dataset', 'Labeling', 'Feature Encoding']

# # --- helpers (same as before) -----------------------------------------------
# def _feature_cols(df, prefix='feature_', suffix='_lhs'):
#     pat = re.compile(rf'^{re.escape(prefix)}\d+{re.escape(suffix)}$')
#     return [c for c in df.columns if pat.match(c)]

# def _lhs_set_from_row(row, feat_cols, lower=True, strip=True):
#     vals = []
#     for c in feat_cols:
#         v = row.get(c)
#         if pd.isna(v):
#             continue
#         s = str(v)
#         if strip:
#             s = s.strip()
#         if s == '':
#             continue
#         if lower:
#             s = s.lower()
#         vals.append(s)
#     return frozenset(vals)

# def df_to_rule_sets(df, key_cols=KEY, lower=True, dedup=True):
#     feat_cols = _feature_cols(df)
#     if not feat_cols:
#         raise ValueError("No feature_*_lhs columns found in df.")
#     tmp = df.copy()
#     tmp['__lhs_set__'] = tmp.apply(lambda r: _lhs_set_from_row(r, feat_cols, lower=lower), axis=1)
#     tmp = tmp[tmp['__lhs_set__'].map(len) > 0]  # keep non-empty rules
#     groups = {}
#     for key, g in tmp.groupby(key_cols, dropna=False):
#         sets = list(g['__lhs_set__'])
#         if dedup:
#             sets = list(set(sets))  # avoid counting identical rules twice
#         groups[key] = sets
#     return groups

# # Directional: count rules in A that are subset of some rule in B
# def count_subset_matches(A_sets, B_sets):
#     if not A_sets or not B_sets:
#         return 0
#     B_list = list(B_sets)
#     return sum(1 for a in A_sets if any(a.issubset(b) for b in B_list))

# # Symmetric: count rules in A that have ANY rule in B with (a ⊆ b) OR (b ⊆ a)
# def _has_bisubset_match(a, B_sets):
#     return any(a.issubset(b) or b.issubset(a) for b in B_sets)

# def count_bisubset_matches(A_sets, B_sets):
#     if not A_sets or not B_sets:
#         return 0
#     return sum(1 for a in A_sets if _has_bisubset_match(a, B_sets))

# # --- main summary ------------------------------------------------------------
# def build_match_summary(dt_df, ripperk_df, crm_df):
#     dt_map  = df_to_rule_sets(dt_df)
#     rip_map = df_to_rule_sets(ripperk_df)
#     crm_map = df_to_rule_sets(crm_df)

#     all_keys = set(dt_map) | set(rip_map) | set(crm_map)
#     rows = []

#     for key in sorted(all_keys):
#         dt_sets  = dt_map.get(key, [])
#         rip_sets = rip_map.get(key, [])
#         crm_sets = crm_map.get(key, [])

#         n_dt, n_rip, n_crm = len(dt_sets), len(rip_sets), len(crm_sets)

#         # --- existing directional (keep) ---
#         dt_in_rip  = count_subset_matches(dt_sets,  rip_sets)
#         dt_in_crm  = count_subset_matches(dt_sets,  crm_sets)
#         rip_in_dt  = count_subset_matches(rip_sets, dt_sets)
#         rip_in_crm = count_subset_matches(rip_sets, crm_sets)
#         crm_in_dt  = count_subset_matches(crm_sets, dt_sets)
#         crm_in_rip = count_subset_matches(crm_sets, rip_sets)

#         dt_in_both  = sum(1 for s in dt_sets  if any(s.issubset(b) for b in rip_sets) and any(s.issubset(c) for c in crm_sets))
#         rip_in_both = sum(1 for s in rip_sets if any(s.issubset(b) for b in dt_sets)  and any(s.issubset(c) for c in crm_sets))
#         crm_in_both = sum(1 for s in crm_sets if any(s.issubset(b) for b in dt_sets)  and any(s.issubset(c) for c in rip_sets))

#         # --- NEW: symmetric (↔) matches ---
#         dt_bi_rip  = count_bisubset_matches(dt_sets,  rip_sets)
#         dt_bi_crm  = count_bisubset_matches(dt_sets,  crm_sets)
#         rip_bi_dt  = count_bisubset_matches(rip_sets, dt_sets)
#         rip_bi_crm = count_bisubset_matches(rip_sets, crm_sets)
#         crm_bi_dt  = count_bisubset_matches(crm_sets, dt_sets)
#         crm_bi_rip = count_bisubset_matches(crm_sets, rip_sets)

#         # Symmetric "in both": rule in model X has a ↔ match in each of the other two models
#         dt_bi_both  = sum(1 for s in dt_sets  if _has_bisubset_match(s, rip_sets) and _has_bisubset_match(s, crm_sets))
#         rip_bi_both = sum(1 for s in rip_sets if _has_bisubset_match(s, dt_sets)  and _has_bisubset_match(s, crm_sets))
#         crm_bi_both = sum(1 for s in crm_sets if _has_bisubset_match(s, dt_sets)  and _has_bisubset_match(s, rip_sets))

#         # Unique under SYMMETRIC criterion: no ↔ match in either other model
#         dt_unique_sym  = sum(1 for s in dt_sets  if not _has_bisubset_match(s, rip_sets) and not _has_bisubset_match(s, crm_sets))
#         rip_unique_sym = sum(1 for s in rip_sets if not _has_bisubset_match(s, dt_sets)  and not _has_bisubset_match(s, crm_sets))
#         crm_unique_sym = sum(1 for s in crm_sets if not _has_bisubset_match(s, dt_sets)  and not _has_bisubset_match(s, rip_sets))

#         row = {
#             'Dataset': key[0], 'Labeling': key[1], 'Feature Encoding': key[2],
#             'total_dt': n_dt, 'total_ripperk': n_rip, 'total_crm': n_crm,

#             # Directional (kept for reference)
#             'dt→ripperk': dt_in_rip,     'dt→ripperk_%':  (dt_in_rip / n_dt * 100) if n_dt else 0.0,
#             'dt→crm':     dt_in_crm,     'dt→crm_%':      (dt_in_crm / n_dt * 100) if n_dt else 0.0,
#             'ripperk→dt': rip_in_dt,     'ripperk→dt_%':  (rip_in_dt / n_rip * 100) if n_rip else 0.0,
#             'ripperk→crm':rip_in_crm,    'ripperk→crm_%': (rip_in_crm / n_rip * 100) if n_rip else 0.0,
#             'crm→dt':     crm_in_dt,     'crm→dt_%':      (crm_in_dt / n_crm * 100) if n_crm else 0.0,
#             'crm→ripperk':crm_in_rip,    'crm→ripperk_%': (crm_in_rip / n_crm * 100) if n_crm else 0.0,
#             'dt→both(ripperk,crm)':  dt_in_both,  'dt→both_%':  (dt_in_both  / n_dt  * 100) if n_dt  else 0.0,
#             'ripperk→both(dt,crm)':  rip_in_both, 'ripperk→both_%': (rip_in_both / n_rip * 100) if n_rip else 0.0,
#             'crm→both(dt,ripperk)':  crm_in_both, 'crm→both_%': (crm_in_both / n_crm * 100) if n_crm else 0.0,

#             # Symmetric (↔): A matches B if A⊆B or B⊆A
#             'dt↔ripperk': dt_bi_rip,     'dt↔ripperk_%':  (dt_bi_rip / n_dt * 100) if n_dt else 0.0,
#             'dt↔crm':     dt_bi_crm,     'dt↔crm_%':      (dt_bi_crm / n_dt * 100) if n_dt else 0.0,
#             'ripperk↔dt': rip_bi_dt,     'ripperk↔dt_%':  (rip_bi_dt / n_rip * 100) if n_rip else 0.0,
#             'ripperk↔crm':rip_bi_crm,    'ripperk↔crm_%': (rip_bi_crm / n_rip * 100) if n_rip else 0.0,
#             'crm↔dt':     crm_bi_dt,     'crm↔dt_%':      (crm_bi_dt / n_crm * 100) if n_crm else 0.0,
#             'crm↔ripperk':crm_bi_rip,    'crm↔ripperk_%': (crm_bi_rip / n_crm * 100) if n_crm else 0.0,

#             'dt↔both(ripperk,crm)':  dt_bi_both,  'dt↔both_%':  (dt_bi_both  / n_dt  * 100) if n_dt  else 0.0,
#             'ripperk↔both(dt,crm)':  rip_bi_both, 'ripperk↔both_%': (rip_bi_both / n_rip * 100) if n_rip else 0.0,
#             'crm↔both(dt,ripperk)':  crm_bi_both, 'crm↔both_%': (crm_bi_both / n_crm * 100) if n_crm else 0.0,

#             # Unique under symmetric criterion
#             'unique_dt_sym': dt_unique_sym,
#             'unique_ripperk_sym': rip_unique_sym,
#             'unique_crm_sym': crm_unique_sym,
#         }
#         rows.append(row)

#     summary = pd.DataFrame(rows).sort_values(KEY).reset_index(drop=True)
#     pct_cols = [c for c in summary.columns if c.endswith('_%')]
#     summary[pct_cols] = summary[pct_cols].round(1)
#     display(summary)
#     return summary

# # Run:
# summary = build_match_summary(all_rules_dt_expanded, all_rules_ripperk_expanded, all_rules_crm_expanded)


### Perfect match

In [12]:
# import re
# import pandas as pd

# KEY = ['Dataset', 'Labeling', 'Feature Encoding']

# # --- helpers (same as before) -----------------------------------------------
# def _feature_cols(df, prefix='feature_', suffix='_lhs'):
#     pat = re.compile(rf'^{re.escape(prefix)}\d+{re.escape(suffix)}$')
#     return [c for c in df.columns if pat.match(c)]

# def _lhs_set_from_row(row, feat_cols, lower=True, strip=True):
#     vals = []
#     for c in feat_cols:
#         v = row.get(c)
#         if pd.isna(v):
#             continue
#         s = str(v)
#         if strip:
#             s = s.strip()
#         if s == '':
#             continue
#         if lower:
#             s = s.lower()
#         vals.append(s)
#     return frozenset(vals)

# def df_to_rule_sets(df, key_cols=KEY, lower=True, dedup=True):
#     feat_cols = _feature_cols(df)
#     if not feat_cols:
#         raise ValueError("No feature_*_lhs columns found in df.")
#     tmp = df.copy()
#     tmp['__lhs_set__'] = tmp.apply(lambda r: _lhs_set_from_row(r, feat_cols, lower=lower), axis=1)
#     tmp = tmp[tmp['__lhs_set__'].map(len) > 0]  # keep non-empty rules
#     groups = {}
#     for key, g in tmp.groupby(key_cols, dropna=False):
#         sets = list(g['__lhs_set__'])
#         if dedup:
#             sets = list(set(sets))  # avoid counting identical rules twice
#         groups[key] = sets
#     return groups

# # Exact-equality matches (no subsets)
# def count_exact_matches(A_sets, B_sets):
#     if not A_sets or not B_sets:
#         return 0
#     B = set(B_sets)
#     return sum(1 for a in A_sets if a in B)

# # --- main (exact only) -------------------------------------------------------
# def build_match_summary_exact(dt_df, ripperk_df, crm_df):
#     dt_map  = df_to_rule_sets(dt_df)
#     rip_map = df_to_rule_sets(ripperk_df)
#     crm_map = df_to_rule_sets(crm_df)

#     all_keys = set(dt_map) | set(rip_map) | set(crm_map)
#     rows = []

#     for key in sorted(all_keys):
#         dt_sets  = dt_map.get(key, [])
#         rip_sets = rip_map.get(key, [])
#         crm_sets = crm_map.get(key, [])

#         n_dt, n_rip, n_crm = len(dt_sets), len(rip_sets), len(crm_sets)

#         rip_set = set(rip_sets)
#         crm_set = set(crm_sets)
#         dt_set  = set(dt_sets)

#         # Pairwise exact matches (directional counts; equality is symmetric but denominators differ)
#         dt_eq_rip   = count_exact_matches(dt_sets,  rip_sets)
#         dt_eq_crm   = count_exact_matches(dt_sets,  crm_sets)
#         rip_eq_dt   = count_exact_matches(rip_sets, dt_sets)
#         rip_eq_crm  = count_exact_matches(rip_sets, crm_sets)
#         crm_eq_dt   = count_exact_matches(crm_sets, dt_sets)
#         crm_eq_rip  = count_exact_matches(crm_sets, rip_sets)

#         # Exact match present in BOTH other models
#         dt_eq_both   = sum(1 for s in dt_sets  if (s in rip_set) and (s in crm_set))
#         rip_eq_both  = sum(1 for s in rip_sets if (s in dt_set)  and (s in crm_set))
#         crm_eq_both  = sum(1 for s in crm_sets if (s in dt_set)  and (s in rip_set))

#         # Unique under EXACT criterion: not equal to any rule in either other model
#         dt_unique_exact   = sum(1 for s in dt_sets  if (s not in rip_set) and (s not in crm_set))
#         rip_unique_exact  = sum(1 for s in rip_sets if (s not in dt_set)  and (s not in crm_set))
#         crm_unique_exact  = sum(1 for s in crm_sets if (s not in dt_set)  and (s not in rip_set))

#         row = {
#             'Dataset': key[0], 'Labeling': key[1], 'Feature Encoding': key[2],
#             'total_dt': n_dt, 'total_ripperk': n_rip, 'total_crm': n_crm,

#             # Pairwise exact-equality counts & %
#             'dt==ripperk': dt_eq_rip,   'dt==ripperk_%':  (dt_eq_rip / n_dt * 100) if n_dt else 0.0,
#             'dt==crm':     dt_eq_crm,   'dt==crm_%':      (dt_eq_crm / n_dt * 100) if n_dt else 0.0,

#             'ripperk==dt': rip_eq_dt,   'ripperk==dt_%':  (rip_eq_dt / n_rip * 100) if n_rip else 0.0,
#             'ripperk==crm':rip_eq_crm,  'ripperk==crm_%': (rip_eq_crm / n_rip * 100) if n_rip else 0.0,

#             'crm==dt':     crm_eq_dt,   'crm==dt_%':      (crm_eq_dt / n_crm * 100) if n_crm else 0.0,
#             'crm==ripperk':crm_eq_rip,  'crm==ripperk_%': (crm_eq_rip / n_crm * 100) if n_crm else 0.0,

#             # Exact equality with both other models
#             'dt==both(ripperk,crm)':  dt_eq_both,  'dt==both_%':  (dt_eq_both  / n_dt  * 100) if n_dt  else 0.0,
#             'ripperk==both(dt,crm)':  rip_eq_both, 'ripperk==both_%': (rip_eq_both / n_rip * 100) if n_rip else 0.0,
#             'crm==both(dt,ripperk)':  crm_eq_both, 'crm==both_%': (crm_eq_both / n_crm * 100) if n_crm else 0.0,

#             # Unique (exact)
#             # 'unique_dt_exact': dt_unique_exact,
#             # 'unique_ripperk_exact': rip_unique_exact,
#             # 'unique_crm_exact': crm_unique_exact,
#         }
#         rows.append(row)

#     summary_exact = pd.DataFrame(rows).sort_values(KEY).reset_index(drop=True)
#     pct_cols = [c for c in summary_exact.columns if c.endswith('_%')]
#     summary_exact[pct_cols] = summary_exact[pct_cols].round(1)
#     display(summary_exact)
#     return summary_exact

# # Run:
# summary_exact = build_match_summary_exact(all_rules_dt_expanded, all_rules_ripperk_expanded, all_rules_crm_expanded)


In [13]:
import re
import pandas as pd

KEY = ['Dataset', 'Labeling', 'Feature Encoding']

# --- helpers ---------------------------------------------------------------
def _feature_cols(df, prefix='feature_', suffix='_lhs'):
    pat = re.compile(rf'^{re.escape(prefix)}\d+{re.escape(suffix)}$')
    return [c for c in df.columns if pat.match(c)]

def _lhs_set_from_row(row, feat_cols, lower=True, strip=True):
    vals = []
    for c in feat_cols:
        v = row.get(c)
        if pd.isna(v):
            continue
        s = str(v)
        if strip:
            s = s.strip()
        if s == '':
            continue
        if lower:
            s = s.lower()
        vals.append(s)
    return frozenset(vals)

def df_to_rule_lists_and_totals(df, key_cols=KEY, lower=True):
    """
    Group the rules of ``df`` by ``key_cols`` and describe each group's LHS sets.

    Every row is turned into a frozenset of its feature_<n>_lhs tokens; rows
    whose set is empty are dropped. The result maps each group key to a dict:
      - 'list':  the LHS sets of the group, duplicates kept (RAW occurrences)
      - 'total': number of entries in 'list'
      - 'set':   the DISTINCT LHS sets, as a list

    Raises ValueError when ``df`` has no feature_<n>_lhs columns.
    The input frame is not modified (a copy is used).
    """
    lhs_columns = _feature_cols(df)
    if not lhs_columns:
        raise ValueError("No feature_*_lhs columns found in df.")

    def _row_to_lhs_set(record):
        return _lhs_set_from_row(record, lhs_columns, lower=lower)

    work = df.copy()
    work['__lhs_set__'] = work.apply(_row_to_lhs_set, axis=1)
    has_tokens = work['__lhs_set__'].map(len) > 0
    work = work[has_tokens]  # rules without any LHS token are ignored

    per_key = {}
    for group_key, frame in work.groupby(key_cols, dropna=False):
        occurrences = list(frame['__lhs_set__'])
        per_key[group_key] = {
            'list':  occurrences,
            'total': len(occurrences),
            'set':   list(set(occurrences)),
        }
    return per_key

# --- exact matching using RAW occurrences -----------------------------------
def count_exact_matches_raw(A_list, B_list):
    """Number of entries of A_list (duplicates counted) that also occur in B_list."""
    if A_list and B_list:
        # Membership only: repeated entries in B do not inflate the count.
        known = set(B_list)
        return len([entry for entry in A_list if entry in known])
    return 0

def count_exact_matches_both_raw(A_list, B_list, C_list):
    """Number of entries of A_list (duplicates counted) present in BOTH B_list and C_list."""
    if A_list and B_list and C_list:
        shared = set(B_list) & set(C_list)
        return len([entry for entry in A_list if entry in shared])
    return 0

def count_unique_exact_raw(A_list, B_list, C_list):
    """Number of entries of A_list (duplicates counted) found in NEITHER B_list nor C_list."""
    if not A_list:
        return 0
    seen_elsewhere = set(B_list) | set(C_list)
    return len([entry for entry in A_list if entry not in seen_elsewhere])

# --- main (exact + RAW matching) -------------------------------------------
def build_match_summary_exact_raw(dt_df, ripperk_df, crm_df):
    """
    Compare the DT, RIPPERk and CRM rule frames by EXACT equality of LHS sets.

    For every (Dataset, Labeling, Feature Encoding) key seen in any of the three
    frames, one summary row is produced with:
      - the raw rule totals per model,
      - pairwise counts "<a>==<b>" of model-a rules that also occur in model b,
        plus the percentage over model a's raw total,
      - counts/percentages of rules that occur in both other models,
      - counts of rules that occur in neither other model.
    Percentages are rounded to one decimal; a model without rules gets 0.0.

    The summary frame is shown with ``display`` and returned.
    """
    def _pct(count, total):
        # share of `total` in percent; 0.0 when the model has no rules for this key
        return (count / total * 100) if total else 0.0

    dt_groups = df_to_rule_lists_and_totals(dt_df)
    rip_groups = df_to_rule_lists_and_totals(ripperk_df)
    crm_groups = df_to_rule_lists_and_totals(crm_df)

    # stand-in for a model that has no rules under a given key (never mutated)
    no_rules = {'list': [], 'total': 0, 'set': []}

    records = []
    for key in sorted(set(dt_groups) | set(rip_groups) | set(crm_groups)):
        dt_entry = dt_groups.get(key, no_rules)
        rip_entry = rip_groups.get(key, no_rules)
        crm_entry = crm_groups.get(key, no_rules)

        dt_rules, n_dt = dt_entry['list'], dt_entry['total']
        rip_rules, n_rip = rip_entry['list'], rip_entry['total']
        crm_rules, n_crm = crm_entry['list'], crm_entry['total']

        # pairwise: rules of the first model that also occur in the second
        dt_in_rip = count_exact_matches_raw(dt_rules, rip_rules)
        dt_in_crm = count_exact_matches_raw(dt_rules, crm_rules)
        rip_in_dt = count_exact_matches_raw(rip_rules, dt_rules)
        rip_in_crm = count_exact_matches_raw(rip_rules, crm_rules)
        crm_in_dt = count_exact_matches_raw(crm_rules, dt_rules)
        crm_in_rip = count_exact_matches_raw(crm_rules, rip_rules)

        # rules that occur in both of the other two models
        dt_in_both = count_exact_matches_both_raw(dt_rules, rip_rules, crm_rules)
        rip_in_both = count_exact_matches_both_raw(rip_rules, dt_rules, crm_rules)
        crm_in_both = count_exact_matches_both_raw(crm_rules, dt_rules, rip_rules)

        # rules that occur in neither of the other two models
        dt_only = count_unique_exact_raw(dt_rules, rip_rules, crm_rules)
        rip_only = count_unique_exact_raw(rip_rules, dt_rules, crm_rules)
        crm_only = count_unique_exact_raw(crm_rules, dt_rules, rip_rules)

        # NOTE: insertion order below defines the column order of the summary.
        records.append({
            'Dataset': key[0], 'Labeling': key[1], 'Feature Encoding': key[2],

            'total_dt': n_dt,
            'total_ripperk': n_rip,
            'total_crm': n_crm,

            'dt==ripperk': dt_in_rip,
            'dt==ripperk_%': _pct(dt_in_rip, n_dt),
            'dt==crm': dt_in_crm,
            'dt==crm_%': _pct(dt_in_crm, n_dt),

            'ripperk==dt': rip_in_dt,
            'ripperk==dt_%': _pct(rip_in_dt, n_rip),
            'ripperk==crm': rip_in_crm,
            'ripperk==crm_%': _pct(rip_in_crm, n_rip),

            'crm==dt': crm_in_dt,
            'crm==dt_%': _pct(crm_in_dt, n_crm),
            'crm==ripperk': crm_in_rip,
            'crm==ripperk_%': _pct(crm_in_rip, n_crm),

            'dt==both(ripperk,crm)': dt_in_both,
            'dt==both_%': _pct(dt_in_both, n_dt),
            'ripperk==both(dt,crm)': rip_in_both,
            'ripperk==both_%': _pct(rip_in_both, n_rip),
            'crm==both(dt,ripperk)': crm_in_both,
            'crm==both_%': _pct(crm_in_both, n_crm),

            'unique_dt_exact': dt_only,
            'unique_ripperk_exact': rip_only,
            'unique_crm_exact': crm_only,
        })

    summary = pd.DataFrame(records).sort_values(KEY).reset_index(drop=True)
    percent_columns = [name for name in summary.columns if name.endswith('_%')]
    summary[percent_columns] = summary[percent_columns].round(1)
    display(summary)
    return summary

# Build (and display) the exact-match summary for the three normalized rule frames.
summary_exact_raw = build_match_summary_exact_raw(
    dt_df=all_rules_dt_expanded,
    ripperk_df=all_rules_ripperk_expanded,
    crm_df=all_rules_crm_expanded,
)

Unnamed: 0,Dataset,Labeling,Feature Encoding,total_dt,total_ripperk,total_crm,dt==ripperk,dt==ripperk_%,dt==crm,dt==crm_%,...,crm==ripperk_%,"dt==both(ripperk,crm)",dt==both_%,"ripperk==both(dt,crm)",ripperk==both_%,"crm==both(dt,ripperk)",crm==both_%,unique_dt_exact,unique_ripperk_exact,unique_crm_exact
0,BPI15A,declare,baseline,3,1,0,1,33.3,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,2,0,0
1,BPI15A,declare,bs_data,3,1,0,1,33.3,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,2,0,0
2,BPI15A,declare,bs_dwd,3,1,0,1,33.3,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,2,0,0
3,BPI15A,declare,dec_data,3,1,0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,3,1,0
4,BPI15A,declare,dec_dwd,3,1,0,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,traffic,sequential,hybrid_dwd,2,1,1,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,2,1,1
134,traffic,sequential,hybrid_dwd_data,2,1,2,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,2,1,2
135,traffic,sequential,payload,133,43,14,0,0.0,0,0.0,...,0.0,0,0.0,0,0.0,0,0.0,133,43,14
136,traffic,sequential,seq_combined,33,10,33,1,3.0,1,3.0,...,3.0,1,3.0,1,10.0,1,3.0,32,9,32


In [17]:
# Columns reported for the traffic dataset: identifiers, raw totals, pairwise match rates.
cols = [
    "Labeling", "Feature Encoding",
    "total_dt", "total_ripperk", "total_crm",
    "crm==ripperk_%", "crm==dt_%", "ripperk==dt_%",
]

# Rows whose Dataset is "traffic" (case-insensitive), restricted to the columns above.
traffic_summary = (
    summary_exact_raw
    .loc[summary_exact_raw['Dataset'].str.lower().eq('traffic'), cols]
    .reset_index(drop=True)
)
display(traffic_summary)


Unnamed: 0,Labeling,Feature Encoding,total_dt,total_ripperk,total_crm,crm==ripperk_%,crm==dt_%,ripperk==dt_%
0,declare,baseline,14,5,7,0.0,0.0,20.0
1,declare,bs_data,69,23,16,0.0,6.2,0.0
2,declare,bs_dwd,3,1,3,0.0,33.3,0.0
3,declare,dec_data,3,1,10,0.0,0.0,100.0
4,declare,dec_dwd,3,1,9,0.0,0.0,100.0
5,declare,dec_dwd_data,2,1,8,0.0,0.0,100.0
6,declare,declare,4,1,18,5.6,0.0,0.0
7,declare,dwd,4,1,2,0.0,0.0,0.0
8,declare,hybrid,4,1,6,0.0,0.0,0.0
9,declare,hybrid_data,2,1,6,0.0,0.0,100.0


In [18]:
import numpy as np
import pandas as pd

ts = traffic_summary.copy()

# 1) Wrap "Feature Encoding" values in \detokenize{...} so their underscores are
#    printed verbatim by LaTeX. Missing or already-wrapped values are left alone.
cols_lower = {c.lower(): c for c in ts.columns}
fe_col = cols_lower.get('feature encoding')
if fe_col is None:
    raise KeyError("Column 'Feature Encoding' not found in traffic_summary.")
ts[fe_col] = ts[fe_col].map(lambda v: v if pd.isna(v) or (isinstance(v, str) and v.startswith(r'\detokenize{'))
                            else r'\detokenize{' + str(v) + '}')

# 2) Format all numeric columns to max 2 decimals (integers remain integers)
num_cols = ts.select_dtypes(include=['number']).columns

def fmt_max2(x):
    """Render a number with at most two decimals and no trailing zeros; missing values pass through."""
    if pd.isna(x):
        return x
    v = float(x)
    s = f"{v:.2f}".rstrip('0').rstrip('.')  # e.g., 3.00->3, 3.40->3.4, 3.456->3.46
    # a tiny negative rounds to "-0.00" and would otherwise be printed as "-0"
    return '0' if s == '-0' else s

for c in num_cols:
    ts[c] = ts[c].map(fmt_max2)

# 3) Escape the column headers.
#    to_latex(escape=False) is needed to keep \detokenize{...}, but it also writes
#    the headers raw: the '%' in e.g. "crm==dt_%" would start a LaTeX comment that
#    swallows the rest of the header row, and '_' is invalid outside math mode.
def _latex_escape_header(name):
    """Backslash-escape the LaTeX special characters & % $ # _ in a column header."""
    return re.sub(r'([&%$#_])', r'\\\1', str(name))

# Escaped headers live on a separate frame so `ts` keeps its original column names.
ts_latex = ts.rename(columns=_latex_escape_header)

# 4) Export to LaTeX
from pathlib import Path
out_dir = Path("tables")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "traffic_summary.tex"

latex_str = ts_latex.to_latex(
    index=False,
    escape=False,  # keep \detokenize{...} and the pre-escaped headers
    longtable=True,
    caption="Traffic: rule matching summary (exact, raw totals).",
    label="tab:traffic_summary",
    na_rep="--"
)

out_path.write_text(latex_str, encoding="utf-8")

print(f"LaTeX table written to: {out_path.resolve()}")


LaTeX table written to: /Users/rowangriffioen/Github/Thesis/causal_deviance_mining_selection/tables/traffic_summary.tex


## Coverage calculation

In [None]:
# ---------- Recompute coverage with fixed labeling folder names + add all_case_ids (robust assignment) ----------

# Root folder that holds the binned feature files.
BASE_DIR = "3.2_binned_features"

# Normalized Labeling value -> fixed name of its feature folder
LABELING_FOLDER_MAP = dict(
    declare="declare_features",
    sequential="sequential_features",
    payload="payload_features",
)

def _find_ci_subdir(parent: str, target: str) -> str | None:
    """Case-insensitive lookup of subdir 'target' inside 'parent'."""
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    """
    Locate the folder
      {base_dir}/{Dataset}/{declare|sequential|payload}_features/{Encoding}
    and return its path, or None when any level cannot be found (or when
    dataset, labeling or encoding is None).

    The dataset folder must match exactly. The labeling folder comes from
    LABELING_FOLDER_MAP (falling back to "<labeling>_features") and, like the
    encoding folder, is looked up exactly first and case-insensitively second.
    """
    if any(part is None for part in (dataset, labeling, encoding)):
        return None

    dataset_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(dataset_dir):
        return None

    labeling_key = str(labeling).strip().lower()
    folder_name = LABELING_FOLDER_MAP.get(labeling_key) or f"{labeling_key}_features"

    labeling_dir = os.path.join(dataset_dir, folder_name)
    if not os.path.isdir(labeling_dir):
        # no exact hit: retry ignoring case
        labeling_dir = _find_ci_subdir(dataset_dir, folder_name)
        if not labeling_dir:
            return None

    # Encoding folder: exact first, then case-insensitive
    exact_dir = os.path.join(labeling_dir, str(encoding))
    if not os.path.isdir(exact_dir):
        return _find_ci_subdir(labeling_dir, str(encoding))
    return exact_dir

# ---- Recompute per-rule coverage using the updated resolver ----

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _norm_numeric(col: pd.Series) -> pd.Series:
    if col.dtype == bool:
        return col.astype(int)
    out = pd.to_numeric(col, errors='coerce')
    if out.isna().all() and col.dtype == object:
        return col
    return out

NUM_SUFFIX_RE = re.compile(r"_(\-?\d+(?:\.\d+)?)$")

def _match_single_feature(df: pd.DataFrame, feat: str) -> pd.Series:
    feat = str(feat).strip().strip('"').strip("'")

    # A) exact one-hot column
    if feat in df.columns:
        col = _norm_numeric(df[feat])
        return (col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")

    # B) base_<num>
    m = NUM_SUFFIX_RE.search(feat)
    if m:
        base_col = feat[:m.start()]
        desired_str = m.group(1)
        desired = float(desired_str)
        if base_col in df.columns:
            col = _norm_numeric(df[base_col])
            if pd.api.types.is_numeric_dtype(col):
                return (col == desired).fillna(False)
            else:
                return (col.astype(str) == desired_str).fillna(False)
        # rare: indicator named with suffix
        if feat in df.columns:
            col = _norm_numeric(df[feat])
            return ((col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")).fillna(False)

    # C) binned: base_(...) or base_[...]
    pos1 = feat.rfind("_(")
    pos2 = feat.rfind("_[")
    split_pos = max(pos1, pos2)
    if split_pos != -1:
        base_col = feat[:split_pos]
        bin_val  = feat[split_pos+1:]  # includes the bracket
        if base_col in df.columns:
            return (df[base_col].astype(str) == bin_val).fillna(False)

    # fallback
    return pd.Series(False, index=df.index)

def _match_rule(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:
            mask &= _match_single_feature(df, f)
            if not mask.any():
                break
    # enforce RHS label
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

# Input validation
if not isinstance(globals().get('all_rules_crm_expanded'), pd.DataFrame):
    raise ValueError("Expected all_rules_crm_expanded to be present as a DataFrame.")

if "Encoding" in all_rules_crm_expanded.columns:
    ENC_COL = "Encoding"
elif "Feature Encoding" in all_rules_crm_expanded.columns:
    ENC_COL = "Feature Encoding"
else:
    raise KeyError("Could not find 'Encoding' or 'Feature Encoding' in all_rules_crm_expanded.")

# Prepare output frame + columns.
# The list-valued columns are assigned as plain lists (positional) with one fresh list per
# row: assigning a default-indexed Series would be aligned on the index and turn into NaN
# for any frame whose index is not 0..n-1, and `[[]] * n` would share a single list object.
crm_rules_all_with_coverage = all_rules_crm_expanded.copy()
crm_rules_all_with_coverage["covered_case_ids"] = [[] for _ in range(len(crm_rules_all_with_coverage))]
crm_rules_all_with_coverage["n_covered_cases"] = 0
crm_rules_all_with_coverage["all_case_ids"] = [[] for _ in range(len(crm_rules_all_with_coverage))]

# Evaluate per (Dataset, Labeling, Encoding)
group_cols = [c for c in ["Dataset", "Labeling", ENC_COL] if c in crm_rules_all_with_coverage.columns]

# feature_*_lhs columns in numeric order (does not depend on the group, so computed once)
feat_cols = sorted(
    (c for c in crm_rules_all_with_coverage.columns if re.fullmatch(r"feature_\d+_lhs", c)),
    key=lambda x: int(re.findall(r"\d+", x)[0]),
)

for keys, g in crm_rules_all_with_coverage.groupby(group_cols):
    # A single grouping column may yield a scalar key; normalize so zip() pairs it correctly
    if not isinstance(keys, tuple):
        keys = (keys,)
    vals = dict(zip(group_cols, keys))
    ds  = vals.get("Dataset")
    lab = vals.get("Labeling")
    enc = vals.get(ENC_COL)

    # Groups whose encoded table cannot be located keep the empty defaults
    enc_path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
    if enc_path is None:
        continue

    # os.listdir returns names in arbitrary order; sort so the chosen CSV is reproducible
    csv_files = sorted(f for f in os.listdir(enc_path) if f.lower().endswith(".csv"))
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])

    df_enc = pd.read_csv(csv_path)
    if "Label" not in df_enc.columns:
        continue
    case_col = _infer_case_col(df_enc)

    # All case IDs of the encoded table, stored on every rule of the group
    # (index-aligned Series avoids list broadcasting errors)
    all_ids = df_enc[case_col].dropna().astype(str).unique().tolist()
    crm_rules_all_with_coverage.loc[g.index, "all_case_ids"] = pd.Series(
        [all_ids] * len(g), index=g.index, dtype="object"
    )

    for idx, row in g.iterrows():
        feats = [row.get(c, "") for c in feat_cols]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
        rhs = row.get("RHS_label", None)

        # Cases that satisfy all LHS features and carry the rule's RHS label
        mask = _match_rule(df_enc, feats, rhs)
        covered = df_enc.loc[mask, case_col].dropna().astype(str).unique().tolist()

        crm_rules_all_with_coverage.at[idx, "covered_case_ids"] = covered
        crm_rules_all_with_coverage.at[idx, "n_covered_cases"] = len(covered)

# Optional: sort for readability
sort_keys = [c for c in ["Dataset","Labeling",ENC_COL,'n_covered_cases'] if c in crm_rules_all_with_coverage.columns]
if sort_keys:
    crm_rules_all_with_coverage = crm_rules_all_with_coverage.sort_values(by=sort_keys).reset_index(drop=True)

# Show the enriched result
crm_rules_all_with_coverage


In [None]:
# ---------- Compute per-rule coverage for DT & RIPPERK expanded rules + add all_case_ids ----------
# Supports feature predicates like:
#   01_HOOFD_011 = 0
#   alternate_precedence:(01_HOOFD_011,01_HOOFD_015):Data <= 0.0
#   monitoringResource|first|literal_binned_(560925.0, 12941730.0] = 0/1
#
# Folder layout (fixed):
#   3.2_binned_features/{Dataset}/{declare_features|sequential_features|payload_features}/{Encoding}/*.csv

import os
import re
import pandas as pd
import numpy as np

# Root folder of the binned feature tables (same value as in the CRM coverage cell).
BASE_DIR = "3.2_binned_features"
# Normalized Labeling value -> "<labeling>_features" folder name.
LABELING_FOLDER_MAP = {
    labeling: f"{labeling}_features"
    for labeling in ("declare", "sequential", "payload")
}

# -------- Path & dataframe utilities --------

def _find_ci_subdir(parent: str, target: str) -> str | None:
    """Case-insensitive lookup of subdir 'target' inside 'parent'."""
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    """
    Expect structure:
      {base_dir}/{Dataset}/{declare|sequential|payload}_features/{Encoding}
    Uses case-insensitive matching for the final Encoding subdir.
    """
    if dataset is None or labeling is None or encoding is None:
        return None

    ds_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(ds_dir):
        return None

    lab_norm = str(labeling).strip().lower()
    lab_folder = LABELING_FOLDER_MAP.get(lab_norm, f"{lab_norm}_features")

    lab_dir = os.path.join(ds_dir, lab_folder)
    if not os.path.isdir(lab_dir):
        ci_lab = _find_ci_subdir(ds_dir, lab_folder)
        if not ci_lab:
            return None
        lab_dir = ci_lab

    # Encoding folder: exact first, then case-insensitive
    enc_exact = os.path.join(lab_dir, str(encoding))
    if os.path.isdir(enc_exact):
        return enc_exact
    return _find_ci_subdir(lab_dir, str(encoding))

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _to_numeric(series: pd.Series) -> pd.Series:
    if series.dtype == bool:
        return series.astype(int)
    return pd.to_numeric(series, errors="coerce")

def _strip_one_layer_quotes(s: str) -> str:
    s = str(s).strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

# -------- Feature expression parsing & evaluation --------

# Detects: left [op] right  (op in >=, <=, !=, ==, =, >, <)
_OP_RE = re.compile(r"^(?P<col>.+?)\s*(?P<op>>=|<=|!=|==|=|>|<)\s*(?P<val>.+?)\s*$")

def _parse_feature_expr(expr: str):
    """
    Parse a DT/RIPPERK feature expression like:
      "X = 1", "X <= 0.3", "foo_binned_(a,b] = 0"
    Returns (col, op, val_str) or (None,None,None) if not parsed.
    """
    s = str(expr).strip()
    m = _OP_RE.match(s)
    if not m:
        return None, None, None
    col = m.group("col").strip()
    op  = m.group("op")
    if op == "=":  # normalize single '='
        op = "=="
    val_str = _strip_one_layer_quotes(m.group("val"))
    return col, op, val_str

def _coerce_value(val_str: str):
    """
    Try to coerce RHS value to number if possible; else return string.
    Accepts ints/floats like '0', '1', '0.0', '-2.5'. Falls back to raw string.
    """
    low = str(val_str).strip().lower()
    if low in {"true", "false"}:
        return 1 if low == "true" else 0
    try:
        num = float(val_str)
        return int(num) if num.is_integer() else num
    except Exception:
        return val_str

def _cmp_op(series: pd.Series, op: str, rhs):
    """
    Compare series to rhs using op. Handles numeric vs string gracefully.
    For inequalities, coerces series to numeric; NaNs -> False.
    """
    if op in ("==", "!="):
        rhs_is_num = isinstance(rhs, (int, float, np.number))
        if rhs_is_num:
            s_num = _to_numeric(series)
            res = (s_num == rhs) if op == "==" else (s_num != rhs)
            if op == "==":
                s_str = series.astype(str).str.strip()
                rhs_str = str(rhs)
                res = res.fillna(s_str == rhs_str)
            else:
                res = res.fillna(True)
            return res.fillna(False)
        else:
            s_str = series.astype(str).str.strip()
            rhs_str = str(rhs).strip()
            return (s_str == rhs_str) if op == "==" else (s_str != rhs_str)

    # Inequalities -> numeric compare
    try:
        rhs_num = float(rhs)
    except Exception:
        return pd.Series(False, index=series.index)
    s_num = _to_numeric(series)
    if op == ">":
        return (s_num > rhs_num).fillna(False)
    if op == "<":
        return (s_num < rhs_num).fillna(False)
    if op == ">=":
        return (s_num >= rhs_num).fillna(False)
    if op == "<=":
        return (s_num <= rhs_num).fillna(False)
    return pd.Series(False, index=series.index)

_BINVAL_SPLIT_RE = re.compile(r"(.+?)(_[(\[][^)\]]+[)\]])$")  # splits "..._binned_(..]" into (base, "_(..]")

def _match_single_feature_dt(df: pd.DataFrame, expr: str) -> pd.Series:
    """
    Boolean mask of the rows of `df` satisfying one DT/RIPPERK predicate.

    Resolution order:
      1. the left-hand side is a column of `df`        -> direct comparison
      2. the left-hand side looks like "<base>_(bin)":
         a. a column with the same name ignoring case -> direct comparison
         b. column <base> stores bin labels as text   -> "== 1" / "!= 0" mean
            base == bin, "== 0" / "!= 1" mean base != bin
    Anything that cannot be parsed or resolved matches no row.
    """
    col, op, val_str = _parse_feature_expr(expr)
    if col is None:
        return pd.Series(False, index=df.index)

    rhs = _coerce_value(val_str)

    # 1) exact column -> direct compare
    if col in df.columns:
        return _cmp_op(df[col], op, rhs).fillna(False)

    # 2) binned notation "<base>_(binlabel)"
    split = _BINVAL_SPLIT_RE.match(col)
    if not split:
        return pd.Series(False, index=df.index)

    base_col = split.group(1)
    bin_label = split.group(2)[1:]  # without the leading underscore: "(...]" or "[...]"

    # 2a) same column name, different letter case
    by_lower = {name.lower(): name for name in df.columns}
    ci_name = by_lower.get(col.lower())
    if ci_name:
        return _cmp_op(df[ci_name], op, rhs).fillna(False)

    # 2b) base column holds the bin label; only 0/1 flags with ==/!= are meaningful
    is_flag = isinstance(rhs, (int, float, np.number)) and rhs in (0, 1)
    if base_col in df.columns and op in ("==", "!=") and is_flag:
        in_bin = df[base_col].astype(str) == bin_label
        # "== 1" and "!= 0" both ask for membership in the bin
        expect_in_bin = (rhs == 1) == (op == "==")
        return (in_bin if expect_in_bin else ~in_bin).fillna(False)

    # No match
    return pd.Series(False, index=df.index)

def _match_rule_dt(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    """AND all feature conditions and enforce Label == rhs_label."""
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:
            mask &= _match_single_feature_dt(df, f)
            if not mask.any():
                break
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

def _compute_coverage_for_rules(expanded_df: pd.DataFrame, name_hint: str):
    """Generic coverage driver for DT/RIPPERK expanded tables. Adds covered_case_ids, n_covered_cases, all_case_ids."""
    if not isinstance(expanded_df, pd.DataFrame):
        raise ValueError(f"{name_hint}: expanded_df must be a DataFrame.")

    # Determine encoding column name
    ENC_COL = "Encoding" if "Encoding" in expanded_df.columns else \
              ("Feature Encoding" if "Feature Encoding" in expanded_df.columns else None)
    if ENC_COL is None:
        raise KeyError(f"{name_hint}: Could not find 'Encoding' or 'Feature Encoding'.")

    out = expanded_df.copy()
    out["covered_case_ids"] = [[] for _ in range(len(out))]
    out["n_covered_cases"] = 0
    # NEW: add all_case_ids as object-dtype Series to avoid broadcasting issues
    out["all_case_ids"] = pd.Series([[]] * len(out), dtype="object")

    group_cols = [c for c in ["Dataset", "Labeling", ENC_COL] if c in out.columns]
    # find all feature_*_lhs columns in numeric order
    feat_cols = [c for c in out.columns if re.fullmatch(r"feature_\d+_lhs", c)]
    feat_cols = sorted(feat_cols, key=lambda x: int(re.findall(r"\d+", x)[0])) if feat_cols else []

    for keys, g in out.groupby(group_cols):
        vals = dict(zip(group_cols, keys))
        ds  = vals.get("Dataset")
        lab = vals.get("Labeling")
        enc = vals.get(ENC_COL)

        enc_path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
        if enc_path is None:
            continue

        csv_files = [f for f in os.listdir(enc_path) if f.lower().endswith(".csv")]
        if not csv_files:
            continue
        csv_path = os.path.join(enc_path, csv_files[0])

        df_enc = pd.read_csv(csv_path)
        if "Label" not in df_enc.columns:
            continue
        case_col = _infer_case_col(df_enc)

        # NEW: compute all case IDs once per group; assign as index-aligned Series
        all_ids = df_enc[case_col].dropna().astype(str).unique().tolist()
        out.loc[g.index, "all_case_ids"] = pd.Series([all_ids] * len(g), index=g.index, dtype="object")

        for idx, row in g.iterrows():
            feats = [row.get(c, "") for c in feat_cols]
            feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
            rhs = row.get("RHS_label", None)

            mask = _match_rule_dt(df_enc, feats, rhs)
            covered = df_enc.loc[mask, case_col].dropna().astype(str).unique().tolist()

            out.at[idx, "covered_case_ids"] = covered
            out.at[idx, "n_covered_cases"] = len(covered)

    sort_keys = [c for c in ["Dataset","Labeling",ENC_COL,'n_covered_cases'] if c in out.columns]
    if sort_keys:
        out = out.sort_values(by=sort_keys).reset_index(drop=True)
    return out

# ---- Build the coverage-enriched tables (DT first, then RIPPERK) ----
if 'all_rules_dt_expanded' not in globals():
    raise ValueError("all_rules_dt_expanded not found. Run the expansion cell first.")
all_rules_dt_with_coverage = _compute_coverage_for_rules(all_rules_dt_expanded, "DT")

if 'all_rules_ripperk_expanded' not in globals():
    raise ValueError("all_rules_ripperk_expanded not found. Run the expansion cell first.")
all_rules_ripperk_with_coverage = _compute_coverage_for_rules(all_rules_ripperk_expanded, "RIPPERK")


### (DEBUG)

In [None]:
# # ---------- Recompute DT/RIPPERK coverage with diagnostics (feature-only vs with RHS) ----------

# DEBUG_FEATURE_ONLY = True  # set False to skip the diagnostic columns

# def _compute_coverage_for_rules_with_debug(expanded_df: pd.DataFrame, name_hint: str):
#     if not isinstance(expanded_df, pd.DataFrame):
#         raise ValueError(f"{name_hint}: expanded_df must be a DataFrame.")

#     ENC_COL = "Encoding" if "Encoding" in expanded_df.columns else \
#               ("Feature Encoding" if "Feature Encoding" in expanded_df.columns else None)
#     if ENC_COL is None:
#         raise KeyError(f"{name_hint}: Could not find 'Encoding' or 'Feature Encoding'.")

#     out = expanded_df.copy()
#     out["covered_case_ids"] = [[] for _ in range(len(out))]
#     out["n_covered_cases"] = 0
#     if DEBUG_FEATURE_ONLY:
#         out["feature_only_case_ids"] = [[] for _ in range(len(out))]
#         out["n_feature_only"] = 0
#         out["n_feature_only_label1"] = 0
#         out["n_feature_only_label0"] = 0

#     group_cols = [c for c in ["Dataset", "Labeling", ENC_COL] if c in out.columns]
#     feat_cols = [c for c in out.columns if re.fullmatch(r"feature_\d+_lhs", c)]
#     feat_cols = sorted(feat_cols, key=lambda x: int(re.findall(r"\d+", x)[0])) if feat_cols else []

#     for keys, g in out.groupby(group_cols):
#         vals = dict(zip(group_cols, keys))
#         ds  = vals.get("Dataset")
#         lab = vals.get("Labeling")
#         enc = vals.get(ENC_COL)

#         enc_path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
#         if enc_path is None:
#             continue

#         csv_files = [f for f in os.listdir(enc_path) if f.lower().endswith(".csv")]
#         if not csv_files:
#             continue
#         csv_path = os.path.join(enc_path, csv_files[0])

#         df_enc = pd.read_csv(csv_path)
#         if "Label" not in df_enc.columns:
#             continue
#         case_col = _infer_case_col(df_enc)
#         label_num = pd.to_numeric(df_enc["Label"], errors="coerce")

#         for idx, row in g.iterrows():
#             feats = [row.get(c, "") for c in feat_cols]
#             feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
#             rhs = row.get("RHS_label", None)

#             # --- feature-only mask (no RHS enforcement) ---
#             mask_feat_only = pd.Series(True, index=df_enc.index)
#             for fexpr in feats:
#                 if fexpr:
#                     mask_feat_only &= _match_single_feature_dt(df_enc, fexpr)
#                     if not mask_feat_only.any():
#                         break

#             if DEBUG_FEATURE_ONLY:
#                 covered_feat_only = df_enc.loc[mask_feat_only, case_col].dropna().astype(str).unique().tolist()
#                 out.at[idx, "feature_only_case_ids"] = covered_feat_only
#                 out.at[idx, "n_feature_only"] = len(covered_feat_only)
#                 # also show how many of those are Label==1 vs Label==0 (helps see the mismatch)
#                 if len(covered_feat_only) > 0:
#                     lbl_series = label_num[mask_feat_only]
#                     out.at[idx, "n_feature_only_label1"] = int((lbl_series == 1).sum())
#                     out.at[idx, "n_feature_only_label0"] = int((lbl_series == 0).sum())

#             # --- final coverage with RHS enforcement (this is your original definition) ---
#             mask_with_rhs = mask_feat_only.copy()
#             if rhs in (0, 1):
#                 mask_with_rhs &= (label_num == rhs)
#             else:
#                 mask_with_rhs &= False

#             covered = df_enc.loc[mask_with_rhs, case_col].dropna().astype(str).unique().tolist()
#             out.at[idx, "covered_case_ids"] = covered
#             out.at[idx, "n_covered_cases"] = len(covered)

#     sort_keys = [c for c in ["Dataset","Labeling",ENC_COL] if c in out.columns]
#     if sort_keys:
#         out = out.sort_values(by=sort_keys).reset_index(drop=True)
#     return out

# # Recompute with diagnostics
# all_rules_dt_with_coverage = _compute_coverage_for_rules_with_debug(all_rules_dt_expanded, "DT")
# all_rules_ripperk_with_coverage = _compute_coverage_for_rules_with_debug(all_rules_ripperk_expanded, "RIPPERK")


In [None]:
# def move_coverage_cols(df: pd.DataFrame) -> pd.DataFrame:
#     target_order = ["covered_case_ids", "n_covered_cases"]
#     cols = list(df.columns)

#     # Remove the targets if present
#     for c in target_order:
#         if c in cols:
#             cols.remove(c)

#     # Insert them at indices 6 and 7 (0-based), i.e. as the 7th and 8th columns
#     insert_pos = 6
#     for c in target_order:
#         if c in df.columns:
#             cols.insert(insert_pos, c)
#             insert_pos += 1

#     return df[cols]

# # Apply to both dataframes
# crm_rules_all_with_coverage = move_coverage_cols(crm_rules_all_with_coverage)
# all_rules_dt_with_coverage = move_coverage_cols(all_rules_dt_with_coverage)
# all_rules_ripperk_with_coverage = move_coverage_cols(all_rules_ripperk_with_coverage)


In [None]:
# Show the three coverage-enriched rule tables: DT, RIPPERK, then CRM.
display(
    all_rules_dt_with_coverage,
    all_rules_ripperk_with_coverage,
    crm_rules_all_with_coverage,
)

In [None]:
# ---------- Greedy 80% coverage (POSITIVE CLASS ONLY: RHS_label == 1) for CRM, DT, RIPPERK ----------

import pandas as pd
import numpy as np
from itertools import chain
import os
import re

TARGET_COVERAGE = 0.80  # 80%

# --- Helpers for the greedy selection (note: the path helpers further down in this cell are redefined unconditionally and replace any earlier definitions) ---

def _get_enc_col(df: pd.DataFrame) -> str:
    if "Encoding" in df.columns:
        return "Encoding"
    if "Feature Encoding" in df.columns:
        return "Feature Encoding"
    raise KeyError("Neither 'Encoding' nor 'Feature Encoding' found.")

def _tie_break_score(row: pd.Series) -> tuple:
    """Higher is better when gains tie."""
    return (
        float(row.get("LB odds ratio", -np.inf)) if pd.notna(row.get("LB odds ratio", np.nan)) else -np.inf,
        float(row.get("Precision", -np.inf)) if pd.notna(row.get("Precision", np.nan)) else -np.inf,
        float(row.get("Recall", -np.inf)) if pd.notna(row.get("Recall", np.nan)) else -np.inf,
        float(row.get("F1", -np.inf)) if pd.notna(row.get("F1", np.nan)) else -np.inf,
        -len(str(row.get("Rule", ""))),
    )

def _greedy_set_cover(group_df: pd.DataFrame, all_cases: set, covered_lists_col: str = "covered_case_ids",
                      max_rules: int | None = None) -> tuple[list, set]:
    """
    Greedy set cover on POSITIVE-class rules only (caller should pre-filter).
    Returns (selected_rule_indices, covered_cases_set).
    """
    if not len(all_cases):
        return [], set()

    idxs = list(group_df.index)
    rule_sets = {}
    for i in idxs:
        v = group_df.at[i, covered_lists_col]
        if isinstance(v, list):
            rule_sets[i] = set(map(str, v))
        else:
            rule_sets[i] = set()

    selected = []
    covered = set()

    achievable = set().union(*rule_sets.values()) if rule_sets else set()
    if not achievable:
        return [], set()

    while len(covered) / len(all_cases) < TARGET_COVERAGE:
        best_idx = None
        best_gain = 0
        best_tiebreak = None

        for i in idxs:
            if i in selected:
                continue
            gain = len(rule_sets[i] - covered)
            if gain > best_gain:
                best_idx = i
                best_gain = gain
                best_tiebreak = _tie_break_score(group_df.loc[i])
            elif gain == best_gain and gain > 0:
                t = _tie_break_score(group_df.loc[i])
                if best_tiebreak is None or t > best_tiebreak:
                    best_idx = i
                    best_tiebreak = t

        if best_gain == 0 or best_idx is None:
            break

        selected.append(best_idx)
        covered |= rule_sets[best_idx]

        if max_rules is not None and len(selected) >= max_rules:
            break

    return selected, covered

# --- Keep BASE_DIR / LABELING_FOLDER_MAP from earlier cells when they exist; otherwise set the defaults ---
if "BASE_DIR" not in globals():
    BASE_DIR = "3.2_binned_features"

if "LABELING_FOLDER_MAP" not in globals():
    LABELING_FOLDER_MAP = {
        "declare": "declare_features",
        "sequential": "sequential_features",
        "payload": "payload_features",
    }

def _find_ci_subdir(parent: str, target: str) -> str | None:
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    if dataset is None or labeling is None or encoding is None:
        return None
    ds_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(ds_dir):
        return None
    lab_norm = str(labeling).strip().lower()
    lab_folder = LABELING_FOLDER_MAP.get(lab_norm, f"{lab_norm}_features")
    lab_dir = os.path.join(ds_dir, lab_folder)
    if not os.path.isdir(lab_dir):
        ci_lab = _find_ci_subdir(ds_dir, lab_folder)
        if not ci_lab:
            return None
        lab_dir = ci_lab
    enc_exact = os.path.join(lab_dir, str(encoding))
    if os.path.isdir(enc_exact):
        return enc_exact
    return _find_ci_subdir(lab_dir, str(encoding))

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _positive_universe_for_group(ds, lab, enc, group_df: pd.DataFrame, enc_col: str) -> set:
    """
    Set of positive case IDs (as strings) for one (ds, lab, enc) group.

    Primary source: the encoded CSV resolved via `_resolve_enc_path`; returns the
    case IDs of rows with Label == 1.
    Fallback (no folder, no CSV, or no 'Label' column): union of `covered_case_ids`
    over the positive rules in `group_df`, which may under-estimate the denominator.
    When `group_df` has no 'RHS_label' column every rule is treated as positive.

    `enc_col` is accepted for interface compatibility and is not used.
    """
    path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
    if path:
        # os.listdir returns names in arbitrary order; sort so the chosen CSV is reproducible
        csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
        if csv_files:
            df_enc = pd.read_csv(os.path.join(path, csv_files[0]))
            if "Label" in df_enc.columns:
                case_col = _infer_case_col(df_enc)
                is_positive = pd.to_numeric(df_enc["Label"], errors="coerce") == 1
                pos_ids = df_enc.loc[is_positive, case_col].dropna().astype(str).unique().tolist()
                return set(pos_ids)

    # Fallback: union of covered cases for positive rules only (achievable positives).
    # `group_df.get("RHS_label", 1) == 1` would collapse to the scalar True when the column
    # is missing and make `.loc[True, ...]` raise, so the missing-column case is explicit.
    if "RHS_label" in group_df.columns:
        pos_rows = group_df.loc[group_df["RHS_label"] == 1]
    else:
        pos_rows = group_df
    pos_lists = [lst for lst in pos_rows["covered_case_ids"] if isinstance(lst, list)]
    return set(chain.from_iterable(pos_lists)) if pos_lists else set()

def _summarize_model_pos(df_with_cov: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Positive-class (RHS_label==1) greedy 80% coverage per (Dataset,Labeling,Encoding).
    Returns columns:
      Dataset, Labeling, Encoding, {MODEL}_achieved_coverage, {MODEL}_nr_rules
    """
    if not isinstance(df_with_cov, pd.DataFrame):
        raise ValueError(f"{model_name}: input must be a DataFrame.")

    enc_col = _get_enc_col(df_with_cov)
    out_rows = []
    group_cols = [c for c in ["Dataset", "Labeling", enc_col] if c in df_with_cov.columns]

    for keys, g in df_with_cov.groupby(group_cols, dropna=False):
        vals = dict(zip(group_cols, keys))
        ds  = vals.get("Dataset")
        lab = vals.get("Labeling")
        enc = vals.get(enc_col)

        # Positive rules only
        g_pos = g[g.get("RHS_label", 1) == 1]
        # Positive universe (all positive cases in the dataset)
        pos_all = _positive_universe_for_group(ds, lab, enc, g_pos, enc_col)

        selected, covered = _greedy_set_cover(g_pos, pos_all)

        achieved = (len(covered) / len(pos_all)) if len(pos_all) else 0.0
        out_rows.append({
            "Dataset": ds,
            "Labeling": lab,
            "Encoding": enc,
            f"{model_name}_achieved_coverage": round(achieved, 4),
            f"{model_name}_nr_rules": len(selected),
        })

    out = pd.DataFrame(out_rows)
    if not out.empty:
        out = out.sort_values(by=["Dataset", "Labeling", "Encoding"], kind="mergesort").reset_index(drop=True)
    return out

# ---- Positive-class summary per model (each coverage table must already exist) ----
if 'crm_rules_all_with_coverage' not in globals():
    raise RuntimeError("crm_rules_all_with_coverage not found. Run the CRM coverage cell first.")
if 'all_rules_dt_with_coverage' not in globals():
    raise RuntimeError("all_rules_dt_with_coverage not found. Run the DT coverage cell first.")
if 'all_rules_ripperk_with_coverage' not in globals():
    raise RuntimeError("all_rules_ripperk_with_coverage not found. Run the RIPPERK coverage cell first.")

crm_summary_pos = _summarize_model_pos(crm_rules_all_with_coverage, "CRM")
dt_summary_pos = _summarize_model_pos(all_rules_dt_with_coverage, "DT")
ripperk_summary_pos = _summarize_model_pos(all_rules_ripperk_with_coverage, "RIPPERK")

# ---- One table: outer-join the three summaries on the group keys ----
summary_pos_merged = crm_summary_pos.merge(
    dt_summary_pos, how="outer", on=["Dataset","Labeling","Encoding"]
)
summary_pos_merged = summary_pos_merged.merge(
    ripperk_summary_pos, how="outer", on=["Dataset","Labeling","Encoding"]
)

# Groups missing for a model count as zero coverage with zero rules
for col in summary_pos_merged.columns:
    if col in ("CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage"):
        summary_pos_merged[col] = summary_pos_merged[col].fillna(0.0)
    elif col in ("CRM_nr_rules", "DT_nr_rules", "RIPPERK_nr_rules"):
        summary_pos_merged[col] = summary_pos_merged[col].fillna(0).astype(int)

# Fixed column order: keys first, then (coverage, nr_rules) per model
desired_cols = ["Dataset", "Labeling", "Encoding"]
desired_cols += ["CRM_achieved_coverage", "CRM_nr_rules"]
desired_cols += ["DT_achieved_coverage", "DT_nr_rules"]
desired_cols += ["RIPPERK_achieved_coverage", "RIPPERK_nr_rules"]
summary_pos_merged = summary_pos_merged.reindex(columns=desired_cols)

summary_pos_merged


In [None]:
# --- Export POS coverage to LaTeX: only Traffic rows, drop 'Dataset' column, keep 2-dec coverage ---

import pandas as pd
import numpy as np

if 'summary_pos_merged' not in globals() or not isinstance(summary_pos_merged, pd.DataFrame):
    raise ValueError("summary_pos_merged not found. Run the previous cell that builds it first.")

df = summary_pos_merged.copy()

# 1) Filter to Traffic only (case-insensitive)
if "Dataset" not in df.columns:
    raise KeyError("Expected a 'Dataset' column in summary_pos_merged.")
mask_traffic = df["Dataset"].astype(str).str.strip().str.lower() == "traffic"
df = df.loc[mask_traffic].copy()

# 2) Identify encoding column
enc_col = None
for cand in ("Encoding", "Feature Encoding", "encoding", "feature encoding"):
    if cand in df.columns:
        enc_col = cand
        break
if enc_col is None:
    raise KeyError("No encoding column found (looked for 'Encoding' / 'Feature Encoding').")

# 3) Detokenize Encoding for LaTeX
df[enc_col] = df[enc_col].astype(str).apply(lambda s: rf"\detokenize{{{s}}}")

# 4) Ensure types and format coverage columns to 2 decimals
cov_cols = ["CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage"]
for c in cov_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

int_cols = ["CRM_nr_rules","DT_nr_rules","RIPPERK_nr_rules"]
for c in int_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

def fmt2(x):
    """Format a number with exactly two decimals; missing values become an empty string."""
    if pd.isna(x):
        return ""
    return format(float(x), ".2f")
# Only the coverage columns get the fixed two-decimal formatter.
formatters = {c: fmt2 for c in cov_cols if c in df.columns}

# 5) Drop Dataset and order remaining columns
desired_order = [
    "Labeling", enc_col,
    "CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage",
    "CRM_nr_rules", "DT_nr_rules", "RIPPERK_nr_rules",
]
present = [c for c in desired_order if c in df.columns]
df = df.reindex(columns=present)

# 6) Export to LaTeX
latex_path = "5_analysis/rule_coverage_final.tex"
# Create the output folder if needed, as the other export cells do, so the
# write below does not fail on a fresh checkout.
Path(latex_path).parent.mkdir(parents=True, exist_ok=True)

# escape=False is required so the \detokenize{...} cells are written verbatim,
# but it also leaves the header row unescaped. Column names such as
# "CRM_achieved_coverage" contain underscores, which are invalid in LaTeX text
# mode, so the header labels are escaped by hand.
latex_header = [str(c).replace("_", r"\_") for c in df.columns]

with open(latex_path, "w", encoding="utf-8") as f:
    f.write(
        df.to_latex(
            index=False,
            escape=False,
            header=latex_header,
            formatters=formatters,
        )
    )

print(f"LaTeX table written to: {latex_path}")
df


### Filter best performing rules

In [None]:
# Source of the CRM rules used for the "best rules" tables below.
# Alternative input (DHL features):
#   os.path.join('5_analysis', 'random','DHL', 'dhl_features' ,'combined_sorted.csv')
all_rules_crm_path = os.path.join("5_analysis", "random", "combined_sorted_all.csv")
crm_rules = pd.read_csv(all_rules_crm_path, sep=",")

# The DT and RIPPER-k rule files are not needed in this section; kept for reference:
#   dt_rules      = pd.read_csv(os.path.join('5_analysis', 'dt', 'rules_dt.csv'), sep=',')
#   ripperk_rules = pd.read_csv(os.path.join('5_analysis', 'ripperk', 'rules_ripperk.csv'), sep=',')

In [None]:
# Pull the consequent ("Label" or "!Label") off the end of every rule string;
# rules that do not end in either form yield NaN.
rule_text = crm_rules["Rule"].astype(str)
rhs = rule_text.str.extract(r"-->\s*(!?Label)\s*$", expand=False)

# Positive rules are exactly those whose consequent is "Label"
mask_pos = rhs == "Label"
crm_rules = crm_rules[mask_pos].copy()

print(f"Kept {len(crm_rules)} positive rules (Label).")


### Top 10 overall per labeling

In [None]:
# --- Configuration ---
LAB_COL = "Labeling"
TOP_N = 10

# The lower-bound odds-ratio column is referred to as both "LB odds ratio" and
# "LB OR" in this notebook. Resolve whichever spelling the loaded CSV actually
# uses, so the metric list and the tie-breakers always name the same column.
# Previously METRICS used "LB odds ratio" while TIE_BREAKERS used "LB OR"; the
# tie-breaker list is filtered by column presence further down, so the second
# tie-breaker was silently dropped.
# Falls back to the first candidate so a missing column is still reported by
# the sanity check below.
_LB_CANDIDATES = ("LB odds ratio", "LB OR")
LB_COL = next((c for c in _LB_CANDIDATES if c in crm_rules.columns), _LB_CANDIDATES[0])

METRICS = [LB_COL, "Support LHS", "Confidence", "Lift", "Conviction"]
TIE_BREAKERS = ["Support LHS", LB_COL]  # higher is better

# --- Sanity checks ---
missing = [c for c in [LAB_COL] + METRICS if c not in crm_rules.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

def _rank_within_group(g: pd.DataFrame, col: str) -> pd.Series:
    """
    Rank a single metric within a labeling group (higher is better).
    NaNs get worst rank (group_size + 1).
    """
    r = g[col].rank(method="dense", ascending=False)
    return r.fillna(len(g) + 1)

# --- Compute per-metric ranks within each labeling ---
df_ranked = crm_rules.copy()

# crm_rules is re-read from the CSV in this section, so its labeling column
# still holds the raw names (e.g. "traffic_decl3", "sepsis_mr_tr"). Normalize
# them on the working copy with the notebook's own helper; otherwise the
# "declare" / "sequential" / "payload" lookups below return None and the
# exported tables are empty. crm_rules itself is left untouched.
_normalize_labeling_column(df_ranked)

rank_cols = []

for m in METRICS:
    rcol = f"rank::{m}"
    # Hand only the metric column to the helper: applying over the whole frame
    # (grouping column included) is deprecated in recent pandas releases.
    df_ranked[rcol] = (
        df_ranked.groupby(LAB_COL, group_keys=False)[[m]]
                 .apply(_rank_within_group, col=m)
    )
    rank_cols.append(rcol)

# --- Aggregate ranks: lower is better (since rank 1 is best) ---
df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# --- Build per-labeling top-N (sort within each labeling by rank_agg asc, then tie-breakers desc) ---
sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
ascending = [True] + [False] * (len(sort_cols) - 1)

top10_by_labeling = {
    lab: grp.sort_values(sort_cols, ascending=ascending).head(TOP_N).reset_index(drop=True)
    for lab, grp in df_ranked.groupby(LAB_COL, dropna=False)
}

# --- Convenience variables for the common three label values (None if absent) ---
top10_declare = top10_by_labeling.get("declare")
top10_sequential = top10_by_labeling.get("sequential")
top10_payload = top10_by_labeling.get("payload")

# Make an absent labeling visible here instead of only as an empty LaTeX table later.
missing_labelings = [
    lab for lab in ("declare", "sequential", "payload") if lab not in top10_by_labeling
]
if missing_labelings:
    print(f"Warning: no rules found for labeling(s): {missing_labelings}")

# (Optional) quick peek
for lab, df_top in top10_by_labeling.items():
    print(f"\nTop {TOP_N} for labeling = {lab}")
    display(df_top)


In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# --- Config ---
out_dir = Path("5_analysis")
out_path = out_dir / "top10_labeling_tables.tex"
out_dir.mkdir(parents=True, exist_ok=True)

# Metric columns rendered with two decimals. The lower-bound odds-ratio column
# appears in this notebook as both "LB OR" and "LB odds ratio", so both
# spellings are listed; names that are absent from a frame are simply ignored
# by the helpers below.
TARGET_DEC_COLS = ["LB OR", "LB odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]

# Columns to drop if present. The per-metric rank columns are derived from
# TARGET_DEC_COLS so the two lists cannot drift apart.
DROP_COLS = {
    "labeling", "Labeling",
    "Odds ratio", "UB OR", "n12", "n21",
    "Fair set count", "Stratified",
    "rank_agg",
} | {f"rank::{metric}" for metric in TARGET_DEC_COLS}

def detok(s: pd.Series) -> pd.Series:
    """Wrap each value in \\detokenize{...} so LaTeX prints it verbatim.

    Missing values (NaN / None) and empty strings become empty strings. The
    missing check is done before the string conversion: converting first turns
    NaN into the text "nan", which would then be wrapped like a real value.

    Parameters
    ----------
    s : Series of values to wrap; non-string values are converted with ``str``.

    Returns
    -------
    Series of strings indexed like ``s``.
    """
    def _wrap(value) -> str:
        if pd.isna(value):
            return ""
        text = str(value)
        return rf"\detokenize{{{text}}}" if text != "" else ""

    return s.map(_wrap)

def prepare_for_latex(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a top-N frame that is ready for ``to_latex``.

    Steps:
      * drop every column listed in DROP_COLS and every ``rank::<metric>``
        helper column,
      * detokenize 'Rule' and the encoding column ('Encoding', or else
        'Feature Encoding'),
      * coerce the TARGET_DEC_COLS that are present to numeric
        (unparseable values become NaN) so they can be formatted.

    ``None`` and empty frames are returned unchanged.
    """
    if df is None or len(df) == 0:
        return df

    # The ranking cell names its helper columns "rank::<metric>". Match on the
    # prefix as well as on DROP_COLS so a rank column never leaks into the
    # table when a metric is spelled differently from the hard-coded names.
    cols_to_drop = [
        c for c in df.columns
        if c in DROP_COLS or str(c).startswith("rank::")
    ]
    df2 = df.drop(columns=cols_to_drop, errors="ignore").copy()

    # Detokenize Rule
    if "Rule" in df2.columns:
        df2["Rule"] = detok(df2["Rule"])

    # Detokenize Encoding (handle either 'Encoding' or 'Feature Encoding')
    if "Encoding" in df2.columns:
        df2["Encoding"] = detok(df2["Encoding"])
    elif "Feature Encoding" in df2.columns:
        df2["Feature Encoding"] = detok(df2["Feature Encoding"])

    # Make sure target metric columns are numeric (coerce errors to NaN)
    for col in TARGET_DEC_COLS:
        if col in df2.columns:
            df2[col] = pd.to_numeric(df2[col], errors="coerce")

    return df2

def df_to_latex_block(df: pd.DataFrame, title: str) -> str:
    """Render one frame as a titled LaTeX chunk.

    The chunk is a ``% ===== title =====`` comment line, the table produced by
    ``to_latex`` (no index, no escaping) and two trailing newlines. Columns
    named in TARGET_DEC_COLS are printed with two decimals (NaN as an empty
    cell). A ``None`` or empty frame yields only a "(no rows)" comment line.
    """
    if df is None or len(df) == 0:
        return f"% {title}: (no rows)\n\n"

    def _two_decimals(value):
        return "" if pd.isna(value) else f"{float(value):.2f}"

    # Same formatter for every target column that exists in this frame
    formatters = {
        name: _two_decimals for name in TARGET_DEC_COLS if name in df.columns
    }
    table = df.to_latex(index=False, escape=False, formatters=formatters)
    return "".join((f"% ===== {title} =====\n", table, "\n\n"))

# Clean each per-labeling top-10 frame for LaTeX
declare_tex_df    = prepare_for_latex(top10_declare)
sequential_tex_df = prepare_for_latex(top10_sequential)
payload_tex_df    = prepare_for_latex(top10_payload)

# One LaTeX chunk per labeling, in a fixed order
latex_parts = [
    df_to_latex_block(frame, title)
    for frame, title in (
        (declare_tex_df,    "Top 10 — declare"),
        (sequential_tex_df, "Top 10 — sequential"),
        (payload_tex_df,    "Top 10 — payload"),
    )
]

# Write the header comment followed by all chunks into a single .tex file
with open(out_path, "w", encoding="utf-8") as f:
    f.write("% Auto-generated top-10 tables per labeling\n\n")
    for part in latex_parts:
        f.write(part)

print(f"Saved LaTeX tables to: {out_path}")


### DHL top 10

In [None]:
# # --- Configuration ---
# METRICS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]
# TOP_N = 20
# TIE_BREAKERS = ["Support LHS", "LB OR"]  # higher is better

# # --- Sanity checks ---
# missing = [c for c in METRICS if c not in crm_rules.columns]
# if missing:
#     raise ValueError(f"Missing expected columns: {missing}")

# def _rank_global(df: pd.DataFrame, col: str) -> pd.Series:
#     """
#     Rank a single metric across the full dataset (higher is better).
#     NaNs get worst rank (n + 1).
#     """
#     r = df[col].rank(method="dense", ascending=False)
#     return r.fillna(len(df) + 1)

# # --- Compute per-metric ranks (global, not grouped) ---
# df_ranked = crm_rules.copy()
# rank_cols = []
# for m in METRICS:
#     rcol = f"rank::{m}"
#     df_ranked[rcol] = _rank_global(df_ranked, m)
#     rank_cols.append(rcol)

# # --- Aggregate ranks: lower is better (since rank 1 is best) ---
# df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# # --- Build overall top-N (sort by rank_agg asc, then tie-breakers desc) ---
# sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
# ascending = [True] + [False] * (len(sort_cols) - 1)

# top10_overall = (
#     df_ranked
#     .sort_values(sort_cols, ascending=ascending)
#     .head(TOP_N)
#     .reset_index(drop=True)
# )

# # (Optional) quick peek
# display(top10_overall)


In [None]:
# # --- Config ---
# out_dir = Path("5_analysis")
# out_path = out_dir / "top10_overall.tex"
# out_dir.mkdir(parents=True, exist_ok=True)

# TARGET_DEC_COLS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]

# # Columns to drop if present
# DROP_COLS = {
#     "labeling", "Labeling",
#     "Odds ratio", "UB OR", "n12", "n21",
#     "Fair set count", "Stratified",
#     "rank::LB OR", "rank::Support LHS", "rank::Confidence",
#     "rank::Lift", "rank::Conviction", "rank_agg"
# }

# def detok(s: pd.Series) -> pd.Series:
#     """Wrap strings in \\detokenize{...}. Leave NaN as empty strings."""
#     return s.astype(str).fillna("").map(lambda x: rf"\detokenize{{{x}}}" if x != "" else "")

# def prepare_for_latex(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     Drop unwanted cols, detokenize 'Rule' and 'Encoding' (or 'Feature Encoding'),
#     and ensure numeric dtype for TARGET_DEC_COLS (so we can format them).
#     """
#     if df is None or len(df) == 0:
#         return df

#     # Drop columns if present
#     cols_to_drop = [c for c in df.columns if c in DROP_COLS]
#     df2 = df.drop(columns=cols_to_drop, errors="ignore").copy()

#     # Detokenize Rule
#     if "Rule" in df2.columns:
#         df2["Rule"] = detok(df2["Rule"])

#     # Detokenize Encoding (handle either 'Encoding' or 'Feature Encoding')
#     if "Encoding" in df2.columns:
#         df2["Encoding"] = detok(df2["Encoding"])
#     elif "Feature Encoding" in df2.columns:
#         df2["Feature Encoding"] = detok(df2["Feature Encoding"])

#     # Ensure numeric for metrics
#     for col in TARGET_DEC_COLS:
#         if col in df2.columns:
#             df2[col] = pd.to_numeric(df2[col], errors="coerce")

#     # (Optional) Put 'Rule' first if present
#     cols = list(df2.columns)
#     if "Rule" in cols:
#         cols = ["Rule"] + [c for c in cols if c != "Rule"]
#         df2 = df2[cols]

#     return df2

# def df_to_latex_block(df: pd.DataFrame, title: str) -> str:
#     """Convert a dataframe to a LaTeX block, formatting selected columns to 2 decimals."""
#     if df is None or len(df) == 0:
#         return f"% {title}: (no rows)\n\n"

#     # Build per-column formatters only for TARGET_DEC_COLS (others unchanged)
#     formatters = {}
#     for col in TARGET_DEC_COLS:
#         if col in df.columns:
#             formatters[col] = (lambda x, c=col: "" if pd.isna(x) else f"{float(x):.2f}")

#     return (
#         f"% ===== {title} =====\n"
#         + df.to_latex(index=False, escape=False, formatters=formatters)
#         + "\n\n"
#     )

# # Expect `top10_overall` to be defined from the previous ranking step
# overall_tex_df = prepare_for_latex(top10_overall)

# latex_text = df_to_latex_block(overall_tex_df, "Top 10 — Overall")

# with open(out_path, "w", encoding="utf-8") as f:
#     f.write("% Auto-generated top-10 table (overall)\n\n")
#     f.write(latex_text)

# print(f"Saved LaTeX table to: {out_path}")


### Top 3 per encoding

In [None]:
# # --- Configuration ---
# LAB_COL = "labeling"   # expects normalized values like 'declare', 'sequential', 'payload'
# METRICS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]
# TOP_PER_ENCODING = 3
# TIE_BREAKERS = ["Support LHS", "LB OR"]  # higher is better

# # --- Resolve encoding column name robustly ---
# ENC_CANDIDATES = ["Encoding", "Feature Encoding", "Feature encoding", "encoding", "feature encoding"]
# ENC_COL = next((c for c in ENC_CANDIDATES if c in crm_rules.columns), None)
# if ENC_COL is None:
#     raise ValueError(f"Could not find an encoding column among: {ENC_CANDIDATES}")

# # --- Sanity checks ---
# missing = [c for c in [LAB_COL, ENC_COL] + METRICS if c not in crm_rules.columns]
# if missing:
#     raise ValueError(f"Missing expected columns: {missing}")

# def _rank_within_group(g: pd.DataFrame, col: str) -> pd.Series:
#     """Rank a single metric within a labeling group (higher is better)."""
#     r = g[col].rank(method="dense", ascending=False)
#     return r.fillna(len(g) + 1)

# # --- Compute per-metric ranks within each labeling ---
# df_ranked = crm_rules.copy()
# rank_cols = []
# for m in METRICS:
#     rcol = f"rank::{m}"
#     df_ranked[rcol] = df_ranked.groupby(LAB_COL, group_keys=False).apply(_rank_within_group, col=m)
#     rank_cols.append(rcol)

# # --- Aggregate ranks: lower is better (since rank 1 is best) ---
# df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# # --- Sort keys: rank_agg asc, then tie-breakers desc ---
# sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
# ascending = [True] + [False] * (len(sort_cols) - 1)

# # --- Get top-3 per encoding within each labeling ---
# def _topk_per_encoding_within_label(grp: pd.DataFrame) -> pd.DataFrame:
#     srt = grp.sort_values(sort_cols, ascending=ascending)
#     out = (
#         srt.groupby(ENC_COL, group_keys=False)
#            .head(TOP_PER_ENCODING)
#     )
#     # Sort final selection per Encoding (alphabetical order)
#     return out.sort_values(by=ENC_COL).reset_index(drop=True)

# top_by_labeling = {
#     lab: _topk_per_encoding_within_label(grp)
#           .reset_index(drop=True)
#     for lab, grp in df_ranked.groupby(LAB_COL, dropna=False)
# }

# # --- Convenience variables for the common three label values (if they exist) ---
# top3_declare    = top_by_labeling.get("declare")
# top3_sequential = top_by_labeling.get("sequential")
# top3_payload    = top_by_labeling.get("payload")

# # (Optional) quick peek
# for lab, df_top in top_by_labeling.items():
#     print(f"\nTop {TOP_PER_ENCODING} per encoding for labeling = {lab}")
#     display(df_top)


In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# --- Config ---
# NOTE(review): this cell exports the same top-10 frames to the same file as
# the top-10 export cell above. The helpers (detok, prepare_for_latex,
# df_to_latex_block) and the TARGET_DEC_COLS / DROP_COLS constants are
# deliberately NOT redefined here: a second copy silently shadows the first,
# so a fix made in one place would be undone by the other.
# TODO(review): the "Top 3 per encoding" ranking cell is commented out, so no
# top3_* frames exist. Confirm whether this cell should export those to its
# own file or be removed.
_required_names = (
    "prepare_for_latex", "df_to_latex_block", "TARGET_DEC_COLS", "DROP_COLS",
    "top10_declare", "top10_sequential", "top10_payload",
)
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        f"Run the top-10 ranking and export cells first; missing: {_missing_names}"
    )

out_dir = Path("5_analysis")
out_path = out_dir / "top10_labeling_tables.tex"
out_dir.mkdir(parents=True, exist_ok=True)

# Prepare dataframes
declare_tex_df    = prepare_for_latex(top10_declare)
sequential_tex_df = prepare_for_latex(top10_sequential)
payload_tex_df    = prepare_for_latex(top10_payload)

# Build one combined .tex file
latex_parts = []
latex_parts.append(df_to_latex_block(declare_tex_df,    "Top 10 — declare"))
latex_parts.append(df_to_latex_block(sequential_tex_df, "Top 10 — sequential"))
latex_parts.append(df_to_latex_block(payload_tex_df,    "Top 10 — payload"))

with open(out_path, "w", encoding="utf-8") as f:
    f.write("% Auto-generated top-10 tables per labeling\n\n")
    f.writelines(latex_parts)

print(f"Saved LaTeX tables to: {out_path}")
