In [None]:
# Imports
from pathlib import Path
import os
import glob
import re
import itertools

import numpy as np
import pandas as pd


In [None]:
# Load rule datasets
# Locations of the CRM, DT and RIPPERk rule tables, relative to the notebook's
# working directory.
all_rules_crm_path = Path("5_analysis/random/combined_sorted_all.csv")
all_rules_dt_path = Path("5_analysis/dt/rules_dt.csv")
all_rules_ripperk_path = Path("5_analysis/ripperk/rules_ripperk.csv")

# One DataFrame per rule table; later cells normalise and expand these frames.
all_rules_crm = pd.read_csv(all_rules_crm_path)
all_rules_dt = pd.read_csv(all_rules_dt_path)
all_rules_ripperk = pd.read_csv(all_rules_ripperk_path)


In [None]:
# Distinct encodings per labeling
# Counts how many different "Feature Encoding" values occur for each "Labeling"
# value in the CRM rule table. Uses the raw CSV column names, i.e. it runs before
# the normalisation cell below rewrites the labeling values.
unique_counts = (
    all_rules_crm
    .groupby("Labeling")["Feature Encoding"]
    .nunique()
    .reset_index(name="unique_encodings")  # two columns: Labeling, unique_encodings
)

display(unique_counts)

In [None]:
# Normalize labeling values and align column names across dataframes

def _normalize_labeling_column(df: pd.DataFrame) -> None:
    """
    Normalize the labeling/Labeling column to one of:
    {'declare', 'sequential', 'payload'} based on substrings.
    Operates in-place.
    """
    for col in ("labeling", "Labeling"):
        if col in df.columns:
            lower = df[col].astype(str).str.lower()
            df[col] = np.select(
                [
                    lower.str.contains("decl", na=False),
                    lower.str.contains("mr_tr", na=False),
                    lower.str.contains("payload", na=False),
                ],
                ["declare", "sequential", "payload"],
                default=df[col],
            )

def _canonicalize_and_subset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize common column variants and return only the key columns
    if present: ['Dataset', 'Labeling', 'Feature Encoding', 'Rule'].
    """
    rename_map = {}
    for c in df.columns:
        cl = c.lower()
        if cl == "dataset":
            rename_map[c] = "Dataset"
        elif cl == "labeling":
            rename_map[c] = "Labeling"
        elif cl in {"feature encoding", "encoding"}:
            rename_map[c] = "Feature Encoding"
        elif cl == "rule":
            rename_map[c] = "Rule"
    df2 = df.rename(columns=rename_map)

    keep = [c for c in ["Dataset", "Labeling", "Feature Encoding", "Rule"] if c in df2.columns]
    return df2[keep].copy() if keep else df2.copy()

# Apply normalization and column alignment to all three dataframes
# The frames are looked up and rebound by name via globals(), so after this loop
# the three module-level variables refer to the canonicalized subsets.
for name in ("all_rules_crm", "all_rules_dt", "all_rules_ripperk"):
    df = globals()[name]
    _normalize_labeling_column(df)       # in place: rewrites the labeling values
    df = _canonicalize_and_subset(df)    # new frame: canonical names, key columns only
    globals()[name] = df

# Quick check
display(all_rules_crm)
display(all_rules_dt)
display(all_rules_ripperk)


## Splitting CRM

In [None]:
# 1) Parse LHS and RHS from all_rules_crm["Rule"]

def extract_lhs_exact(rule_str: str) -> str:
    """
    Return the part of the rule that precedes the first '-->'.

    Whitespace directly in front of the arrow is excluded; quotes and brackets
    are kept as written. When the rule has no arrow the whole text is returned.
    """
    text = str(rule_str)
    found = re.search(r"^(.*?)(?=\s*-->)", text)
    if found is None:
        return text
    return found.group(1)

def parse_rhs_label(rule_str: str):
    """
    Decode the right-hand side of a rule.

    Returns 1 when the text after '-->' starts with 'Label', 0 when it starts
    with '!Label', and None when no such right-hand side is found.
    """
    found = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if found is None:
        return None
    return {"Label": 1, "!Label": 0}[found.group(1)]

# Work on a copy so the normalised CRM table itself is not given extra columns.
crm_df = all_rules_crm.copy()
crm_df["LHS_features"] = crm_df["Rule"].apply(extract_lhs_exact)  # text before '-->'
crm_df["RHS_label"]    = crm_df["Rule"].apply(parse_rhs_label)    # 1, 0 or None

# 2) Split LHS into up to 3 features

def _find_outer_brackets_span(text: str):
    """Indices of the outermost [...] in text; returns (start, end)."""
    s = str(text)
    start = s.find("[")
    if start < 0:
        return None, None

    depth = 0
    in_s = in_d = esc = False
    end = None
    for i, ch in enumerate(s[start:], start):
        if esc:
            esc = False
            continue
        if ch == "\\":
            esc = True
            continue

        if in_s:
            if ch == "'":
                in_s = False
            continue
        if in_d:
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            in_s = True
            continue
        if ch == '"':
            in_d = True
            continue

        if ch == "[":
            depth += 1
            continue
        if ch == "]":
            depth -= 1
            if depth == 0:
                end = i
                break
    return (start, end)

def _split_top_level_commas(content: str):
    """Split on commas that are not inside quotes."""
    parts, curr = [], ""
    in_s = in_d = esc = False
    for ch in content:
        if esc:
            curr += ch
            esc = False
            continue
        if ch == "\\":
            curr += ch
            esc = True
            continue

        if in_s:
            curr += ch
            if ch == "'":
                in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            curr += ch
            in_s = True
            continue
        if ch == '"':
            curr += ch
            in_d = True
            continue

        if ch == ",":
            parts.append(curr.strip())
            curr = ""
        else:
            curr += ch
    parts.append(curr.strip())
    return parts

def _strip_one_layer_quotes(s: str):
    """Remove a single pair of outer quotes if present."""
    s = s.strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

def split_lhs_items(lhs_text: str):
    """
    Turn a bracketed, comma-separated LHS into a list of feature strings.

    Example: "['A', 'B', 'C']" -> ['A', 'B', 'C'] and "['A']" -> ['A'].
    Returns an empty list when no complete [...] span is found. Empty pieces
    are dropped; one layer of quotes is removed from each remaining piece.
    """
    text = str(lhs_text)
    open_at, close_at = _find_outer_brackets_span(text)
    if open_at is None or close_at is None:
        return []

    items = []
    for piece in _split_top_level_commas(text[open_at + 1 : close_at]):
        if piece == "":
            continue
        items.append(_strip_one_layer_quotes(piece).strip())
    return items

def _pad3(items):
    """Keep at most 3 items; right-pad with empty strings."""
    items = items[:3]
    return items + [""] * (3 - len(items))

# Split every LHS into its items and force each row to exactly three slots
# (_pad3 drops anything beyond the third item).
lhs_split = crm_df["LHS_features"].apply(split_lhs_items).apply(_pad3)
lhs_df = pd.DataFrame(lhs_split.tolist(), columns=["feature_1_lhs", "feature_2_lhs", "feature_3_lhs"])

# 3) Assemble expanded table
cols_present = [c for c in ["Dataset", "Labeling", "Feature Encoding", "Rule", "LHS_features", "RHS_label"] if c in crm_df.columns]

# Both sides get a fresh 0..n-1 index so the column-wise concat pairs rows by position.
all_rules_crm_expanded = pd.concat(
    [crm_df[cols_present].reset_index(drop=True), lhs_df.reset_index(drop=True)],
    axis=1,
).reset_index(drop=True)

# Sort if keys are available
sort_keys = [c for c in ["Dataset", "Labeling", "Feature Encoding"] if c in all_rules_crm_expanded.columns]
if sort_keys:
    all_rules_crm_expanded = (
        all_rules_crm_expanded.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)
    )

# Last expression of the cell: rendered as the cell output.
all_rules_crm_expanded

In [None]:
# ---------- Expand DT and RIPPERk rules: support up to 15 LHS features ----------

def extract_lhs_exact(rule_str: str) -> str:
    """
    Return the text in front of the first '-->' with surrounding whitespace removed.

    Falls back to the whole (stripped) text when the rule contains no arrow.
    Note: this definition strips the result, unlike the same-named function in
    the CRM cell, and replaces it once this cell has run.
    """
    text = str(rule_str)
    found = re.search(r"^(.*?)(?=\s*-->)", text)
    lhs = found.group(1) if found else text
    return lhs.strip()

def parse_rhs_label(rule_str: str):
    """Give 1 for a '--> Label' rule, 0 for '--> !Label', None if neither is present."""
    hit = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if hit:
        return int(hit.group(1) == "Label")
    return None

# Separator between DT/RIPPERK predicates: logical-and '∧' (U+2227), or ASCII '&',
# together with any whitespace around it.
_AND_SPLIT_RE = re.compile(r"\s*(?:∧|&)\s*")

def split_lhs_items_dt(lhs_text: str):
    """
    Split a DT/RIPPERK left-hand side into its individual predicates.

    The LHS is expected to look like ``[feature1 ∧ feature2 ∧ ...]``. One pair
    of enclosing square brackets is removed when the text both starts with '['
    and ends with ']', the remainder is split on '∧' / '&', and every predicate
    is trimmed. Blank predicates are dropped; blank input yields ``[]``.
    """
    body = str(lhs_text).strip()
    has_outer_brackets = len(body) >= 2 and body.startswith("[") and body.endswith("]")
    if has_outer_brackets:
        body = body[1:-1]
    if body == "":
        return []

    trimmed = (piece.strip() for piece in _AND_SPLIT_RE.split(body))
    return [piece for piece in trimmed if piece != ""]

def _padN(items, n=15):
    items = items[:n]
    return items + [""] * (n - len(items))

def _expand_df_with_lhs_rhs(df_in: pd.DataFrame, name_hint: str, max_features: int = 15):
    """
    Given a dataframe with at least ['Rule'] column, produce an expanded version with:
      - LHS_features: exact text before -->
      - RHS_label: {1,0,None}
      - feature_1_lhs ... feature_{max_features}_lhs (split on ∧ / & for DT/RIPPERK)
    Keeps any of ['Dataset','Labeling','Feature Encoding','Rule'] that exist.
    """
    if "Rule" not in df_in.columns:
        raise KeyError(f"{name_hint}: expected a 'Rule' column.")

    df = df_in.copy()
    df["LHS_features"] = df["Rule"].apply(extract_lhs_exact)
    df["RHS_label"]    = df["Rule"].apply(parse_rhs_label)

    lhs_split = df["LHS_features"].apply(split_lhs_items_dt).apply(lambda xs: _padN(xs, max_features))
    feat_cols = [f"feature_{i}_lhs" for i in range(1, max_features+1)]
    lhs_df = pd.DataFrame(lhs_split.tolist(), columns=feat_cols)

    keep = [c for c in ["Dataset","Labeling","Feature Encoding","Rule","LHS_features","RHS_label"] if c in df.columns or c in ["LHS_features","RHS_label"]]
    expanded = pd.concat([df[keep].reset_index(drop=True), lhs_df.reset_index(drop=True)], axis=1)

    sort_keys = [c for c in ["Dataset","Labeling","Feature Encoding"] if c in expanded.columns]
    if sort_keys:
        expanded = expanded.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)
    return expanded

# Build the expanded tables (now with up to 15 features)
# The name hints ("DT", "RIPPERk") are only used in the KeyError raised when a
# table lacks a 'Rule' column.
all_rules_dt_expanded       = _expand_df_with_lhs_rhs(all_rules_dt, "DT", max_features=15)
all_rules_ripperk_expanded  = _expand_df_with_lhs_rhs(all_rules_ripperk, "RIPPERk", max_features=15)

# Quick peek
display(all_rules_dt_expanded)
display(all_rules_ripperk_expanded)


## Coverage calculation

In [None]:
# Recompute rule coverage using normalized labeling folder names and add per-group case IDs

# Root folder searched for the per-dataset feature tables (relative path).
BASE_DIR = "3.2_binned_features"

# Map normalized labeling → folder name
# Keys are the canonical labeling values written by _normalize_labeling_column.
LABELING_FOLDER_MAP = {
    "declare": "declare_features",
    "sequential": "sequential_features",
    "payload": "payload_features",
}

def _find_ci_subdir(parent: str, target: str) -> str | None:
    """Case-insensitive subdirectory lookup."""
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    """
    Expected layout:
      {base_dir}/{Dataset}/{declare|sequential|payload}_features/{Encoding}
    Uses case-insensitive matching for the labeling/encoding folders.
    """
    if dataset is None or labeling is None or encoding is None:
        return None

    ds_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(ds_dir):
        return None

    lab_norm = str(labeling).strip().lower()
    lab_folder = LABELING_FOLDER_MAP.get(lab_norm) or f"{lab_norm}_features"

    lab_dir = os.path.join(ds_dir, lab_folder)
    if not os.path.isdir(lab_dir):
        lab_dir = _find_ci_subdir(ds_dir, lab_folder)
        if not lab_dir:
            return None

    enc_exact = os.path.join(lab_dir, str(encoding))
    if os.path.isdir(enc_exact):
        return enc_exact
    return _find_ci_subdir(lab_dir, str(encoding))

# --- Helpers for coverage ----------------------------------------------------

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _norm_numeric(col: pd.Series) -> pd.Series:
    if col.dtype == bool:
        return col.astype(int)
    out = pd.to_numeric(col, errors="coerce")
    if out.isna().all() and col.dtype == object:
        return col
    return out

NUM_SUFFIX_RE = re.compile(r"_(\-?\d+(?:\.\d+)?)$")

def _match_single_feature(df: pd.DataFrame, feat: str) -> pd.Series:
    """
    Match a single feature token against the event-level feature columns:
      - exact one-hot column
      - base_<num> (equality on numeric/string)
      - binned base_(...) or base_[...]
    Returns a boolean mask.
    """
    feat = str(feat).strip().strip('"').strip("'")

    # A) exact one-hot column
    if feat in df.columns:
        col = _norm_numeric(df[feat])
        return (col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")

    # B) base_<num>
    m = NUM_SUFFIX_RE.search(feat)
    if m:
        base_col = feat[:m.start()]
        desired_str = m.group(1)
        desired = float(desired_str)
        if base_col in df.columns:
            col = _norm_numeric(df[base_col])
            if pd.api.types.is_numeric_dtype(col):
                return (col == desired).fillna(False)
            return (col.astype(str) == desired_str).fillna(False)
        # fallback: indicator with suffix
        if feat in df.columns:
            col = _norm_numeric(df[feat])
            return ((col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")).fillna(False)

    # C) binned: base_(...) or base_[...]
    pos1 = feat.rfind("_(")
    pos2 = feat.rfind("_[")
    split_pos = max(pos1, pos2)
    if split_pos != -1:
        base_col = feat[:split_pos]
        bin_val  = feat[split_pos + 1 :]  # includes the bracket
        if base_col in df.columns:
            return (df[base_col].astype(str) == bin_val).fillna(False)

    return pd.Series(False, index=df.index)

def _match_rule(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    """AND all feature matches and enforce RHS label (1=Label, 0=!Label)."""
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:
            mask &= _match_single_feature(df, f)
            if not mask.any():
                break
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

# --- Prepare output frame ----------------------------------------------------

# Guard against running this cell before the CRM expansion cell.
if "all_rules_crm_expanded" not in globals() or not isinstance(all_rules_crm_expanded, pd.DataFrame):
    raise ValueError("Expected all_rules_crm_expanded to be present as a DataFrame.")

# Accept either spelling of the encoding column; 'Encoding' takes precedence.
ENC_COL = "Encoding" if "Encoding" in all_rules_crm_expanded.columns else \
          ("Feature Encoding" if "Feature Encoding" in all_rules_crm_expanded.columns else None)
if ENC_COL is None:
    raise KeyError("Could not find 'Encoding' or 'Feature Encoding' in all_rules_crm_expanded.")

# Result columns start empty / zero; rows whose source table cannot be located
# keep these defaults because their group is skipped below.
crm_rules_all_with_coverage = all_rules_crm_expanded.copy()
crm_rules_all_with_coverage["covered_case_ids"] = [[] for _ in range(len(crm_rules_all_with_coverage))]
crm_rules_all_with_coverage["n_covered_cases"] = 0
# NOTE(review): [[]] * n repeats one list object, and the Series carries a default
# 0..n-1 index, so this relies on the frame having that same index.
crm_rules_all_with_coverage["all_case_ids"] = pd.Series([[]] * len(crm_rules_all_with_coverage), dtype="object")

# --- Compute coverage per (Dataset, Labeling, Encoding) ----------------------

group_cols = [c for c in ["Dataset", "Labeling", ENC_COL] if c in crm_rules_all_with_coverage.columns]

for keys, g in crm_rules_all_with_coverage.groupby(group_cols):
    vals = dict(zip(group_cols, keys))
    ds  = vals.get("Dataset")
    lab = vals.get("Labeling")
    enc = vals.get(ENC_COL)

    # Skip the whole group when its feature folder cannot be resolved.
    enc_path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
    if enc_path is None:
        continue

    # Only the first CSV returned by os.listdir is used.
    # NOTE(review): os.listdir order is not specified; confirm each folder holds one CSV.
    csv_files = [f for f in os.listdir(enc_path) if f.lower().endswith(".csv")]
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])

    df_enc = pd.read_csv(csv_path)
    if "Label" not in df_enc.columns:
        continue
    case_col = _infer_case_col(df_enc)  # raises KeyError if no known case-id column exists

    # Cache all case IDs for this (dataset, labeling, encoding)
    all_ids = df_enc[case_col].dropna().astype(str).unique().tolist()
    crm_rules_all_with_coverage.loc[g.index, "all_case_ids"] = pd.Series(
        [all_ids] * len(g), index=g.index, dtype="object"
    )

    # Gather feature columns in numeric order
    feat_cols = [c for c in crm_rules_all_with_coverage.columns if re.fullmatch(r"feature_\d+_lhs", c)]
    feat_cols = sorted(feat_cols, key=lambda x: int(re.findall(r"\d+", x)[0])) if feat_cols else []

    # Evaluate coverage per rule
    for idx, row in g.iterrows():
        feats = [row.get(c, "") for c in feat_cols]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]  # drop padding / non-text cells
        rhs = row.get("RHS_label", None)

        # Covered cases = distinct case IDs of the rows matching all features and the rule's label.
        mask = _match_rule(df_enc, feats, rhs)
        covered = df_enc.loc[mask, case_col].dropna().astype(str).unique().tolist()

        crm_rules_all_with_coverage.at[idx, "covered_case_ids"] = covered
        crm_rules_all_with_coverage.at[idx, "n_covered_cases"] = len(covered)

# Optional: order for readability
sort_keys = [c for c in ["Dataset", "Labeling", ENC_COL, "n_covered_cases"] if c in crm_rules_all_with_coverage.columns]
if sort_keys:
    crm_rules_all_with_coverage = crm_rules_all_with_coverage.sort_values(by=sort_keys).reset_index(drop=True)

# Last expression of the cell: rendered as the cell output.
crm_rules_all_with_coverage


In [None]:
# Compute per-rule coverage for DT & RIPPERK rules and attach all_case_ids
# Supported feature predicates include:
#   01_HOOFD_011 = 0
#   alternate_precedence:(01_HOOFD_011,01_HOOFD_015):Data <= 0.0
#   monitoringResource|first|literal_binned_(560925.0, 12941730.0] = 0/1
#
# Expected folder layout:
#   3.2_binned_features/{Dataset}/{declare_features|sequential_features|payload_features}/{Encoding}/*.csv

# Same values as in the CRM coverage cell; redefined so this cell can run on its own.
BASE_DIR = "3.2_binned_features"
# Canonical labeling value -> name of its feature folder.
LABELING_FOLDER_MAP = {
    "declare": "declare_features",
    "sequential": "sequential_features",
    "payload": "payload_features",
}

# -------- Path & dataframe utilities ----------------------------------------

def _find_ci_subdir(parent: str, target: str) -> str | None:
    """Return a case-insensitive match for subdirectory `target` inside `parent`."""
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    """
    Resolve:
      {base_dir}/{Dataset}/{declare|sequential|payload}_features/{Encoding}
    Labeling/encoding are matched case-insensitively.
    """
    if dataset is None or labeling is None or encoding is None:
        return None

    ds_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(ds_dir):
        return None

    lab_norm = str(labeling).strip().lower()
    lab_folder = LABELING_FOLDER_MAP.get(lab_norm, f"{lab_norm}_features")

    lab_dir = os.path.join(ds_dir, lab_folder)
    if not os.path.isdir(lab_dir):
        lab_dir = _find_ci_subdir(ds_dir, lab_folder)
        if not lab_dir:
            return None

    enc_exact = os.path.join(lab_dir, str(encoding))
    if os.path.isdir(enc_exact):
        return enc_exact
    return _find_ci_subdir(lab_dir, str(encoding))

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _to_numeric(series: pd.Series) -> pd.Series:
    if series.dtype == bool:
        return series.astype(int)
    return pd.to_numeric(series, errors="coerce")

def _strip_one_layer_quotes(s: str) -> str:
    s = str(s).strip()
    if len(s) >= 2 and s[0] == s[-1] and s[0] in {"'", '"'}:
        return s[1:-1]
    return s

# -------- Feature expression parsing & evaluation ---------------------------

# Pattern: "<col> <op> <val>"  where op ∈ {>=, <=, !=, ==, =, >, <}
_OP_RE = re.compile(r"^(?P<col>.+?)\s*(?P<op>>=|<=|!=|==|=|>|<)\s*(?P<val>.+?)\s*$")

def _parse_feature_expr(expr: str):
    """Return (column, operator, rhs_string) or (None, None, None) if not parsable."""
    s = str(expr).strip()
    m = _OP_RE.match(s)
    if not m:
        return None, None, None
    col = m.group("col").strip()
    op  = "==" if m.group("op") == "=" else m.group("op")
    val_str = _strip_one_layer_quotes(m.group("val"))
    return col, op, val_str

def _coerce_value(val_str: str):
    """Coerce RHS to numeric/bool when possible; otherwise keep as string."""
    low = str(val_str).strip().lower()
    if low in {"true", "false"}:
        return 1 if low == "true" else 0
    try:
        num = float(val_str)
        return int(num) if num.is_integer() else num
    except Exception:
        return val_str

def _cmp_op(series: pd.Series, op: str, rhs):
    """
    Compare series to rhs. Equality works for numeric and string;
    inequalities coerce series to numeric; NaNs evaluate to False.
    """
    if op in ("==", "!="):
        rhs_is_num = isinstance(rhs, (int, float, np.number))
        if rhs_is_num:
            s_num = _to_numeric(series)
            res = (s_num == rhs) if op == "==" else (s_num != rhs)
            if op == "==":
                s_str = series.astype(str).str.strip()
                rhs_str = str(rhs)
                res = res.fillna(s_str == rhs_str)
            else:
                res = res.fillna(True)
            return res.fillna(False)
        else:
            s_str = series.astype(str).str.strip()
            rhs_str = str(rhs).strip()
            return (s_str == rhs_str) if op == "==" else (s_str != rhs_str)

    # Inequalities
    try:
        rhs_num = float(rhs)
    except Exception:
        return pd.Series(False, index=series.index)
    s_num = _to_numeric(series)
    if op == ">":
        return (s_num > rhs_num).fillna(False)
    if op == "<":
        return (s_num < rhs_num).fillna(False)
    if op == ">=":
        return (s_num >= rhs_num).fillna(False)
    if op == "<=":
        return (s_num <= rhs_num).fillna(False)
    return pd.Series(False, index=series.index)

# Splits "<base>_(bin)" / "<base>_[bin)" into the base name and the "_(bin)" suffix.
_BINVAL_SPLIT_RE = re.compile(r"(.+?)(_[(\[][^)\]]+[)\]])$")

def _match_single_feature_dt(df: pd.DataFrame, expr: str) -> pd.Series:
    """
    Evaluate one DT/RIPPERK predicate against ``df``.

    Resolution order for the predicate's column name:
      1. it is a column of ``df`` -> compare that column directly;
      2. it has the binned shape "<base>_(bin)" and equals a column name when
         case is ignored -> compare that column;
      3. it has the binned shape, "<base>" is a column, the operator is
         '==' / '!=' and the value is 0 or 1 -> treat the predicate as a test
         of whether the base column's text equals the bin label.
    Returns a boolean Series on ``df.index``; all False when the predicate
    cannot be parsed or none of the cases applies.
    """
    column, op, raw_value = _parse_feature_expr(expr)
    if column is None:
        return pd.Series(False, index=df.index)

    rhs = _coerce_value(raw_value)

    # 1. Direct column
    if column in df.columns:
        return _cmp_op(df[column], op, rhs).fillna(False)

    # Everything below only applies to binned notation.
    binned = _BINVAL_SPLIT_RE.match(column)
    if binned is None:
        return pd.Series(False, index=df.index)

    # 2. Same column name, different letter case
    by_lower = {name.lower(): name for name in df.columns}
    twin = by_lower.get(column.lower())
    if twin:
        return _cmp_op(df[twin], op, rhs).fillna(False)

    # 3. Base column holds the bin string
    base_col = binned.group(1)
    bin_label = binned.group(2)[1:]  # drop leading underscore
    rhs_is_flag = isinstance(rhs, (int, float, np.number)) and rhs in (0, 1)
    if base_col in df.columns and op in ("==", "!=") and rhs_is_flag:
        in_bin = df[base_col].astype(str) == bin_label
        # "== 1" and "!= 0" ask for the bin; "== 0" and "!= 1" ask for its complement.
        expects_bin = (op == "==") == (rhs == 1)
        return (in_bin if expects_bin else ~in_bin).fillna(False)

    return pd.Series(False, index=df.index)

def _match_rule_dt(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    """Conjoin all feature matches and enforce Label == rhs_label."""
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:
            mask &= _match_single_feature_dt(df, f)
            if not mask.any():
                break
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

def _compute_coverage_for_rules(expanded_df: pd.DataFrame, name_hint: str):
    """
    Compute coverage for a DT/RIPPERK expanded table.
    Adds: covered_case_ids, n_covered_cases, all_case_ids.
    """
    if not isinstance(expanded_df, pd.DataFrame):
        raise ValueError(f"{name_hint}: expanded_df must be a DataFrame.")

    # Encoding column name
    ENC_COL = "Encoding" if "Encoding" in expanded_df.columns else \
              ("Feature Encoding" if "Feature Encoding" in expanded_df.columns else None)
    if ENC_COL is None:
        raise KeyError(f"{name_hint}: Could not find 'Encoding' or 'Feature Encoding'.")

    out = expanded_df.copy()
    out["covered_case_ids"] = [[] for _ in range(len(out))]
    out["n_covered_cases"] = 0
    out["all_case_ids"] = pd.Series([[]] * len(out), dtype="object")

    group_cols = [c for c in ["Dataset", "Labeling", ENC_COL] if c in out.columns]
    feat_cols = [c for c in out.columns if re.fullmatch(r"feature_\d+_lhs", c)]
    feat_cols = sorted(feat_cols, key=lambda x: int(re.findall(r"\d+", x)[0])) if feat_cols else []

    for keys, g in out.groupby(group_cols):
        vals = dict(zip(group_cols, keys))
        ds  = vals.get("Dataset")
        lab = vals.get("Labeling")
        enc = vals.get(ENC_COL)

        enc_path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
        if enc_path is None:
            continue

        csv_files = [f for f in os.listdir(enc_path) if f.lower().endswith(".csv")]
        if not csv_files:
            continue
        csv_path = os.path.join(enc_path, csv_files[0])

        df_enc = pd.read_csv(csv_path)
        if "Label" not in df_enc.columns:
            continue
        case_col = _infer_case_col(df_enc)

        # Cache case IDs once per (dataset, labeling, encoding)
        all_ids = df_enc[case_col].dropna().astype(str).unique().tolist()
        out.loc[g.index, "all_case_ids"] = pd.Series([all_ids] * len(g), index=g.index, dtype="object")

        for idx, row in g.iterrows():
            feats = [row.get(c, "") for c in feat_cols]
            feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
            rhs = row.get("RHS_label", None)

            mask = _match_rule_dt(df_enc, feats, rhs)
            covered = df_enc.loc[mask, case_col].dropna().astype(str).unique().tolist()

            out.at[idx, "covered_case_ids"] = covered
            out.at[idx, "n_covered_cases"] = len(covered)

    sort_keys = [c for c in ["Dataset", "Labeling", ENC_COL, "n_covered_cases"] if c in out.columns]
    if sort_keys:
        out = out.sort_values(by=sort_keys).reset_index(drop=True)
    return out

# ---- Build coverage tables --------------------------------------------------
# Each table is built only if the corresponding expanded frame exists in the
# notebook namespace; otherwise the cell stops with an explanatory error.
if 'all_rules_dt_expanded' in globals():
    all_rules_dt_with_coverage = _compute_coverage_for_rules(all_rules_dt_expanded, "DT")
else:
    raise ValueError("all_rules_dt_expanded not found. Run the expansion cell first.")

if 'all_rules_ripperk_expanded' in globals():
    all_rules_ripperk_with_coverage = _compute_coverage_for_rules(all_rules_ripperk_expanded, "RIPPERK")
else:
    raise ValueError("all_rules_ripperk_expanded not found. Run the expansion cell first.")


In [None]:
# Greedy 80% coverage on positive class (RHS_label == 1) for CRM, DT, RIPPERK

# Fraction of the case universe at which _greedy_set_cover stops adding rules.
TARGET_COVERAGE = 0.80  # 80%

# --- Utilities ---------------------------------------------------------------

def _get_enc_col(df: pd.DataFrame) -> str:
    if "Encoding" in df.columns:
        return "Encoding"
    if "Feature Encoding" in df.columns:
        return "Feature Encoding"
    raise KeyError("Neither 'Encoding' nor 'Feature Encoding' found.")

def _tie_break_score(row: pd.Series) -> tuple:
    """Used when two rules add the same number of new cases (higher is better)."""
    return (
        float(row.get("LB odds ratio", -np.inf)) if pd.notna(row.get("LB odds ratio", np.nan)) else -np.inf,
        float(row.get("Precision", -np.inf)) if pd.notna(row.get("Precision", np.nan)) else -np.inf,
        float(row.get("Recall", -np.inf)) if pd.notna(row.get("Recall", np.nan)) else -np.inf,
        float(row.get("F1", -np.inf)) if pd.notna(row.get("F1", np.nan)) else -np.inf,
        -len(str(row.get("Rule", ""))),
    )

def _greedy_set_cover(group_df: pd.DataFrame, all_cases: set, covered_lists_col: str = "covered_case_ids",
                      max_rules: int | None = None) -> tuple[list, set]:
    """
    Classic greedy set cover on positive rules.
    Returns (selected_rule_indices, covered_cases_set).
    """
    if not len(all_cases):
        return [], set()

    idxs = list(group_df.index)
    rule_sets = {i: set(map(str, group_df.at[i, covered_lists_col])) if isinstance(group_df.at[i, covered_lists_col], list) else set()
                 for i in idxs}

    selected, covered = [], set()
    achievable = set().union(*rule_sets.values()) if rule_sets else set()
    if not achievable:
        return [], set()

    while len(covered) / len(all_cases) < TARGET_COVERAGE:
        best_idx, best_gain, best_tiebreak = None, 0, None

        for i in idxs:
            if i in selected:
                continue
            gain = len(rule_sets[i] - covered)
            if gain > best_gain:
                best_idx, best_gain, best_tiebreak = i, gain, _tie_break_score(group_df.loc[i])
            elif gain == best_gain and gain > 0:
                t = _tie_break_score(group_df.loc[i])
                if best_tiebreak is None or t > best_tiebreak:
                    best_idx, best_tiebreak = i, t

        if best_gain == 0 or best_idx is None:
            break

        selected.append(best_idx)
        covered |= rule_sets[best_idx]

        if max_rules is not None and len(selected) >= max_rules:
            break

    return selected, covered

# Paths (reused in several notebooks)
# Keep values already defined by an earlier cell; only fall back to the
# defaults when the name does not exist yet (referencing it raises NameError).
try:
    BASE_DIR
except NameError:
    BASE_DIR = "3.2_binned_features"

try:
    LABELING_FOLDER_MAP
except NameError:
    # Canonical labeling value -> name of its feature folder.
    LABELING_FOLDER_MAP = {
        "declare": "declare_features",
        "sequential": "sequential_features",
        "payload": "payload_features",
    }

def _find_ci_subdir(parent: str, target: str) -> str | None:
    t = str(target).lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(dataset: str, labeling: str, encoding: str, base_dir: str) -> str | None:
    if dataset is None or labeling is None or encoding is None:
        return None
    ds_dir = os.path.join(base_dir, str(dataset))
    if not os.path.isdir(ds_dir):
        return None
    lab_norm = str(labeling).strip().lower()
    lab_folder = LABELING_FOLDER_MAP.get(lab_norm, f"{lab_norm}_features")
    lab_dir = os.path.join(ds_dir, lab_folder)
    if not os.path.isdir(lab_dir):
        lab_dir = _find_ci_subdir(ds_dir, lab_folder)
        if not lab_dir:
            return None
    enc_exact = os.path.join(lab_dir, str(encoding))
    if os.path.isdir(enc_exact):
        return enc_exact
    return _find_ci_subdir(lab_dir, str(encoding))

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _positive_universe_for_group(ds, lab, enc, group_df: pd.DataFrame, enc_col: str) -> set:
    """
    Return all positive-class case IDs for (ds, lab, enc).

    Primary source: the alphabetically first ``*.csv`` file in the directory
    resolved by `_resolve_enc_path(ds, lab, enc, BASE_DIR)`. Rows whose `Label`
    coerces to the number 1 contribute their case ID, converted to str.

    Fallback: union of `covered_case_ids` of positive rules in `group_df` when the
    source file is unavailable (no directory, no CSV, or no `Label` column). If
    `group_df` has no `RHS_label` column every row is treated as positive; if it
    has no `covered_case_ids` column the result is an empty set.

    Parameters
    ----------
    ds, lab, enc : dataset, labeling and encoding identifying the group.
    group_df : rules of the group; only used by the fallback.
    enc_col : not used by this function; kept so existing callers keep working.

    Raises
    ------
    KeyError
        Propagated from `_infer_case_col` when the CSV has a `Label` column but
        no recognised case-ID column.
    """
    path = _resolve_enc_path(ds, lab, enc, BASE_DIR)
    if path:
        # Sorted so the chosen file does not depend on os.listdir() ordering.
        csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
        if csv_files:
            df_enc = pd.read_csv(os.path.join(path, csv_files[0]))
            if "Label" in df_enc.columns:
                case_col = _infer_case_col(df_enc)
                is_positive = pd.to_numeric(df_enc["Label"], errors="coerce") == 1
                return set(df_enc.loc[is_positive, case_col].dropna().astype(str))

    # Fallback: derive the universe from what the positive rules themselves cover.
    if "covered_case_ids" not in group_df.columns:
        return set()
    if "RHS_label" in group_df.columns:
        pos_lists = group_df.loc[group_df["RHS_label"] == 1, "covered_case_ids"]
    else:
        # Without the column there is nothing to filter on (a scalar default would
        # turn the comparison into a plain bool and break the .loc lookup).
        pos_lists = group_df["covered_case_ids"]
    # Non-list cells (e.g. NaN) carry no case IDs and are skipped.
    return set(itertools.chain.from_iterable(lst for lst in pos_lists if isinstance(lst, list)))

def _summarize_model_pos(df_with_cov: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Greedy selection per (Dataset, Labeling, Encoding) on positive rules.

    For every group the positive rules (``RHS_label == 1``; all rules when the
    column is absent) are passed to `_greedy_set_cover` together with the group's
    positive universe from `_positive_universe_for_group`. Achieved coverage is
    ``len(covered) / len(universe)``, or 0.0 for an empty universe.

    Output columns (always present, even when there are no groups):
      Dataset, Labeling, Encoding, {MODEL}_achieved_coverage, {MODEL}_nr_rules

    Rows are sorted by Dataset, Labeling, Encoding with a stable sort.

    Raises
    ------
    ValueError
        If `df_with_cov` is not a DataFrame.
    """
    if not isinstance(df_with_cov, pd.DataFrame):
        raise ValueError(f"{model_name}: input must be a DataFrame.")

    enc_col = _get_enc_col(df_with_cov)
    coverage_name = f"{model_name}_achieved_coverage"
    nr_rules_name = f"{model_name}_nr_rules"
    out_cols = ["Dataset", "Labeling", "Encoding", coverage_name, nr_rules_name]

    out_rows = []
    group_cols = [c for c in ["Dataset", "Labeling", enc_col] if c in df_with_cov.columns]

    for keys, g in df_with_cov.groupby(group_cols, dropna=False):
        # Older pandas yields a scalar key for a single grouping column; zipping a
        # scalar string would pair the column with its first character.
        if not isinstance(keys, tuple):
            keys = (keys,)
        vals = dict(zip(group_cols, keys))
        ds, lab, enc = vals.get("Dataset"), vals.get("Labeling"), vals.get(enc_col)

        # A missing RHS_label column means every rule counts as positive.
        g_pos = g[g["RHS_label"] == 1] if "RHS_label" in g.columns else g
        pos_all = _positive_universe_for_group(ds, lab, enc, g_pos, enc_col)

        selected, covered = _greedy_set_cover(g_pos, pos_all)
        achieved = (len(covered) / len(pos_all)) if len(pos_all) else 0.0

        out_rows.append({
            "Dataset": ds,
            "Labeling": lab,
            "Encoding": enc,
            coverage_name: round(achieved, 4),
            nr_rules_name: len(selected),
        })

    # Fixing the columns keeps downstream merges on Dataset/Labeling/Encoding
    # working when no group produced a row.
    out = pd.DataFrame(out_rows, columns=out_cols)
    if not out.empty:
        out = out.sort_values(by=["Dataset", "Labeling", "Encoding"], kind="mergesort").reset_index(drop=True)
    return out

# --- Run per-model summaries -------------------------------------------------

# Each summary needs the coverage-annotated frame produced by an earlier cell;
# stop with a pointer to the missing cell rather than a NameError further down.
for _frame_name, _model_tag in (
    ("crm_rules_all_with_coverage", "CRM"),
    ("all_rules_dt_with_coverage", "DT"),
    ("all_rules_ripperk_with_coverage", "RIPPERK"),
):
    if _frame_name not in globals():
        raise RuntimeError(f"{_frame_name} not found. Run the {_model_tag} coverage cell first.")

crm_summary_pos = _summarize_model_pos(crm_rules_all_with_coverage, "CRM")
dt_summary_pos = _summarize_model_pos(all_rules_dt_with_coverage, "DT")
ripperk_summary_pos = _summarize_model_pos(all_rules_ripperk_with_coverage, "RIPPERK")

# --- Merge summaries ---------------------------------------------------------

# Outer joins keep groups that only some of the models produced rules for.
_merge_keys = ["Dataset", "Labeling", "Encoding"]
summary_pos_merged = crm_summary_pos.merge(dt_summary_pos, how="outer", on=_merge_keys)
summary_pos_merged = summary_pos_merged.merge(ripperk_summary_pos, how="outer", on=_merge_keys)

# A model without rules for a group gets coverage 0.0 and a rule count of 0.
_merged_names = set(summary_pos_merged.columns)
for _name in ("CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage"):
    if _name in _merged_names:
        summary_pos_merged[_name] = summary_pos_merged[_name].fillna(0.0)
for _name in ("CRM_nr_rules", "DT_nr_rules", "RIPPERK_nr_rules"):
    if _name in _merged_names:
        summary_pos_merged[_name] = summary_pos_merged[_name].fillna(0).astype(int)

# Final column layout: keys first, then coverage / rule count per model.
desired_cols = [
    "Dataset", "Labeling", "Encoding",
    "CRM_achieved_coverage", "CRM_nr_rules",
    "DT_achieved_coverage", "DT_nr_rules",
    "RIPPERK_achieved_coverage", "RIPPERK_nr_rules",
]
summary_pos_merged = summary_pos_merged.reindex(columns=desired_cols)

summary_pos_merged

In [None]:
# Export positive-class coverage to LaTeX (Traffic only), with 2-decimal coverage values

if not isinstance(globals().get('summary_pos_merged'), pd.DataFrame):
    raise ValueError("summary_pos_merged not found. Run the cell that builds it first.")

df = summary_pos_merged.copy()

# 1) Restrict to the Traffic dataset; the comparison ignores case and padding.
if "Dataset" not in df.columns:
    raise KeyError("Expected a 'Dataset' column in summary_pos_merged.")
mask_traffic = df["Dataset"].astype(str).str.strip().str.lower().eq("traffic")
df = df.loc[mask_traffic].copy()

# 2) Pick the first encoding column name that is present.
enc_col = next(
    (cand for cand in ("Encoding", "Feature Encoding", "encoding", "feature encoding") if cand in df.columns),
    None,
)
if enc_col is None:
    raise KeyError("No encoding column found (looked for 'Encoding' / 'Feature Encoding').")

# 3) Wrap encodings in \detokenize so LaTeX prints them verbatim (the table is
#    written with escape=False below).
df[enc_col] = df[enc_col].astype(str).map(lambda s: rf"\detokenize{{{s}}}")

# 4) Coerce metric columns: coverage stays float, rule counts become int (NaN -> 0).
cov_cols = ["CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage"]
int_cols = ["CRM_nr_rules","DT_nr_rules","RIPPERK_nr_rules"]
for c in (name for name in cov_cols if name in df.columns):
    df[c] = pd.to_numeric(df[c], errors="coerce")
for c in (name for name in int_cols if name in df.columns):
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

def fmt2(x):
    """Format a coverage value with two decimals; missing values become an empty cell."""
    if pd.isna(x):
        return ""
    return f"{float(x):.2f}"

formatters = dict.fromkeys((c for c in cov_cols if c in df.columns), fmt2)

# 5) 'Dataset' is dropped by not listing it; coverage columns come before rule counts.
desired_order = [
    "Labeling", enc_col,
    "CRM_achieved_coverage", "DT_achieved_coverage", "RIPPERK_achieved_coverage",
    "CRM_nr_rules", "DT_nr_rules", "RIPPERK_nr_rules",
]
present = [c for c in desired_order if c in df.columns]
df = df.reindex(columns=present)

# 6) Write the table to disk.
latex_path = "5_analysis/rule_coverage_final.tex"
with open(latex_path, "w", encoding="utf-8") as f:
    f.write(df.to_latex(index=False, escape=False, formatters=formatters))

print(f"LaTeX table written to: {latex_path}")
df


### Filter best performing rules

In [None]:
# Load the combined CRM rule table (comma-separated, which is read_csv's default).
# Alternative single-dataset input, currently disabled:
# all_rules_crm_path = os.path.join('5_analysis', 'random','DHL', 'dhl_features' ,'combined_sorted.csv')
_crm_path_parts = ('5_analysis', 'random', 'combined_sorted_all.csv')
all_rules_crm_path = os.path.join(*_crm_path_parts)
crm_rules = pd.read_csv(all_rules_crm_path)

# DT and RIPPERk rule tables are not needed for the CRM ranking below (disabled):
# all_rules_dt_path = os.path.join('5_analysis', 'dt', 'rules_dt.csv')
# dt_rules = pd.read_csv(all_rules_dt_path, sep=',')

# all_rules_ripperk_path = os.path.join('5_analysis', 'ripperk', 'rules_ripperk.csv')
# ripperk_rules = pd.read_csv(all_rules_ripperk_path, sep=',')

In [None]:
# Keep rules whose RHS is exactly 'Label' (positive class).
# The capture group yields "Label" or "!Label" for rules ending in "--> Label" /
# "--> !Label" and NaN for anything else, so only the positive form passes.
rule_text = crm_rules["Rule"].astype(str)
rhs = rule_text.str.extract(r"-->\s*(!?Label)\s*$", expand=False)
is_positive_rule = rhs == "Label"
crm_rules = crm_rules[is_positive_rule].copy()

print(f"Kept {len(crm_rules)} positive rules (Label).")


### Top 10 overall per labeling

In [None]:
# Configuration
LAB_COL = "Labeling"          # column that defines the ranking groups
METRICS = [                   # every metric is ranked "higher is better"
    "LB odds ratio",
    "Support LHS",
    "Confidence",
    "Lift",
    "Conviction",
]
TOP_N = 10                    # rules kept per labeling
TIE_BREAKERS = ["Support LHS", "LB odds ratio"]  # higher is better

# Validate inputs: the grouping column and all metrics must exist in crm_rules.
missing = [name for name in (LAB_COL, *METRICS) if name not in crm_rules.columns]
if len(missing) > 0:
    raise ValueError(f"Missing expected columns: {missing}")

def _rank_within_group(g: pd.DataFrame, col: str) -> pd.Series:
    """
    Rank a single metric within each labeling group (higher = better).
    NaNs receive the lowest rank (group_size + 1).
    """
    r = g[col].rank(method="dense", ascending=False)
    return r.fillna(len(g) + 1)

# Rank metrics within labeling
# The groups are ranked one by one and concatenated instead of going through
# groupby().apply(): apply() returns a wide DataFrame when there is only a single
# labeling group, and it operates on the grouping column (deprecated in pandas 2.2).
# dropna=False keeps rows with a missing labeling in a group of their own, matching
# the top-N selection further down.
df_ranked = crm_rules.copy()
rank_cols = []
for m in METRICS:
    rcol = f"rank::{m}"
    group_ranks = [
        _rank_within_group(grp, m)
        for _, grp in df_ranked.groupby(LAB_COL, dropna=False)
    ]
    # The concatenated ranks carry the original row index, so the assignment aligns
    # them back to their rows; an empty table simply gets an empty float column.
    df_ranked[rcol] = (
        pd.concat(group_ranks)
        if group_ranks
        else pd.Series(index=df_ranked.index, dtype="float64")
    )
    rank_cols.append(rcol)

# Aggregate ranks (lower = better)
df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# Select top-N per labeling (primary: rank_agg asc; ties: higher tie-breakers)
sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
ascending = [True] + [False] * (len(sort_cols) - 1)

top10_by_labeling = {
    lab: grp.sort_values(sort_cols, ascending=ascending).head(TOP_N).reset_index(drop=True)
    for lab, grp in df_ranked.groupby(LAB_COL, dropna=False)
}

# Convenience handles (None when a labeling has no rules)
top10_declare    = top10_by_labeling.get("declare")
top10_sequential = top10_by_labeling.get("sequential")
top10_payload    = top10_by_labeling.get("payload")

# Preview
for lab, df_top in top10_by_labeling.items():
    print(f"\nTop {TOP_N} for labeling = {lab}")
    display(df_top)


In [None]:
# Output paths
out_dir = Path("5_analysis")
out_path = out_dir / "top10_labeling_tables.tex"
out_dir.mkdir(parents=True, exist_ok=True)

# Numeric columns to format to 2 decimals (if present).
# The ranking cell ranks on "LB odds ratio"; "LB OR" is kept so tables that still
# use the short name are formatted as well.
TARGET_DEC_COLS = ["LB OR", "LB odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]

# Columns that won’t be included in the LaTeX export.
# The per-metric rank columns are derived from TARGET_DEC_COLS so that both
# spellings of the odds-ratio metric ("rank::LB OR", "rank::LB odds ratio") are dropped.
DROP_COLS = {
    "labeling", "Labeling",
    "Odds ratio", "UB OR", "n12", "n21",
    "Fair set count", "Stratified",
    "rank_agg",
} | {f"rank::{metric}" for metric in TARGET_DEC_COLS}

def detok(s: pd.Series) -> pd.Series:
    """
    Wrap strings in \\detokenize{...}; keep NaN as empty.

    Every value is converted with ``str`` and wrapped; empty strings and missing
    values (NaN / None) become "" so they render as empty table cells.
    """
    wrapped = s.astype(str).map(lambda x: rf"\detokenize{{{x}}}" if x != "" else "")
    # astype(str) turns missing values into text such as "nan" or "None", so a later
    # fillna() has nothing left to fill; blank them using the mask of the original
    # Series instead.
    return wrapped.where(s.notna(), "")

def prepare_for_latex(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the frame that is handed to ``to_latex``.

    Columns listed in DROP_COLS are removed, 'Rule' and the encoding column
    ('Encoding', else 'Feature Encoding') are passed through `detok`, and the
    columns named in TARGET_DEC_COLS are coerced to numeric (unparseable -> NaN).
    `None` and empty frames are returned unchanged.
    """
    if df is None or df.shape[0] == 0:
        return df

    keep_mask = ~df.columns.isin(DROP_COLS)
    df2 = df.loc[:, keep_mask].copy()

    # Text columns: the rule itself plus at most one encoding column.
    text_cols = ["Rule"] if "Rule" in df2.columns else []
    encoding_name = next((c for c in ("Encoding", "Feature Encoding") if c in df2.columns), None)
    if encoding_name is not None:
        text_cols.append(encoding_name)
    for name in text_cols:
        df2[name] = detok(df2[name])

    for name in (c for c in TARGET_DEC_COLS if c in df2.columns):
        df2[name] = pd.to_numeric(df2[name], errors="coerce")

    return df2

def df_to_latex_block(df: pd.DataFrame, title: str) -> str:
    """
    Render `df` as a LaTeX tabular preceded by a ``% ===== title =====`` comment.

    Columns named in TARGET_DEC_COLS are printed with two decimals (NaN as an
    empty cell). For `None` or an empty frame only a "(no rows)" comment line
    is returned.
    """
    if df is None or len(df) == 0:
        return f"% {title}: (no rows)\n\n"

    def _two_decimals(x):
        return "" if pd.isna(x) else f"{float(x):.2f}"

    formatters = {col: _two_decimals for col in TARGET_DEC_COLS if col in df.columns}
    table = df.to_latex(index=False, escape=False, formatters=formatters)
    return "".join((f"% ===== {title} =====\n", table, "\n\n"))

# Prepare per-labeling tables
declare_tex_df, sequential_tex_df, payload_tex_df = (
    prepare_for_latex(frame)
    for frame in (top10_declare, top10_sequential, top10_payload)
)

# One LaTeX block per labeling, in the order declare / sequential / payload.
latex_parts = [
    df_to_latex_block(frame, f"Top 10 — {labeling_name}")
    for labeling_name, frame in (
        ("declare", declare_tex_df),
        ("sequential", sequential_tex_df),
        ("payload", payload_tex_df),
    )
]

# Write a single .tex file: header comment followed by the three blocks.
with open(out_path, "w", encoding="utf-8") as f:
    f.write("% Auto-generated top-10 tables per labeling\n\n" + "".join(latex_parts))

print(f"Saved LaTeX tables to: {out_path}")

### DHL top 10

In [None]:
# # Configuration
# METRICS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]
# TOP_N = 20
# TIE_BREAKERS = ["Support LHS", "LB OR"]  # higher is better

# # Validate inputs
# missing = [c for c in METRICS if c not in crm_rules.columns]
# if missing:
#     raise ValueError(f"Missing expected columns: {missing}")

# def _rank_global(df: pd.DataFrame, col: str) -> pd.Series:
#     """
#     Global ranking for a single metric (higher = better).
#     NaNs receive the lowest rank (n + 1).
#     """
#     r = df[col].rank(method="dense", ascending=False)
#     return r.fillna(len(df) + 1)

# # Rank each metric globally
# df_ranked = crm_rules.copy()
# rank_cols = []
# for m in METRICS:
#     rcol = f"rank::{m}"
#     df_ranked[rcol] = _rank_global(df_ranked, m)
#     rank_cols.append(rcol)

# # Aggregate ranks (lower = better)
# df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# # Overall top-N: primary sort by aggregate rank, ties resolved by TIE_BREAKERS (desc)
# sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
# ascending = [True] + [False] * (len(sort_cols) - 1)

# top10_overall = (
#     df_ranked
#     .sort_values(sort_cols, ascending=ascending)
#     .head(TOP_N)
#     .reset_index(drop=True)
# )

# display(top10_overall)


In [None]:
# # Output file
# out_dir = Path("5_analysis")
# out_path = out_dir / "top10_overall.tex"
# out_dir.mkdir(parents=True, exist_ok=True)

# # Numeric columns to format to 2 decimals (if present)
# TARGET_DEC_COLS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]

# # Columns to drop from export
# DROP_COLS = {
#     "labeling", "Labeling",
#     "Odds ratio", "UB OR", "n12", "n21",
#     "Fair set count", "Stratified",
#     "rank::LB OR", "rank::Support LHS", "rank::Confidence",
#     "rank::Lift", "rank::Conviction", "rank_agg",
# }

# def detok(s: pd.Series) -> pd.Series:
#     """Wrap string values in \detokenize{...}; keep NaN as empty."""
#     return s.astype(str).fillna("").map(lambda x: rf"\detokenize{{{x}}}" if x != "" else "")

# def prepare_for_latex(df: pd.DataFrame) -> pd.DataFrame:
#     """
#     Remove non-essential columns, apply \detokenize to text fields,
#     and coerce metric columns to numeric for consistent formatting.
#     """
#     if df is None or len(df) == 0:
#         return df

#     df2 = df.drop(columns=[c for c in df.columns if c in DROP_COLS], errors="ignore").copy()

#     if "Rule" in df2.columns:
#         df2["Rule"] = detok(df2["Rule"])

#     if "Encoding" in df2.columns:
#         df2["Encoding"] = detok(df2["Encoding"])
#     elif "Feature Encoding" in df2.columns:
#         df2["Feature Encoding"] = detok(df2["Feature Encoding"])

#     for col in TARGET_DEC_COLS:
#         if col in df2.columns:
#             df2[col] = pd.to_numeric(df2[col], errors="coerce")

#     # Put Rule first if present
#     cols = list(df2.columns)
#     if "Rule" in cols:
#         df2 = df2[["Rule"] + [c for c in cols if c != "Rule"]]

#     return df2

# def df_to_latex_block(df: pd.DataFrame, title: str) -> str:
#     """Render DataFrame to LaTeX with 2-decimal formatting for target columns."""
#     if df is None or len(df) == 0:
#         return f"% {title}: (no rows)\n\n"

#     formatters = {
#         col: (lambda x: "" if pd.isna(x) else f"{float(x):.2f}")
#         for col in TARGET_DEC_COLS
#         if col in df.columns
#     }

#     return (
#         f"% ===== {title} =====\n"
#         + df.to_latex(index=False, escape=False, formatters=formatters)
#         + "\n\n"
#     )

# # Build LaTeX and write file (expects `top10_overall` from previous step)
# overall_tex_df = prepare_for_latex(top10_overall)
# latex_text = df_to_latex_block(overall_tex_df, "Top 10 — Overall")

# with open(out_path, "w", encoding="utf-8") as f:
#     f.write("% Auto-generated top-10 table (overall)\n\n")
#     f.write(latex_text)

# print(f"Saved LaTeX table to: {out_path}")

### Top 3 per encoding

In [None]:
# # Configuration
# LAB_COL = "labeling"   # normalized values: 'declare', 'sequential', 'payload'
# METRICS = ["LB OR", "Support LHS", "Confidence", "Lift", "Conviction"]
# TOP_PER_ENCODING = 3
# TIE_BREAKERS = ["Support LHS", "LB OR"]  # higher is better

# # Resolve encoding column name
# ENC_CANDIDATES = ["Encoding", "Feature Encoding", "Feature encoding", "encoding", "feature encoding"]
# ENC_COL = next((c for c in ENC_CANDIDATES if c in crm_rules.columns), None)
# if ENC_COL is None:
#     raise ValueError(f"Could not find an encoding column among: {ENC_CANDIDATES}")

# # Validate inputs
# missing = [c for c in [LAB_COL, ENC_COL] + METRICS if c not in crm_rules.columns]
# if missing:
#     raise ValueError(f"Missing expected columns: {missing}")

# def _rank_within_group(g: pd.DataFrame, col: str) -> pd.Series:
#     """Rank one metric within each labeling group (higher = better)."""
#     r = g[col].rank(method="dense", ascending=False)
#     return r.fillna(len(g) + 1)

# # Rank metrics within labeling
# df_ranked = crm_rules.copy()
# rank_cols = []
# for m in METRICS:
#     rcol = f"rank::{m}"
#     df_ranked[rcol] = df_ranked.groupby(LAB_COL, group_keys=False).apply(_rank_within_group, col=m)
#     rank_cols.append(rcol)

# # Aggregate ranks (lower = better; rank 1 is best)
# df_ranked["rank_agg"] = df_ranked[rank_cols].mean(axis=1)

# # Sort keys: primary by aggregate rank (asc), then tie-breakers (desc)
# sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in df_ranked.columns]
# ascending = [True] + [False] * (len(sort_cols) - 1)

# # Select top-K per encoding inside each labeling
# def _topk_per_encoding_within_label(grp: pd.DataFrame) -> pd.DataFrame:
#     srt = grp.sort_values(sort_cols, ascending=ascending)
#     out = srt.groupby(ENC_COL, group_keys=False).head(TOP_PER_ENCODING)
#     return out.sort_values(by=ENC_COL).reset_index(drop=True)

# top_by_labeling = {
#     lab: _topk_per_encoding_within_label(grp).reset_index(drop=True)
#     for lab, grp in df_ranked.groupby(LAB_COL, dropna=False)
# }

# # Convenience handles
# top3_declare    = top_by_labeling.get("declare")
# top3_sequential = top_by_labeling.get("sequential")
# top3_payload    = top_by_labeling.get("payload")

# # Preview
# for lab, df_top in top_by_labeling.items():
#     print(f"\nTop {TOP_PER_ENCODING} per encoding for labeling = {lab}")
#     display(df_top)

In [None]:
# Output paths
# NOTE(review): this is the same file name the earlier top-10 export cell uses, so
# running this cell rewrites that file — confirm whether a separate file was intended.
out_dir = Path("5_analysis")
out_path = out_dir / "top10_labeling_tables.tex"
out_dir.mkdir(parents=True, exist_ok=True)

# Numeric columns to format to two decimals (if present).
# The ranking cell ranks on "LB odds ratio"; "LB OR" is kept so tables that still
# use the short name are formatted as well.
TARGET_DEC_COLS = ["LB OR", "LB odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]

# Columns not needed in the export.
# The per-metric rank columns are derived from TARGET_DEC_COLS so that both
# spellings of the odds-ratio metric ("rank::LB OR", "rank::LB odds ratio") are dropped.
DROP_COLS = {
    "labeling", "Labeling",
    "Odds ratio", "UB OR", "n12", "n21",
    "Fair set count", "Stratified",
    "rank_agg",
} | {f"rank::{metric}" for metric in TARGET_DEC_COLS}

def detok(s: pd.Series) -> pd.Series:
    """
    Wrap strings in \\detokenize{...}; keep NaN as empty.

    Every value is converted with ``str`` and wrapped; empty strings and missing
    values (NaN / None) become "" so they render as empty table cells.
    """
    wrapped = s.astype(str).map(lambda x: rf"\detokenize{{{x}}}" if x != "" else "")
    # astype(str) turns missing values into text such as "nan" or "None", so a later
    # fillna() has nothing left to fill; blank them using the mask of the original
    # Series instead.
    return wrapped.where(s.notna(), "")

def prepare_for_latex(df: pd.DataFrame) -> pd.DataFrame:
    r"""
    Remove non-essential columns, apply \detokenize to text fields,
    and coerce metric columns to numeric for consistent formatting.

    Columns listed in DROP_COLS are removed, 'Rule' and the encoding column
    ('Encoding', else 'Feature Encoding') are passed through `detok`, and the
    columns named in TARGET_DEC_COLS are coerced with ``errors="coerce"``
    (unparseable values become NaN). `None` and empty frames are returned unchanged.

    The docstring is a raw string so the backslash in \detokenize is not read as
    an (invalid) escape sequence.
    """
    if df is None or len(df) == 0:
        return df

    df2 = df.drop(columns=[c for c in df.columns if c in DROP_COLS], errors="ignore").copy()

    if "Rule" in df2.columns:
        df2["Rule"] = detok(df2["Rule"])

    # Only one encoding column is wrapped: 'Encoding' takes precedence.
    if "Encoding" in df2.columns:
        df2["Encoding"] = detok(df2["Encoding"])
    elif "Feature Encoding" in df2.columns:
        df2["Feature Encoding"] = detok(df2["Feature Encoding"])

    for col in TARGET_DEC_COLS:
        if col in df2.columns:
            df2[col] = pd.to_numeric(df2[col], errors="coerce")

    return df2

def df_to_latex_block(df: pd.DataFrame, title: str) -> str:
    """
    Render `df` as a LaTeX tabular preceded by a ``% ===== title =====`` comment.

    Columns named in TARGET_DEC_COLS are printed with two decimals (NaN as an
    empty cell). For `None` or an empty frame only a "(no rows)" comment line
    is returned.
    """
    if df is None or len(df) == 0:
        return f"% {title}: (no rows)\n\n"

    def _two_decimals(x):
        return "" if pd.isna(x) else f"{float(x):.2f}"

    formatters = {col: _two_decimals for col in TARGET_DEC_COLS if col in df.columns}
    table = df.to_latex(index=False, escape=False, formatters=formatters)
    return "".join((f"% ===== {title} =====\n", table, "\n\n"))

# Prepare per-labeling tables
# NOTE(review): this cell exports the top10_* frames again (not a top-3 selection)
# and writes them to out_path — confirm this repeat of the top-10 export is intended.
declare_tex_df, sequential_tex_df, payload_tex_df = (
    prepare_for_latex(frame)
    for frame in (top10_declare, top10_sequential, top10_payload)
)

# One LaTeX block per labeling, in the order declare / sequential / payload.
latex_parts = [
    df_to_latex_block(frame, f"Top 10 — {labeling_name}")
    for labeling_name, frame in (
        ("declare", declare_tex_df),
        ("sequential", sequential_tex_df),
        ("payload", payload_tex_df),
    )
]

# Write a single .tex file: header comment followed by the three blocks.
with open(out_path, "w", encoding="utf-8") as f:
    f.write("% Auto-generated top-10 tables per labeling\n\n" + "".join(latex_parts))

print(f"Saved LaTeX tables to: {out_path}")