## Initialization

In [1]:
import pandas as pd
import os
import glob
import re
import numpy as np
from pathlib import Path
import itertools

## 1. Working on 'combined.csv' (CRM)

### Write out a combined dataframe for every log, every random/ipweights, every encoding, every k0, to 5_analysis

In [2]:
input_root  = '4_output'
output_root = '5_analysis'

for dirpath, dirnames, filenames in os.walk(input_root):
    base = os.path.basename(dirpath)
    if base.startswith('k'):
        csv_pattern = os.path.join(dirpath, '*', '*.csv')
        csv_files = glob.glob(csv_pattern)
        if not csv_files:
            continue

        pieces = []
        for fp in csv_files:
            method_name = os.path.basename(os.path.dirname(fp))
            # 1) read the CSV
            try:
                df = pd.read_csv(fp, sep=';')
            except Exception as e:
                print(f"⚠️ Skipped {fp} because of read error: {e}")
                continue

            # 2) read the single .txt in that same folder
            txt_pattern = os.path.join(os.path.dirname(fp), '*.txt')
            txt_files = glob.glob(txt_pattern)
            if txt_files:
                try:
                    with open(txt_files[0], 'r') as f:
                        txt_val = f.read().strip()
                        # if it's a number, convert; otherwise keep as string
                        try:
                            txt_val = float(txt_val)
                        except ValueError:
                            pass
                except Exception as e:
                    print(f"⚠️ Could not read {txt_files[0]}: {e}")
                    txt_val = None
            else:
                txt_val = None

            # Calculate UB odds ratio from OR and LB
            std_err_or = (np.log(df['Odds ratio']) - np.log(df['LB odds ratio'])) / 1.96
            # Calculate UB odds ratio
            df['UB odds ratio'] = np.exp(np.log(df['Odds ratio']) + 1.96 * std_err_or)

            # 3) tag the DataFrame
            df['Feature Encoding'] = method_name
            df['Filename']         = os.path.basename(fp)
            df['Runtime (seconds)']     = txt_val   # <-- new column

            pieces.append(df)

        # 4) concatenate and write out as before
        combined = pd.concat(pieces, ignore_index=True)
        relative_subpath = os.path.relpath(dirpath, input_root)
        save_dir = os.path.join(output_root, relative_subpath)
        os.makedirs(save_dir, exist_ok=True)
        out_fp = os.path.join(save_dir, 'combined.csv')
        combined.to_csv(out_fp, index=False, sep=',')
        print(f"✅ Saved combined.csv for {base} → {out_fp}")


✅ Saved combined.csv for k3 → 5_analysis/DHL/dhl/dhl_features/k3/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/traffic/traffic_payload_Pay36_features/k3/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/traffic/traffic_decl3_features/k3/combined.csv
✅ Saved combined.csv for k2 → 5_analysis/random/traffic/traffic_decl3_features/k2/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/traffic/traffic_mr_tr_features/k3/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k3/combined.csv
✅ Saved combined.csv for k2 → 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k2/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/BPI15A/BPI15A_decl2_features/k3/combined.csv


  combined = pd.concat(pieces, ignore_index=True)


✅ Saved combined.csv for k3 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k3/combined.csv
✅ Saved combined.csv for k2 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k2/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/sepsis/sepsis_payload2_features/k3/combined.csv


  combined = pd.concat(pieces, ignore_index=True)
  combined = pd.concat(pieces, ignore_index=True)
  combined = pd.concat(pieces, ignore_index=True)


✅ Saved combined.csv for k3 → 5_analysis/random/sepsis/sepsis_mr_tr_features/k3/combined.csv
✅ Saved combined.csv for k3 → 5_analysis/random/sepsis/sepsis_decl_features/k3/combined.csv
✅ Saved combined.csv for k2 → 5_analysis/random/sepsis/sepsis_decl_features/k2/combined.csv


  combined = pd.concat(pieces, ignore_index=True)
  combined = pd.concat(pieces, ignore_index=True)


### Load a single 'combined' from 5_analysis

In [3]:
# # Read one of the combined CSVs (e.g., for random_k1)
# file_path = '5_analysis/random/sepsis/sepsis_decl_features/k2/combined.csv'
# combined = pd.read_csv(file_path, sep=',')

# # Method 1: pop + insert
# cols = combined.columns.tolist()
# # remove “Feature Encoding” from its current position…
# cols.pop(cols.index('Feature Encoding'))
# # …and insert it at the front
# cols.insert(0, 'Feature Encoding')
# # reindex
# combined = combined[cols]

# combined.round(2)

## 2. Working on 'aggregated.csv' (CRM)

In [4]:
# function that helps counting the number of variables in the LHS of a rule
# (e.g., in "['A', 'B', 'C']" it returns 3)
# this is a difficult task because variables can contain commas, apostrophes, and nested brackets

# -----------------------------------------------------------------------------
# Helper to count how many top-level elements are inside the leading [...] in Rule
# -----------------------------------------------------------------------------

def count_lhs_vars(rule: str) -> int:
    text = rule
    start = text.find('[')
    if start < 0:
        return 0

    # find matching ]
    depth = 0
    in_s = in_d = esc = False
    end = None
    for i, ch in enumerate(text[start:], start):
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue

        if in_s:
            if ch == "'":
                in_s = False
            continue
        if in_d:
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            in_s = True
            continue
        if ch == '"':
            in_d = True
            continue

        if ch == '[':
            depth += 1
            continue
        if ch == ']':
            depth -= 1
            if depth == 0:
                end = i
                break

    if end is None:
        return 0

    # split on commas outside of any quotes
    content = text[start+1:end]
    parts = []
    curr = ""
    in_s = in_d = esc = False
    for ch in content:
        if esc:
            curr += ch
            esc = False
            continue
        if ch == '\\':
            curr += ch
            esc = True
            continue

        if in_s:
            curr += ch
            if ch == "'":
                in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            curr += ch
            in_s = True
            continue
        if ch == '"':
            curr += ch
            in_d = True
            continue

        # only split top-level commas
        if ch == ',':
            parts.append(curr.strip())
            curr = ""
        else:
            curr += ch

    # final element
    if curr or content == "":
        parts.append(curr.strip())

    return len(parts)

### automatic

In [5]:
import os
import pandas as pd

# -------------------------------------------------------------------------
# Walk every subfolder of your analysis output and rebuild `aggregated.csv`
# -------------------------------------------------------------------------
input_root = '5_analysis'

for dirpath, dirnames, filenames in os.walk(input_root):
    if 'combined.csv' not in filenames:
        continue

    combined_fp = os.path.join(dirpath, 'combined.csv')
    df = pd.read_csv(combined_fp, sep=',')

    # 1) Flags
    df['is_lb_or_gt1'] = df['LB odds ratio'] > 1
    df['neg_or'] = df['is_lb_or_gt1'] & df['Rule'].str.endswith('!Label')
    df['pos_or'] = df['is_lb_or_gt1'] & ~df['Rule'].str.endswith('!Label')
    df['n_vars'] = df['Rule'].apply(count_lhs_vars)
    for k in (1, 2, 3):
        df[f'k={k}'] = ((df['n_vars'] == k) & df['is_lb_or_gt1']).astype(int)

    # 2) First-seen runtime → minutes
    first_rts = df.groupby('Feature Encoding')['Runtime (seconds)'].first()
    runtime_mins = (first_rts / 60).round(2)

    # 3) Aggregation
    aggregated = (
        df
        .groupby('Feature Encoding')
        .agg(
            **{
                'Total Rules': ('Rule', 'count'),
                'Rules LB OR>1': ('is_lb_or_gt1', 'sum'),
                'Positive Rules': ('pos_or', 'sum'),
                'Negative Rules': ('neg_or', 'sum'),
                **{f'k={k}': (f'k={k}', 'sum') for k in (1, 2, 3)},
            }
        )
        .reset_index()
    )

    # 4) Add runtime columns
    aggregated['Runtime (Minutes)'] = aggregated['Feature Encoding'].map(runtime_mins)
    aggregated['Runtime (Hours)'] = (aggregated['Runtime (Minutes)'] / 60).round(2)

    # 5) Write out
    out_fp = os.path.join(dirpath, 'aggregated.csv')
    aggregated.to_csv(out_fp, index=False, sep=',')

    print(f"✅ Saved aggregated.csv for {dirpath} → {out_fp}")


✅ Saved aggregated.csv for 5_analysis/DHL/dhl/dhl_features/k3 → 5_analysis/DHL/dhl/dhl_features/k3/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/traffic/traffic_payload_Pay36_features/k3 → 5_analysis/random/traffic/traffic_payload_Pay36_features/k3/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/traffic/traffic_decl3_features/k3 → 5_analysis/random/traffic/traffic_decl3_features/k3/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/traffic/traffic_decl3_features/k2 → 5_analysis/random/traffic/traffic_decl3_features/k2/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/traffic/traffic_mr_tr_features/k3 → 5_analysis/random/traffic/traffic_mr_tr_features/k3/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k3 → 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k3/aggregated.csv
✅ Saved aggregated.csv for 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k2 → 5_analysis/random/BPI15A/BPI15A_mr_tr_features/k2/aggre

## 3. Create 'combined_sorted.csv' (CRM)

### Per K

In [6]:
# Root directory containing your experiment subfolders
root_dir = '5_analysis/random/BPI15A/BPI15A_payload_560925_features'
# Pattern to match all combined.csv files recursively
pattern = os.path.join(root_dir, '**', 'combined.csv')

for file_path in glob.glob(pattern, recursive=True):
    save_dir = os.path.dirname(file_path)
    base = os.path.basename(save_dir)
    
    # 1) Load your combined.csv
    combined = pd.read_csv(file_path, sep=',')
    
    # 2) Sort on LB odds ratio descending
    combined = combined.sort_values(by='LB odds ratio', ascending=False)

    # 3) Drop all where LB odds ratio <= 1
    combined = combined[combined['LB odds ratio'] > 1]

    # 4) Round all numeric columns to 3 decimal places
    float_cols = combined.select_dtypes(include=['float64']).columns
    combined[float_cols] = combined[float_cols].round(3)

    # 5) Drop unwanted columns
    combined = combined.drop(columns=['Support RHS', 'Filename', 'Runtime (seconds)'])

    # 6) Move 'Feature Encoding' to second column
    cols = combined.columns.tolist()
    cols.insert(1, cols.pop(cols.index('Feature Encoding')))
    combined = combined[cols]

    # 6.1) Move 'UB odds ratio' to fifth column
    cols = combined.columns.tolist()
    cols.insert(4, cols.pop(cols.index('UB odds ratio')))
    combined = combined[cols]

    # 7) Export to CSV
    out_fp_csv = os.path.join(save_dir, 'combined_sorted.csv')
    combined.to_csv(out_fp_csv, index=False, sep=',')
    print(f"✅ Saved combined_sorted.csv for {base} → {out_fp_csv}")

    # 8) Export to LaTeX
    out_fp_tex = os.path.join(save_dir, 'combined_sorted.tex')
    def fmt_rule(x):
        # wrap in \detokenize{…} so TeX will not parse special chars
        return r'\detokenize{' + str(x) + '}'

    combined.to_latex(
        out_fp_tex,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",
        formatters={
            "Rule": fmt_rule,
            "Feature Encoding": fmt_rule
        }
    )
    print(f"✅ Saved combined_sorted.tex for {base} → {out_fp_tex}")


✅ Saved combined_sorted.csv for k3 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k3/combined_sorted.csv
✅ Saved combined_sorted.tex for k3 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k3/combined_sorted.tex
✅ Saved combined_sorted.csv for k2 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k2/combined_sorted.csv
✅ Saved combined_sorted.tex for k2 → 5_analysis/random/BPI15A/BPI15A_payload_560925_features/k2/combined_sorted.tex


### Per Labelling

In [7]:
import os, glob
import pandas as pd

# Root directory containing your experiment subfolders
root_dir = '5_analysis/DHL/dhl/dhl_features'
# Pattern to match all combined.csv files one level down (e.g., k1/combined.csv, k2/combined.csv)
pattern = os.path.join(root_dir, '*', 'combined.csv')

# Find all matching files
file_paths = glob.glob(pattern)
if not file_paths:
    print(f"🚨 No combined.csv files found under {root_dir}")
else:
    # Read and concatenate all dataframes
    df_list = [pd.read_csv(fp, sep=',') for fp in file_paths]
    combined = pd.concat(df_list, ignore_index=True)

    # --- Rename columns early so the rest uses the new names ---
    rename_map = {
        'Feature Encoding': 'Encoding',
        'LB odds ratio': 'LB OR',
        'UB odds ratio': 'UB OR',
    }
    combined = combined.rename(columns=rename_map)

    # Sort on LB OR descending, then drop all where LB OR <= 1
    combined = combined.sort_values(by='LB OR', ascending=False)
    combined = combined[combined['LB OR'] > 1]

    # Round non-OR float columns to 3 decimals
    float_cols = combined.select_dtypes(include=['float64', 'float32']).columns.tolist()
    or_cols = [c for c in ['LB OR', 'UB OR', 'odds ratio'] if c in combined.columns]
    float_cols_wo_or = [c for c in float_cols if c not in or_cols]
    if float_cols_wo_or:
        combined[float_cols_wo_or] = combined[float_cols_wo_or].round(3)

    # Make OR columns integers (nullable Int64 so it won't crash on NaNs)
    for c in or_cols:
        combined[c] = pd.to_numeric(combined[c], errors='coerce').round(0).astype('Int64')

    # Drop unwanted columns
    drop_cols = [c for c in ['Support RHS', 'Filename', 'Runtime (seconds)'] if c in combined.columns]
    if drop_cols:
        combined = combined.drop(columns=drop_cols)

    # Move 'Encoding' to second column
    cols = combined.columns.tolist()
    if 'Encoding' in cols:
        cols.insert(1, cols.pop(cols.index('Encoding')))
        combined = combined[cols]

    # Move 'UB OR' to fifth column (if present)
    cols = combined.columns.tolist()
    if 'UB OR' in cols:
        cols.insert(4, cols.pop(cols.index('UB OR')))
        combined = combined[cols]

    # Export to CSV at the top level directory
    out_fp_csv = os.path.join(root_dir, 'combined_sorted.csv')
    combined.to_csv(out_fp_csv, index=False, sep=',')
    print(f"✅ Saved combined_sorted.csv → {out_fp_csv}")

    # Export to LaTeX at the top level directory
    out_fp_tex = os.path.join(root_dir, 'combined_sorted.tex')

    def fmt_rule(x):
        # wrap in \detokenize{…} so TeX will not parse special chars
        return r'\detokenize{' + str(x) + '}'

    # drop the columns you don't want (order of the rest is preserved)
    cols_to_drop = ['n12', 'n21', 'Fair set count', 'Stratified']
    cols_keep = [c for c in combined.columns if c not in cols_to_drop]

    combined.to_latex(
        out_fp_tex,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",  # applies only to float columns (OR cols are Int64, so no decimals)
        columns=cols_keep,
        formatters={
            "Rule": fmt_rule,
            "Encoding": fmt_rule,  # updated name
        }
    )
    print(f"✅ Saved combined_sorted.tex → {out_fp_tex}")


✅ Saved combined_sorted.csv → 5_analysis/DHL/dhl/dhl_features/combined_sorted.csv
✅ Saved combined_sorted.tex → 5_analysis/DHL/dhl/dhl_features/combined_sorted.tex


### Per log

In [8]:
import os, glob
import pandas as pd

# Root directory one level up (everything under sepsis)
root_dir = '5_analysis/random/BPI15A'

# Find all combined.csv files (two levels deep; switch to recursive ** if needed)
pattern = os.path.join(root_dir, '*', '*', 'combined.csv')
file_paths = sorted(glob.glob(pattern))

def infer_labeling_from_path(fp: str, base: str) -> str:
    """
    Infer labeling from the *top-level* subfolder under 'sepsis'.
    Mapping:
      - contains 'mr_tr'   -> 'sequential'
      - contains 'decl'    -> 'declare'
      - contains 'payload' -> 'payload'
    """
    rel = os.path.relpath(fp, base)
    parts = rel.split(os.sep)
    top = parts[0].lower() if parts else ""
    if 'mr_tr' in top:
        return 'sequential'
    if 'decl' in top:
        return 'declare'
    if 'payload' in top:
        return 'payload'
    return 'unknown'

if not file_paths:
    print(f"🚨 No combined.csv files found under {root_dir} (looked for {pattern})")
else:
    # Read & tag each df with 'labeling'
    df_list = []
    for fp in file_paths:
        df = pd.read_csv(fp, sep=',')
        labeling = infer_labeling_from_path(fp, root_dir)
        df['labeling'] = labeling
        df_list.append(df)

    combined = pd.concat(df_list, ignore_index=True)

    # --- Rename columns early so the rest uses the new names ---
    rename_map = {
        'Feature Encoding': 'Encoding',
        'LB odds ratio': 'LB OR',
        'UB odds ratio': 'UB OR',
    }
    combined = combined.rename(columns=rename_map)

    # Sort on LB OR descending, then drop all where LB OR <= 1
    if 'LB OR' in combined.columns:
        combined = combined.sort_values(by='LB OR', ascending=False)
        combined = combined[combined['LB OR'] > 1]

    # Round non-OR float columns to 3 decimals
    float_cols = combined.select_dtypes(include=['float64', 'float32']).columns.tolist()
    or_cols = [c for c in ['LB OR', 'UB OR', 'odds ratio'] if c in combined.columns]
    float_cols_wo_or = [c for c in float_cols if c not in or_cols]
    if float_cols_wo_or:
        combined[float_cols_wo_or] = combined[float_cols_wo_or].round(3)

    # Make OR columns integers (nullable Int64 so it won't crash on NaNs)
    for c in or_cols:
        combined[c] = pd.to_numeric(combined[c], errors='coerce').round(0).astype('Int64')

    # Drop unwanted columns
    drop_cols = [c for c in ['Support RHS', 'Filename', 'Runtime (seconds)'] if c in combined.columns]
    if drop_cols:
        combined = combined.drop(columns=drop_cols)

    # Reorder a bit: place 'Encoding' second and keep new 'labeling' near the front
    cols = combined.columns.tolist()
    # ensure labeling is present and near the front
    for desired, idx in [('Encoding', 1), ('labeling', 2)]:
        if desired in cols:
            cols.insert(idx, cols.pop(cols.index(desired)))
    combined = combined[cols]

    # Move 'UB OR' to fifth column (if present)
    cols = combined.columns.tolist()
    if 'UB OR' in cols:
        cols.insert(4, cols.pop(cols.index('UB OR')))
        combined = combined[cols]

    # Export to CSV at the sepsis level
    out_fp_csv = os.path.join(root_dir, 'combined_sorted.csv')
    combined.to_csv(out_fp_csv, index=False, sep=',')
    print(f"✅ Saved combined_sorted.csv → {out_fp_csv}")

    # Export to LaTeX at the sepsis level
    out_fp_tex = os.path.join(root_dir, 'combined_sorted.tex')

    def fmt_rule(x):
        # wrap in \detokenize{…} so TeX will not parse special chars
        return r'\detokenize{' + str(x) + '}'

    cols_to_drop = ['n12', 'n21', 'Fair set count', 'Stratified']
    cols_keep = [c for c in combined.columns if c not in cols_to_drop]

    combined.to_latex(
        out_fp_tex,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",
        columns=cols_keep,
        formatters={
            "Rule": fmt_rule,
            "Encoding": fmt_rule,
        }
    )
    print(f"✅ Saved combined_sorted.tex → {out_fp_tex}")

    # Optional sanity check
    print("Labelings found:", sorted(combined['labeling'].unique()))


✅ Saved combined_sorted.csv → 5_analysis/random/BPI15A/combined_sorted.csv
✅ Saved combined_sorted.tex → 5_analysis/random/BPI15A/combined_sorted.tex
Labelings found: ['declare', 'payload', 'sequential']


### For all crm experiments

In [9]:
root_dir = '5_analysis/random'
pattern = os.path.join(root_dir, '**', 'combined.csv')

file_paths = glob.glob(pattern, recursive=True)

if not file_paths:
    print(f"🚨 No combined.csv files found under {root_dir}")
else:
    df_list = []
    for fp in file_paths:
        df = pd.read_csv(fp, sep=',')

        # sanity check: we now trust 'Feature Encoding' from the file itself
        if 'Feature Encoding' not in df.columns:
            raise ValueError(f"'Feature Encoding' column missing in {fp}")

        # derive Dataset and Labeling from path relative to root_dir
        rel = Path(fp).relative_to(root_dir)
        # expect: <dataset>/<labeling>_features/<...>/combined.csv
        dataset  = rel.parts[0] if len(rel.parts) >= 1 else 'UNKNOWN'
        labeling = rel.parts[1] if len(rel.parts) >= 2 else 'UNKNOWN'
        labeling = labeling.replace('_features', '')

        # add columns (do NOT overwrite 'Feature Encoding')
        df.insert(0, 'Dataset', dataset)
        df.insert(1, 'Labeling', labeling)

        df_list.append(df)

    combined_all = pd.concat(df_list, ignore_index=True)

    # sort and filter as before
    combined_all = combined_all.sort_values(by='LB odds ratio', ascending=False)
    combined_all = combined_all[combined_all['LB odds ratio'] > 1]

    # round only float columns
    float_cols = combined_all.select_dtypes(include=[np.floating]).columns
    combined_all[float_cols] = combined_all[float_cols].round(3)

    # drop unwanted columns if present
    drop_cols = [c for c in ['Support RHS', 'Filename', 'Runtime (seconds)'] if c in combined_all.columns]
    if drop_cols:
        combined_all = combined_all.drop(columns=drop_cols)

    # ensure first three columns are Dataset, Labeling, Feature Encoding (in that order),
    # then keep the rest in their existing order
    cols = list(combined_all.columns)
    for must in ['Dataset', 'Labeling', 'Feature Encoding']:
        if must not in cols:
            raise ValueError(f"Required column '{must}' missing after assembly.")

    rest = [c for c in cols if c not in ['Dataset', 'Labeling', 'Feature Encoding']]
    ordered_cols = ['Dataset', 'Labeling', 'Feature Encoding'] + rest
    combined_all = combined_all[ordered_cols]

    # sort by Dataset, Labeling, Feature Encoding, then LB odds ratio
    combined_all = combined_all.sort_values(
    by=['Dataset', 'Labeling', 'Feature Encoding', 'LB odds ratio'],
    ascending=[True, True, True, False]
    ).reset_index(drop=True)

    # save CSV
    out_fp_csv = os.path.join(root_dir, 'combined_sorted_all.csv')
    combined_all.to_csv(out_fp_csv, index=False)
    print(f"✅ Saved combined_sorted_all.csv → {out_fp_csv}")

    # save LaTeX
    out_fp_tex = os.path.join(root_dir, 'combined_sorted_all.tex')

    def fmt_rule(x):
        return r'\detokenize{' + str(x) + '}'

    combined_all.to_latex(
        out_fp_tex,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",
        formatters={
            "Rule": fmt_rule,
            "Feature Encoding": fmt_rule
        }
    )
    print(f"✅ Saved combined_sorted_all.tex → {out_fp_tex}")

✅ Saved combined_sorted_all.csv → 5_analysis/random/combined_sorted_all.csv
✅ Saved combined_sorted_all.tex → 5_analysis/random/combined_sorted_all.tex


### Latex export for IMPresseD

In [10]:
# # Latex export for IMPresseD results
#     # save LaTeX
# # 1) Keep only the requested columns (in this order)
# cols = ["Dataset", "Labeling", "Rule", "LB odds ratio"]
# missing = [c for c in cols if c not in df.columns]
# if missing:
#     raise KeyError(f"Missing expected column(s): {missing}")
# df_latex = combined_all[cols].copy()

# # 2) Clean 'Labeling' values (case-insensitive):
# #    any value containing 'decl' -> 'declare'
# #    any value containing 'mr_tr' -> 'sequential'
# #    any value containing 'payload' -> 'payload'
# lab_lower = df_latex["Labeling"].astype(str).str.lower()
# conds = [
#     lab_lower.str.contains(r"decl", na=False),
#     lab_lower.str.contains(r"mr_tr", na=False),
#     lab_lower.str.contains(r"payload", na=False),
# ]
# choices = ["declare", "sequential", "payload"]
# df_latex["Labeling"] = np.select(conds, choices, default=df_latex["Labeling"])

# # Ensure numeric for formatting step
# df_latex["LB odds ratio"] = pd.to_numeric(df_latex["LB odds ratio"], errors="coerce")

# def fmt_rule(x):
#     # Keep rules LaTeX-safe
#     return r"\detokenize{" + str(x) + "}"

# def fmt_two_decimals(x):
#     # 3) Format LB odds ratio to max 2 decimals
#     return "" if pd.isna(x) else f"{x:.2f}"

# # Export to LaTeX
# out_fp_tex = os.path.join(root_dir, "combined_sorted_all.tex")
# df_latex.to_latex(
#     out_fp_tex,
#     index=False,
#     escape=False,
#     longtable=True,
#     float_format=None,               # don't apply a global float formatter
#     formatters={
#         "Rule": fmt_rule,
#         "LB odds ratio": fmt_two_decimals,
#     },
# )
# print(f"✅ Saved {os.path.basename(out_fp_tex)} → {out_fp_tex}")

### Load a specific csv (inspection)

In [11]:
# path = os.path.join('5_analysis', 'DHL', 'combined_sorted_all.csv')
# df = pd.read_csv(path, sep=',')
# df.sort_values(
#     by="LB odds ratio",
#     ascending=False
# ).reset_index(drop=True)

In [12]:
# #filter on payload feature encodings
# path = os.path.join('5_analysis', 'random', 'traffic', 'traffic_decl3_features', 'combined_sorted.csv')
# df = pd.read_csv(path, sep=',')
# df = df[df['LB odds ratio'] <= 1]
# df

## 4. Collect results for dt and RipperK experiments

### Collect aggregated metrics

In [13]:
base_dir = Path("4_output")
metrics = ("precision", "recall", "f1", "roc_auc")
# Drop rows where Feature Encoding is 'mr', 'mra', 'tr', or 'tra'
exclude_encodings = ["mr", "mra", "tr", "tra"]

In [14]:
dt_records = []
ripperk_records = []

for cls_dir in base_dir.iterdir():
    if not cls_dir.is_dir():
        continue
    classifier = cls_dir.name

    if classifier == "dt":
        target_list = dt_records
    elif classifier == "ripperk":
        target_list = ripperk_records
    else:
        continue  # skip any other classifiers

    for ds_dir in cls_dir.iterdir():
        if not ds_dir.is_dir():
            continue
        dataset = ds_dir.name

        for lab_dir in ds_dir.iterdir():
            if not lab_dir.is_dir():
                continue
            labeling_raw = lab_dir.name

            # Extract only the middle part (between first and last "_")
            parts = labeling_raw.split("_")
            labeling = parts[1] if len(parts) > 2 else labeling_raw

            for enc_dir in lab_dir.iterdir():
                if not enc_dir.is_dir():
                    continue
                encoding = enc_dir.name

                csv_files = sorted(enc_dir.glob("*.csv"))
                if not csv_files:
                    continue

                try:
                    df = pd.read_csv(csv_files[0])
                except Exception:
                    continue

                # Read runtime from .txt file (if available)
                runtime_file = next(enc_dir.glob("*.txt"), None)
                runtime_seconds = pd.NA
                if runtime_file and runtime_file.is_file():
                    try:
                        with open(runtime_file, "r") as f:
                            val = f.read().strip()
                            runtime_seconds = round(float(val), 2)
                    except Exception:
                        pass

                # Build record
                row = {
                    "Dataset": dataset,
                    "Labeling": labeling,
                    "Feature Encoding": encoding
                }

                if df.empty:
                    for m in metrics:
                        row[m] = pd.NA
                else:
                    for m in metrics:
                        row[m] = round(df[m].iloc[0], 4) if m in df.columns else pd.NA

                # Add runtime as last column
                row["Runtime (Seconds)"] = runtime_seconds

                target_list.append(row)

# Create and sort dataframes
dt_df = pd.DataFrame.from_records(dt_records).sort_values(
    by=["Dataset", "Labeling", "Feature Encoding"]
).reset_index(drop=True)

ripperk_df = pd.DataFrame.from_records(ripperk_records).sort_values(
    by=["Dataset", "Labeling", "Feature Encoding"]
).reset_index(drop=True)

# Drop rows where Feature Encoding is 'mr', 'mra', 'tr', or 'tra'
dt_df = dt_df[~dt_df["Feature Encoding"].isin(exclude_encodings)].reset_index(drop=True)
ripperk_df = ripperk_df[~ripperk_df["Feature Encoding"].isin(exclude_encodings)].reset_index(drop=True)

# --- Normalize Labeling values (substring-based) ---
for _df in (dt_df, ripperk_df):
    if "Labeling" in _df.columns:
        low = _df["Labeling"].astype(str).str.lower()
        _df.loc[low.str.contains("decl", na=False), "Labeling"] = "declare"
        _df.loc[low.str.contains("payload", na=False), "Labeling"] = "payload"
        _df.loc[low.str.contains("mr", na=False), "Labeling"] = "sequential"

dt_df

Unnamed: 0,Dataset,Labeling,Feature Encoding,precision,recall,f1,roc_auc,Runtime (Seconds)
0,BPI15A,declare,baseline,1.0000,1.0000,1.0000,1.0000,0.78
1,BPI15A,declare,bs_data,1.0000,1.0000,1.0000,1.0000,0.84
2,BPI15A,declare,bs_dwd,1.0000,1.0000,1.0000,1.0000,0.66
3,BPI15A,declare,dec_data,1.0000,1.0000,1.0000,1.0000,0.53
4,BPI15A,declare,dec_dwd,1.0000,1.0000,1.0000,1.0000,0.48
...,...,...,...,...,...,...,...,...
133,traffic,payload,hybrid_dwd,1.0000,1.0000,1.0000,1.0000,28.66
134,traffic,payload,hybrid_dwd_data,1.0000,1.0000,1.0000,1.0000,36.36
135,traffic,payload,payload,0.9843,0.9939,0.9891,0.9986,10.23
136,traffic,payload,seq_combined,0.1540,0.9953,0.2668,0.8617,17.06


### Collect aggregated metrics averaged over datasets

In [15]:
def average_over_datasets(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df

    tmp = df.copy()

    # 1) Harmonize labeling names
    labs = tmp["Labeling"].astype(str)
    labs_clean = labs.str.lower().str.replace(r"\s+", "", regex=True)  # e.g., "payload 2" -> "payload2"
    tmp.loc[labs_clean.isin({"decl", "decl2", "decl3"}), "Labeling"] = "decl"
    tmp.loc[labs_clean.isin({"payload", "payload2"}), "Labeling"] = "payload"

    # 2) Ensure numeric
    numeric_cols = [c for c in ["precision", "recall", "f1", "roc_auc", "Runtime (Seconds)"] if c in tmp.columns]
    for c in numeric_cols:
        tmp[c] = pd.to_numeric(tmp[c], errors="coerce")

    # 3) Group and average (NaNs ignored), count contributing datasets
    grouped = (
        tmp.groupby(["Labeling", "Feature Encoding"], as_index=False)
           .agg({**{c: "mean" for c in numeric_cols}, "Dataset": "nunique"})
           .rename(columns={"Dataset": "#Datasets"})
           .sort_values(["Labeling", "Feature Encoding"])
           .reset_index(drop=True)
    )

    # 4) Round
    for c in ["precision", "recall", "f1", "roc_auc"]:
        if c in grouped.columns:
            grouped[c] = grouped[c].round(4)
    if "Runtime (Seconds)" in grouped.columns:
        grouped["Runtime (Seconds)"] = grouped["Runtime (Seconds)"].round(2)

    return grouped

In [16]:
dt_avg = average_over_datasets(dt_df)
ripperk_avg = average_over_datasets(ripperk_df)

dt_avg

Unnamed: 0,Labeling,Feature Encoding,precision,recall,f1,roc_auc,Runtime (Seconds),#Datasets
0,declare,IMPresseD,0.9757,0.9697,0.9723,0.9917,1.22,1
1,declare,baseline,0.8782,0.9991,0.9324,0.9731,2.39,3
2,declare,bs_data,0.8796,0.989,0.9292,0.9793,6.1,3
3,declare,bs_dwd,0.9964,0.9991,0.9977,0.9993,10.66,3
4,declare,dec_data,0.9982,0.9991,0.9986,0.9995,11.32,3
5,declare,dec_dwd,0.9982,0.9991,0.9986,0.9995,14.89,3
6,declare,dec_dwd_data,0.9982,0.9992,0.9987,0.9996,17.29,3
7,declare,declare,0.9982,0.9991,0.9986,0.9995,7.76,3
8,declare,dwd,0.9964,1.0,0.9982,0.9997,9.37,3
9,declare,hybrid,0.9982,0.9991,0.9986,0.9995,16.22,3


In [17]:
def fmt_rule(x):
    # Wrap in \detokenize{…} so TeX won't parse special chars
    return r'\detokenize{' + str(x) + '}'

def save_raw(df, out_dir, stem):
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f'{stem}.csv')
    tex_path = os.path.join(out_dir, f'{stem}.tex')

    df.to_csv(csv_path, index=False, sep=',')
    df.to_latex(
        tex_path,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",
        formatters={"Feature Encoding": fmt_rule}
    )
    print(f"✅ Saved {stem}.csv → {csv_path}")
    print(f"✅ Saved {stem}.tex  → {tex_path}")

def save_avg(df_avg, out_dir, stem):
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f'{stem}.csv')
    tex_path = os.path.join(out_dir, f'{stem}.tex')

    df_avg.to_csv(csv_path, index=False, sep=',')
    fmt4 = (lambda x: f"{x:.4f}" if pd.notna(x) else "")
    fmt2 = (lambda x: f"{x:.2f}" if pd.notna(x) else "")

    df_avg.to_latex(
        tex_path,
        index=False,
        escape=False,
        longtable=True,
        formatters={
            "Feature Encoding": fmt_rule,
            "precision": fmt4, "recall": fmt4, "f1": fmt4, "roc_auc": fmt4,
            "Runtime (Seconds)": fmt2
        }
    )
    print(f"✅ Saved {stem}.csv → {csv_path}")
    print(f"✅ Saved {stem}.tex  → {tex_path}")

# Run for both classifiers
for name, df_raw, df_avg in [
    ("dt", dt_df, dt_avg),
    ("ripperk", ripperk_df, ripperk_avg),
]:
    out_dir = os.path.join('5_analysis', name)
    save_raw(df_raw, out_dir, f'aggregated_sorted_{name}')
    save_avg(df_avg, out_dir, f'aggregated_averaged_{name}')


✅ Saved aggregated_sorted_dt.csv → 5_analysis/dt/aggregated_sorted_dt.csv
✅ Saved aggregated_sorted_dt.tex  → 5_analysis/dt/aggregated_sorted_dt.tex
✅ Saved aggregated_averaged_dt.csv → 5_analysis/dt/aggregated_averaged_dt.csv
✅ Saved aggregated_averaged_dt.tex  → 5_analysis/dt/aggregated_averaged_dt.tex
✅ Saved aggregated_sorted_ripperk.csv → 5_analysis/ripperk/aggregated_sorted_ripperk.csv
✅ Saved aggregated_sorted_ripperk.tex  → 5_analysis/ripperk/aggregated_sorted_ripperk.tex
✅ Saved aggregated_averaged_ripperk.csv → 5_analysis/ripperk/aggregated_averaged_ripperk.csv
✅ Saved aggregated_averaged_ripperk.tex  → 5_analysis/ripperk/aggregated_averaged_ripperk.tex


### Collect aggregated rules

In [18]:
base_dir = Path("4_output")

dt_rule_records = []
ripperk_rule_records = []

for cls_dir in base_dir.iterdir():
    if not cls_dir.is_dir():
        continue
    classifier = cls_dir.name

    # Select which list to append to
    if classifier == "dt":
        target_list = dt_rule_records
    elif classifier == "ripperk":
        target_list = ripperk_rule_records
    else:
        continue  # skip any other classifiers

    # Datasets
    for ds_dir in cls_dir.iterdir():
        if not ds_dir.is_dir():
            continue
        dataset = ds_dir.name

        # Labelings
        for lab_dir in ds_dir.iterdir():
            if not lab_dir.is_dir():
                continue
            labeling_raw = lab_dir.name
            parts = labeling_raw.split("_")
            labeling = parts[1] if len(parts) > 2 else labeling_raw  # keep middle part

            # Encodings
            for enc_dir in lab_dir.iterdir():
                if not enc_dir.is_dir():
                    continue
                encoding = enc_dir.name

                # Read the first CSV found in the encoding folder
                csv_files = sorted(enc_dir.glob("*.csv"))
                if not csv_files:
                    continue

                try:
                    df = pd.read_csv(csv_files[0])
                except Exception:
                    continue

                # Be robust to case differences: look for 'rule' case-insensitively
                lower_map = {c.lower(): c for c in df.columns}
                if "rule" not in lower_map:
                    continue  # no rules in this CSV

                rule_col = lower_map["rule"]
                rules_series = (
                    df[rule_col]
                    .dropna()
                    .astype(str)
                    .str.strip()
                )

                # Skip empties
                rules_series = rules_series[rules_series != ""]

                for rule in rules_series:
                    target_list.append({
                        "Dataset": dataset,
                        "Labeling": labeling,
                        "Feature Encoding": encoding,
                        "Rule": rule
                    })

# Build dataframes (no classifier col), drop duplicates, sort
dt_rules_df = (
    pd.DataFrame.from_records(dt_rule_records)
      .drop_duplicates()
      .sort_values(by=["Dataset", "Labeling", "Feature Encoding", "Rule"])
      .reset_index(drop=True)
)

ripperk_rules_df = (
    pd.DataFrame.from_records(ripperk_rule_records)
      .drop_duplicates()
      .sort_values(by=["Dataset", "Labeling", "Feature Encoding", "Rule"])
      .reset_index(drop=True)
)

# Drop rows where Feature Encoding is 'mr', 'mra', 'tr', or 'tra'
dt_rules_df = dt_rules_df[~dt_rules_df["Feature Encoding"].isin(exclude_encodings)].reset_index(drop=True)
ripperk_rules_df = ripperk_rules_df[~ripperk_rules_df["Feature Encoding"].isin(exclude_encodings)].reset_index(drop=True)

# --- Normalize Labeling values (substring-based) ---
for _df in (dt_rules_df, ripperk_rules_df):
    if "Labeling" in _df.columns:
        low = _df["Labeling"].astype(str).str.lower()
        _df.loc[low.str.contains("decl", na=False), "Labeling"] = "declare"
        _df.loc[low.str.contains("payload", na=False), "Labeling"] = "payload"
        _df.loc[low.str.contains("mr", na=False), "Labeling"] = "sequential"

dt_rules_df

Unnamed: 0,Dataset,Labeling,Feature Encoding,Rule
0,BPI15A,declare,baseline,[01_HOOFD_011 = 0] --> Label
1,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 0] --> Label
2,BPI15A,declare,baseline,[01_HOOFD_011 = 1 ∧ 01_HOOFD_099 = 1] --> Label
3,BPI15A,declare,bs_data,[01_HOOFD_011 = 0] --> Label
4,BPI15A,declare,bs_data,[01_HOOFD_011 = 1 ∧ 01_HOOFD_494a = 0] --> Label
...,...,...,...,...
2059,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2060,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2061,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...
2062,traffic,payload,seq_combined_data,[paymentAmount|first|continuous_binned_(-0.001...


In [19]:
def fmt_detok(x):
    # Prevent TeX from parsing special chars
    return r'\detokenize{' + str(x) + '}'

def save_rules(df, classifier):
    out_dir = os.path.join('5_analysis', classifier)
    os.makedirs(out_dir, exist_ok=True)

    stem = f'rules_{classifier}'
    csv_path = os.path.join(out_dir, f'{stem}.csv')
    tex_path = os.path.join(out_dir, f'{stem}.tex')

    # CSV
    df.to_csv(csv_path, index=False, sep=',')
    # LaTeX
    df.to_latex(
        tex_path,
        index=False,
        escape=False,
        longtable=True,
        formatters={
            "Feature Encoding": fmt_detok,
            "Rule": fmt_detok
        }
    )
    print(f"✅ Saved {stem}.csv → {csv_path}")
    print(f"✅ Saved {stem}.tex  → {tex_path}")

# Run for both classifiers
for name, df in [
    ("dt", dt_rules_df),
    ("ripperk", ripperk_rules_df),
]:
    save_rules(df, name)


✅ Saved rules_dt.csv → 5_analysis/dt/rules_dt.csv
✅ Saved rules_dt.tex  → 5_analysis/dt/rules_dt.tex
✅ Saved rules_ripperk.csv → 5_analysis/ripperk/rules_ripperk.csv
✅ Saved rules_ripperk.tex  → 5_analysis/ripperk/rules_ripperk.tex


## 5. Comparison of Baseline with CRM

In [20]:
# all_rules_crm_path = os.path.join('5_analysis', 'random', 'combined_sorted_all.csv')
# all_rules_crm = pd.read_csv(all_rules_crm_path, sep=',')

all_rules_crm_path = os.path.join('5_analysis', 'DHL', 'combined_sorted_all.csv')
all_rules_crm = pd.read_csv(all_rules_crm_path, sep=',')

all_rules_dt_path = os.path.join('5_analysis', 'dt', 'rules_dt.csv')
all_rules_dt = pd.read_csv(all_rules_dt_path, sep=',')

all_metrics_dt_path = os.path.join('5_analysis', 'dt', 'aggregated_sorted_dt.csv')
all_metrics_dt = pd.read_csv(all_metrics_dt_path, sep=',')

all_rules_ripperk_path = os.path.join('5_analysis', 'ripperk', 'rules_ripperk.csv')
all_rules_ripperk = pd.read_csv(all_rules_ripperk_path, sep=',')

all_metrics_ripperk_path = os.path.join('5_analysis', 'ripperk', 'aggregated_sorted_ripperk.csv')
all_metrics_ripperk = pd.read_csv(all_metrics_ripperk_path, sep=',')

In [21]:
# --- 0) Helpers --------------------------------------------------------------

def _harmonize_and_filter(df: pd.DataFrame) -> pd.DataFrame:
    # Normalize encoding column name and drop excluded encodings
    out = df.rename(columns={
        'Feature Encoding': 'Encoding',
        'Feature encoding': 'Encoding'
    }).copy()
    if 'Encoding' in out.columns:
        enc_norm = out['Encoding'].astype(str).str.strip().str.lower()
        out = out[~enc_norm.isin(exclude_encodings)].copy()
    return out

def _rule_counts(df: pd.DataFrame, name: str) -> pd.DataFrame:
    # Count rules per (Dataset, Labeling, Encoding)
    return (
        df.groupby(['Dataset', 'Labeling', 'Encoding'], as_index=False)
          .agg(**{f'{name}_rule_count': ('Rule', 'count')})
    )

def _merge_with_crm(left_counts: pd.DataFrame, crm_counts: pd.DataFrame) -> pd.DataFrame:
    # left_counts: dt or ripperk counts; merge with CRM counts
    left_col = next(c for c in left_counts.columns if c.endswith('_rule_count') and c != 'crm_rule_count')
    merged = (
        pd.merge(left_counts, crm_counts, on=['Dataset', 'Labeling', 'Encoding'], how='outer')
          .fillna({left_col: 0, 'crm_rule_count': 0})
          .astype({left_col: int, 'crm_rule_count': int})
          .sort_values(by=['Dataset', 'Labeling', 'Encoding'])
          .reset_index(drop=True)
    )
    return merged

# --- 1) Harmonize + filter inputs -------------------------------------------

rules_crm_f      = _harmonize_and_filter(all_rules_crm)
rules_dt_f       = _harmonize_and_filter(all_rules_dt)
rules_ripperk_f  = _harmonize_and_filter(all_rules_ripperk)

# --- 2) Counts and initial comparisons --------------------------------------

crm_counts      = _rule_counts(rules_crm_f,     'crm')
dt_counts       = _rule_counts(rules_dt_f,      'dt')
ripperk_counts  = _rule_counts(rules_ripperk_f, 'ripperk')

dt_comparison       = _merge_with_crm(dt_counts, crm_counts)
ripperk_comparison  = _merge_with_crm(ripperk_counts, crm_counts)

# Optional: consistent column order
_dt_cols = ['Dataset', 'Labeling', 'Encoding', 'dt_rule_count', 'crm_rule_count']
_rk_cols = ['Dataset', 'Labeling', 'Encoding', 'ripperk_rule_count', 'crm_rule_count']
dt_comparison = dt_comparison.reindex(columns=[c for c in _dt_cols if c in dt_comparison.columns])
ripperk_comparison = ripperk_comparison.reindex(columns=[c for c in _rk_cols if c in ripperk_comparison.columns])

# --- 3) CRM summary stats (LB odds ratio > 1) and merge into both -----------

# Ensure numeric types for needed CRM metrics
crm_stats_df = rules_crm_f.copy()
for col in ['LB odds ratio', 'Confidence', 'Support LHS']:
    crm_stats_df[col] = pd.to_numeric(crm_stats_df[col], errors='coerce')

# Filter to "interesting" CRM rules, then aggregate
crm_filtered = crm_stats_df[crm_stats_df['LB odds ratio'] > 1].copy()

crm_all_agg = (
    crm_filtered
    .groupby(['Dataset', 'Labeling', 'Encoding'], as_index=False)
    .agg(
        crm_conf_median=('Confidence', 'median'),
        crm_conf_max=('Confidence', 'max'),
        crm_support_lhs_median=('Support LHS', 'median'),
        crm_support_lhs_max=('Support LHS', 'max')
    )
)

# Merge aggregated CRM metrics into both comparisons
dt_comparison = dt_comparison.merge(
    crm_all_agg, on=['Dataset', 'Labeling', 'Encoding'], how='left'
)
ripperk_comparison = ripperk_comparison.merge(
    crm_all_agg, on=['Dataset', 'Labeling', 'Encoding'], how='left'
)

In [22]:
# --- Robustly attach precision/recall, then rename/reorder/round for DT & RIPPERk ---

# 0) Prep: harmonize 'Encoding' for both metrics tables
dt_metrics = all_metrics_dt.copy()
if 'Encoding' not in dt_metrics.columns and 'Feature Encoding' in dt_metrics.columns:
    dt_metrics = dt_metrics.rename(columns={'Feature Encoding': 'Encoding'})

rk_metrics = all_metrics_ripperk.copy()
if 'Encoding' not in rk_metrics.columns and 'Feature Encoding' in rk_metrics.columns:
    rk_metrics = rk_metrics.rename(columns={'Feature Encoding': 'Encoding'})

# Build a set of known dataset-prefix tokens to strip from Labeling (case-insensitive)
def _collect_prefixes(*dfs) -> set:
    prefixes = set()
    for df in dfs:
        if df is None or not isinstance(df, pd.DataFrame):
            continue
        if 'Dataset' in df.columns:
            s = pd.Series(df['Dataset']).astype(str).str.strip().str.lower()
            prefixes.update(s.str.replace(r'\s+', '', regex=True).unique().tolist())
    # Add common aliases you might encounter
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != 'nan'}

KNOWN_PREFIXES = _collect_prefixes(dt_comparison, ripperk_comparison, dt_metrics, rk_metrics)

def _normalize_side(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Harmonize column names and coerce to clean strings
    out = out.rename(columns={'Feature encoding': 'Encoding', 'Feature Encoding': 'Encoding'})
    for c in ['Dataset', 'Labeling', 'Encoding']:
        if c in out.columns:
            out[c] = out[c].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()

    def norm_dataset(s: str) -> str:
        return str(s).strip().lower()

    def norm_encoding(s: str) -> str:
        return str(s).strip().lower()

    def norm_labeling(lbl: str) -> str:
        s = str(lbl).strip().lower()

        # Drop suffixes like "_feature" / "_features"
        s = re.sub(r'(_features?)$', '', s)

        # Normalize whitespace/underscores
        s = s.replace(' ', '_')

        # Repeatedly strip known dataset prefixes (you already have KNOWN_PREFIXES)
        changed = True
        while changed:
            changed = False
            for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
                if s.startswith(p + '_'):
                    s = s[len(p) + 1:]
                    changed = True

        # --- Canonicalize labeling families so merge keys match across tables ---
        # payload, payload2, payload_*, etc. -> "payload"
        if re.search(r'\bpayload\b|\bpayload\d+\b', s):
            return 'payload'

        # decl, decl2, decl3, declare, declare2, etc. -> "declare"
        if re.fullmatch(r'(decl(are)?\d*)', s):
            return 'declare'

        # anything containing token "mr" (e.g., mr, mr_tr, mr-anything) -> "sequential"
        # use token-ish boundaries to avoid accidental matches inside other words
        if re.search(r'(^|[^a-z])mr([^a-z]|$)', s):
            return 'sequential'

        # If none matched, return cleaned label
        return s

    out['Dataset_norm']  = out['Dataset'].apply(norm_dataset)   if 'Dataset' in out.columns else ''
    out['Encoding_norm'] = out['Encoding'].apply(norm_encoding) if 'Encoding' in out.columns else ''
    out['Labeling_norm'] = out['Labeling'].apply(norm_labeling) if 'Labeling' in out.columns else ''
    out['__merge_key__'] = (out['Dataset_norm'].astype(str) + '|' +
                            out['Labeling_norm'].astype(str) + '|' +
                            out['Encoding_norm'].astype(str))
    return out

def _find_col(df: pd.DataFrame, candidates):
    cand_norm = [c.casefold() for c in candidates]
    for col in df.columns:
        if col.casefold() in cand_norm:
            return col
    return None

def _attach_metrics(comp_df: pd.DataFrame, metrics_df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    # Normalize both sides
    comp_n = _normalize_side(comp_df)
    metr_n = _normalize_side(metrics_df)

    # Locate metric columns (case-insensitive)
    prec_col = _find_col(metr_n, ['precision', 'prec'])
    rec_col  = _find_col(metr_n, ['recall', 'rec'])
    if prec_col is None or rec_col is None:
        raise ValueError(
            f"Could not locate precision/recall in metrics for {prefix}. "
            f"Columns present: {list(metrics_df.columns)}"
        )

    # Keep only key + metrics, rename to prefixed names, and make numeric
    subset = (
        metr_n[['__merge_key__', prec_col, rec_col]]
        .rename(columns={prec_col: f'{prefix}_precision', rec_col: f'{prefix}_recall'})
    )
    subset[f'{prefix}_precision'] = pd.to_numeric(subset[f'{prefix}_precision'], errors='coerce')
    subset[f'{prefix}_recall']    = pd.to_numeric(subset[f'{prefix}_recall'],    errors='coerce')

    # Aggregate (mean) by merge key in case of duplicates
    subset = (
        subset.groupby('__merge_key__', as_index=False)
              .agg({f'{prefix}_precision': 'mean', f'{prefix}_recall': 'mean'})
    )

    # Merge back onto the comparison df (keeping original row order/cols)
    merged = comp_n[['__merge_key__']].merge(subset, on='__merge_key__', how='left')
    out = comp_df.copy()
    out[f'{prefix}_precision'] = pd.to_numeric(merged[f'{prefix}_precision'], errors='coerce').round(3)
    out[f'{prefix}_recall']    = pd.to_numeric(merged[f'{prefix}_recall'],    errors='coerce').round(3)
    return out

# 1) Attach metrics to both comparisons
dt_comparison       = _attach_metrics(dt_comparison, dt_metrics, prefix='dt')
ripperk_comparison  = _attach_metrics(ripperk_comparison, rk_metrics, prefix='ripperk')

# 2) Rename, reorder, round (column-wise to avoid shape mismatch)
def _rename_reorder_round(df: pd.DataFrame, rename_map: dict, final_order: list) -> pd.DataFrame:
    out = df.rename(columns=rename_map).copy()

    # Reorder columns (keep any others at the end)
    ordered = [c for c in final_order if c in out.columns]
    tail    = [c for c in out.columns if c not in ordered]
    out = out[ordered + tail]

    # Identify count columns (by name suffix " Rules" after renaming)
    count_cols = [c for c in out.columns if c.endswith(' Rules')]

    # Ensure counts are Int64 and *not* rounded
    for c in count_cols:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors='coerce').astype('Int64')

    # Round all other numeric columns to 3 d.p., one-by-one to avoid assignment shape issues
    for c in out.columns:
        if c in count_cols:
            continue
        if pd.api.types.is_numeric_dtype(out[c]):
            out[c] = pd.to_numeric(out[c], errors='coerce').round(3)

    return out

# DT rename/reorder
dt_rename_map = {
    'dt_rule_count': 'DT Rules',
    'crm_rule_count': 'CRM Rules',
    'crm_conf_median': 'CRM Confidence Median',
    'crm_conf_max': 'CRM Confidence Max',
    'dt_precision': 'DT Precision',
    'dt_recall': 'DT Recall',
    'crm_support_lhs_median': 'CRM LHS Support Median',
    'crm_support_lhs_max': 'CRM LHS Support Max'
}
dt_final_order = [
    'Dataset', 'Labeling', 'Encoding',
    'DT Rules', 'CRM Rules',
    'CRM Confidence Median', 'CRM Confidence Max',
    'DT Precision',
    'CRM LHS Support Median', 'CRM LHS Support Max',
    'DT Recall'
]
dt_comparison = _rename_reorder_round(dt_comparison, dt_rename_map, dt_final_order)

# RIPPERk rename/reorder
rk_rename_map = {
    'ripperk_rule_count': 'RIPPERk Rules',
    'crm_rule_count': 'CRM Rules',
    'crm_conf_median': 'CRM Confidence Median',
    'crm_conf_max': 'CRM Confidence Max',
    'ripperk_precision': 'RIPPERk Precision',
    'ripperk_recall': 'RIPPERk Recall',
    'crm_support_lhs_median': 'CRM LHS Support Median',
    'crm_support_lhs_max': 'CRM LHS Support Max'
}
rk_final_order = [
    'Dataset', 'Labeling', 'Encoding',
    'RIPPERk Rules', 'CRM Rules',
    'CRM Confidence Median', 'CRM Confidence Max',
    'RIPPERk Precision',
    'CRM LHS Support Median', 'CRM LHS Support Max',
    'RIPPERk Recall'
]
ripperk_comparison = _rename_reorder_round(ripperk_comparison, rk_rename_map, rk_final_order)

# --- Normalize Labeling values (substring-based) ---
for _df in (dt_comparison, ripperk_comparison):
    if "Labeling" in _df.columns:
        low = _df["Labeling"].astype(str).str.lower()
        _df.loc[low.str.contains("decl", na=False), "Labeling"] = "declare"
        _df.loc[low.str.contains("payload", na=False), "Labeling"] = "payload"
        _df.loc[low.str.contains("mr", na=False), "Labeling"] = "sequential"


# (Optionally) display
dt_comparison
# ripperk_comparison

Unnamed: 0,Dataset,Labeling,Encoding,DT Rules,CRM Rules,CRM Confidence Median,CRM Confidence Max,DT Precision,CRM LHS Support Median,CRM LHS Support Max,DT Recall
0,BPI15A,declare,baseline,3,0,,,1.000,,,1.000
1,BPI15A,declare,bs_data,3,0,,,1.000,,,1.000
2,BPI15A,declare,bs_dwd,3,0,,,1.000,,,1.000
3,BPI15A,declare,dec_data,3,0,,,1.000,,,1.000
4,BPI15A,declare,dec_dwd,3,0,,,1.000,,,1.000
...,...,...,...,...,...,...,...,...,...,...,...
148,traffic,sequential,hybrid_dwd,2,0,,,1.000,,,1.000
149,traffic,sequential,hybrid_dwd_data,2,0,,,1.000,,,1.000
150,traffic,sequential,payload,133,0,,,0.811,,,0.968
151,traffic,sequential,seq_combined,33,0,,,0.986,,,1.000


## 6. Calculating Rule Redundancy

### 6.1. Subsumption-Based Redundancy (Structural Overlap)

In [23]:
# ---------- CRM rules → expand LHS features (excluding MR/MRA/TR/TRA encodings) ----------
# Fallback in case it's not defined earlier
try:
    exclude_encodings
except NameError:
    exclude_encodings = ["mr", "mra", "tr", "tra"]

# ---------- 0) Load rules ----------
if 'all_rules_crm' in globals() and isinstance(all_rules_crm, pd.DataFrame):
    crm_df = all_rules_crm.copy()
else:
    raise ValueError("all_rules_crm is not available as a DataFrame.")

# Harmonize encoding column name
if 'Encoding' not in crm_df.columns and 'Feature Encoding' in crm_df.columns:
    crm_df = crm_df.rename(columns={'Feature Encoding': 'Encoding'})

# Exclude encodings (case/whitespace-insensitive)
if 'Encoding' in crm_df.columns:
    excl = {e.lower().strip() for e in exclude_encodings}
    enc_norm = crm_df['Encoding'].astype(str).str.strip().str.lower()
    crm_df = crm_df[~enc_norm.isin(excl)].copy()

# Ensure Odds ratio is numeric and filter OR > 1
crm_df['Odds ratio'] = pd.to_numeric(crm_df['Odds ratio'], errors='coerce')
crm_df = crm_df[crm_df['Odds ratio'] > 1].copy()

# ---------- Normalize Labeling (mirror your other DFs; robust to payload_* etc.) ----------
if 'Labeling' in crm_df.columns:
    # Build KNOWN_PREFIXES once
    try:
        KNOWN_PREFIXES
    except NameError:
        KNOWN_PREFIXES = set(
            crm_df.get('Dataset', pd.Series([], dtype=str))
                  .astype(str).str.strip().str.lower()
                  .str.replace(r'\s+', '', regex=True)
                  .unique().tolist()
        ) | {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}

    def _strip_prefix_suffix(x: str) -> str:
        s = str(x)
        s = re.sub(r'(_features?)$', '', s, flags=re.I)  # drop trailing "_features"
        s = s.strip().lower().replace(' ', '_')
        # remove known dataset prefixes repeatedly
        changed = True
        while changed:
            changed = False
            for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
                if s.startswith(p + '_'):
                    s = s[len(p) + 1:]
                    changed = True
        return s

    # 1) If a helper exists, use it to get a first pass; else, strip suffixes/prefixes ourselves
    base_series = crm_df['Labeling']
    if '_normalize_side' in globals() and callable(globals()['_normalize_side']):
        try:
            tmp = _normalize_side(crm_df)
            if isinstance(tmp, pd.DataFrame):
                if 'Labeling_norm' in tmp.columns:
                    base_series = tmp['Labeling_norm']
                elif 'Labeling' in tmp.columns:
                    base_series = tmp['Labeling']
        except Exception:
            pass  # fall back to original base_series

    crm_df['Labeling'] = base_series.apply(_strip_prefix_suffix)

    # 2) Final override mapping — exactly like your other DataFrames
    low = crm_df['Labeling'].astype(str).str.lower()
    crm_df.loc[low.str.contains('decl', na=False),    'Labeling'] = 'declare'
    crm_df.loc[low.str.contains('payload', na=False), 'Labeling'] = 'payload'      # catches "payload_pay36"
    crm_df.loc[low.str.contains('mr', na=False),      'Labeling'] = 'sequential'

# ---------- 1) Extract exact LHS and RHS ----------
def extract_lhs_exact(rule_str: str) -> str:
    """Everything before the arrow '-->' (preserve quotes/brackets exactly)."""
    m = re.search(r"^(.*?)(?=\s*-->)", str(rule_str))
    return m.group(1) if m else str(rule_str)

def parse_rhs_label(rule_str: str):
    """Return 1 for 'Label', 0 for '!Label', or None if not found."""
    m = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if not m:
        return None
    return 1 if m.group(1) == "Label" else 0

crm_df['LHS_features'] = crm_df['Rule'].apply(extract_lhs_exact)
crm_df['RHS_label']    = crm_df['Rule'].apply(parse_rhs_label)

# ---------- 2) Split LHS into up to 3 features ----------
def _find_outer_brackets_span(text: str):
    """Return (start_idx, end_idx) of the outermost [...] in `text`."""
    s = str(text)
    start = s.find('[')
    if start < 0:
        return None, None

    depth = 0
    in_s = in_d = esc = False
    end = None
    for i, ch in enumerate(s[start:], start):
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue

        if in_s:
            if ch == "'":
                in_s = False
            continue
        if in_d:
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            in_s = True
            continue
        if ch == '"':
            in_d = True
            continue

        if ch == '[':
            depth += 1
            continue
        if ch == ']':
            depth -= 1
            if depth == 0:
                end = i
                break
    return (start, end)

def _split_top_level_commas(content: str):
    """Split `content` on commas that are outside quotes."""
    parts, curr = [], ""
    in_s = in_d = esc = False
    for ch in content:
        if esc:
            curr += ch
            esc = False
            continue
        if ch == '\\':
            curr += ch
            esc = True
            continue

        if in_s:
            curr += ch
            if ch == "'":
                in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            curr += ch
            in_s = True
            continue
        if ch == '"':
            curr += ch
            in_d = True
            continue

        if ch == ',':
            parts.append(curr.strip())
            curr = ""
        else:
            curr += ch
    parts.append(curr.strip())
    return parts

def _strip_one_layer_quotes(s: str):
    """Remove a single layer of outer quotes if present; keep inner brackets intact."""
    s = s.strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

def split_lhs_items(lhs_text: str):
    """
    lhs_text is exactly what's before '-->', e.g. "['A', 'B', 'C']" or "['A']".
    Return list like ['A','B','C'] (no outer quotes/brackets).
    """
    s = str(lhs_text)
    start, end = _find_outer_brackets_span(s)
    if start is None or end is None:
        return []
    inner = s[start+1:end]  # inside [...]
    raw_items = _split_top_level_commas(inner)
    return [_strip_one_layer_quotes(x).strip() for x in raw_items if x != ""]

def _pad3(items):
    items = items[:3]
    return items + [""] * (3 - len(items))

lhs_split = crm_df['LHS_features'].apply(split_lhs_items).apply(_pad3)
lhs_df = pd.DataFrame(lhs_split.tolist(), columns=['feature_1_lhs','feature_2_lhs','feature_3_lhs'])

# ---------- 3) Final table ----------
crm_rules_all_expanded = pd.concat(
    [crm_df[['Dataset','Labeling','Encoding','Rule','LHS_features','RHS_label']].reset_index(drop=True),
     lhs_df.reset_index(drop=True)],
    axis=1
).reset_index(drop=True)

# Optional: sort for readability
crm_rules_all_expanded = crm_rules_all_expanded.sort_values(
    by=['Dataset','Labeling','Encoding'], ascending=True
).reset_index(drop=True)

# Show result
crm_rules_all_expanded

Unnamed: 0,Dataset,Labeling,Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs
0,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...","['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]']",1,"ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]",,
1,dhl,dhl,baseline,"['PALLET_CREATED_binned_(1.0, 199.0]'] --> Label","['PALLET_CREATED_binned_(1.0, 199.0]']",1,"PALLET_CREATED_binned_(1.0, 199.0]",,
2,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","PALLET_CREATED_binned_(-0.001, 1.0]",
3,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","SHIPUNIT_BUILT_binned_(-0.001, 1.0]",
4,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","PACKED_binned_(-0.001, 1.0]",
...,...,...,...,...,...,...,...,...,...
824,dhl,dhl,seq_combined_data,"['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....","['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....",1,"mr[ORDER_CLOSED-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...
825,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",1,"mr[ORDER_GATEWAY-complete, ORDER_OPENED-comple...","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...
826,dhl,dhl,seq_combined_data,"['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...","['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...",1,"mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...
827,dhl,dhl,seq_combined_data,"['mr[RELEASED_FOR_PICKING-complete, PICKED-com...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",1,"mr[RELEASED_FOR_PICKING-complete, PICKED-compl...","length_binned_(11.0, 2197.0]","mr[PACKED-complete, PALLET_CREATED-complete]_0.0"


In [24]:
# --- Subsumption-based redundancy (structural overlap) ---

# CONFIG: relative tolerance (e.g., 0.05 => ±5%)
SUBSUMPTION_TOL = 0.05

# 0) Ensure we have Odds ratio in crm_rules_all_expanded
rules_src = crm_rules_all_expanded.copy()
#rules_src = all_rules_crm.copy()

if 'LB odds ratio' not in rules_src.columns:
    # Merge it from the original all_rules_crm by (Dataset, Labeling, Encoding, Rule)
    # Assumes `all_rules_crm` is already loaded in memory
    or_src = all_rules_crm.copy()
    if 'Encoding' not in or_src.columns and 'Feature Encoding' in or_src.columns:
        or_src = or_src.rename(columns={'Feature Encoding': 'Encoding'})
    or_src['LB odds ratio'] = pd.to_numeric(or_src['LB odds ratio'], errors='coerce')
    rules_src = rules_src.merge(
        or_src[['Dataset','Labeling','Encoding','Rule','LB odds ratio']],
        on=['Dataset','Labeling','Encoding','Rule'],
        how='left'
    )

# 1) Build antecedent lists/sets and sizes
def _collect_feats(row):
    feats = []
    for c in ('feature_1_lhs','feature_2_lhs','feature_3_lhs'):
        v = row.get(c, "")
        if isinstance(v, str) and v.strip():
            feats.append(v.strip())
    return tuple(feats)  # order as given in rule (we compare as sets for subset check)

rules_src['antecedent_items'] = rules_src.apply(_collect_feats, axis=1)
rules_src['antecedent_size']  = rules_src['antecedent_items'].apply(lambda t: len([x for x in t if x]))

# Safety: numeric OR only
rules_src['LB odds ratio'] = pd.to_numeric(rules_src['LB odds ratio'], errors='coerce')

# 2) For each group, check subsumption
results = []

group_cols = ['Dataset','Labeling','Encoding','RHS_label']
for gkey, g in rules_src.groupby(group_cols, dropna=False):
    g = g.reset_index(drop=True).copy()

    # Index conveniences
    sizes = g['antecedent_size'].values
    ors   = g['LB odds ratio'].values
    items = g['antecedent_items'].values
    rules = g['Rule'].values

    # Pre-bucket indices by antecedent size for quick lookup
    from collections import defaultdict
    by_size = defaultdict(list)
    for idx, s in enumerate(sizes):
        by_size[int(s)].append(idx)

    n = len(g)
    n_subsumers = np.zeros(n, dtype=int)
    is_subsumed = np.zeros(n, dtype=bool)
    subsumed_by = [[] for _ in range(n)]

    for i in range(n):
        Xi = set(items[i])
        si = sizes[i]
        oi = ors[i]

        if si <= 0 or np.isnan(oi):
            continue  # nothing to compare or missing OR

        # Candidates: strictly smaller antecedents
        candidates = []
        for s in range(1, int(si)):  # only sizes 1..(si-1)
            candidates.extend(by_size.get(s, []))

        for j in candidates:
            Xj = set(items[j])
            oj = ors[j]

            if np.isnan(oj):
                continue

            # same consequent is guaranteed by grouping on RHS_label
            # subset check: Xj ⊆ Xi
            if not Xj.issubset(Xi):
                continue

            # relative effect size difference within tolerance:
            # |oi - oj| / max(oj, tiny) <= SUBSUMPTION_TOL
            denom = max(abs(oj), 1e-12)
            rel_diff = abs(oi - oj) / denom
            if rel_diff <= SUBSUMPTION_TOL:
                n_subsumers[i] += 1
                is_subsumed[i] = True
                subsumed_by[i].append(rules[j])

    g['n_subsumers']      = n_subsumers
    g['is_subsumed']      = is_subsumed
    g['subsumed_by_rules']= subsumed_by
    results.append(g)

crm_subsumption = pd.concat(results, ignore_index=True)

# 3) Tidy output columns (you can adjust ordering as you like)
crm_subsumption = crm_subsumption[[
    'Dataset','Labeling','Encoding','Rule','RHS_label',
    'LB odds ratio','antecedent_size','antecedent_items',
    'feature_1_lhs','feature_2_lhs','feature_3_lhs',
    'n_subsumers','is_subsumed','subsumed_by_rules'
]].sort_values(['Dataset','Labeling','Encoding','RHS_label','antecedent_size'])

# Optional display tweaks
crm_subsumption.reset_index(drop=True, inplace=True)

# (Optional) Quick summary per experiment:
subsumed_summary = (
    crm_subsumption
    .groupby(['Dataset','Labeling','Encoding','RHS_label'], as_index=False)
    .agg(
        n_rules=('Rule','count'),
        n_subsumed=('is_subsumed','sum'),
        pct_subsumed=('is_subsumed', lambda x: round(100.0 * x.mean(), 2))
    )
).sort_values(['Dataset','Labeling','Encoding','RHS_label'])

# Show heads so you can inspect
display(crm_subsumption)
display(subsumed_summary)


Unnamed: 0,Dataset,Labeling,Encoding,Rule,RHS_label,LB odds ratio,antecedent_size,antecedent_items,feature_1_lhs,feature_2_lhs,feature_3_lhs,n_subsumers,is_subsumed,subsumed_by_rules
0,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(-0.001, 1.0]'] --...",0,4.975,1,"(ORDER_ACKNOWLEDGED_binned_(-0.001, 1.0],)","ORDER_ACKNOWLEDGED_binned_(-0.001, 1.0]",,,0,False,[]
1,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...",0,7.329,2,"(ORDER_CLOSED_binned_(1.0, 200.0], PALLET_CREA...","ORDER_CLOSED_binned_(1.0, 200.0]","PALLET_CREATED_binned_(-0.001, 1.0]",,0,False,[]
2,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",0,7.329,2,"(ORDER_CLOSED_binned_(1.0, 200.0], SHIPUNIT_BU...","ORDER_CLOSED_binned_(1.0, 200.0]","SHIPUNIT_BUILT_binned_(-0.001, 1.0]",,0,False,[]
3,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...",0,7.189,2,"(ORDER_CLOSED_binned_(1.0, 200.0], PACKED_binn...","ORDER_CLOSED_binned_(1.0, 200.0]","PACKED_binned_(-0.001, 1.0]",,0,False,[]
4,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PICKED_b...",0,7.189,2,"(ORDER_CLOSED_binned_(1.0, 200.0], PICKED_binn...","ORDER_CLOSED_binned_(1.0, 200.0]","PICKED_binned_(-0.001, 1.0]",,0,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",1,1.001,3,"(mr[ORDER_GATEWAY-complete, ORDER_OPENED-compl...","mr[ORDER_GATEWAY-complete, ORDER_OPENED-comple...","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,0,False,[]
825,dhl,dhl,seq_combined_data,"['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....",1,1.001,3,"(mr[ORDER_CLOSED-complete]_binned_(-0.001, 1.0...","mr[ORDER_CLOSED-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,0,False,[]
826,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",1,1.001,3,"(mr[ORDER_GATEWAY-complete, ORDER_OPENED-compl...","mr[ORDER_GATEWAY-complete, ORDER_OPENED-comple...","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,0,False,[]
827,dhl,dhl,seq_combined_data,"['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...",1,1.001,3,"(mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, 1...","mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,0,False,[]


Unnamed: 0,Dataset,Labeling,Encoding,RHS_label,n_rules,n_subsumed,pct_subsumed
0,dhl,dhl,baseline,0,11,0,0.0
1,dhl,dhl,baseline,1,4,0,0.0
2,dhl,dhl,bs_data,0,27,0,0.0
3,dhl,dhl,bs_data,1,12,0,0.0
4,dhl,dhl,bs_dwd,0,25,0,0.0
5,dhl,dhl,bs_dwd,1,9,0,0.0
6,dhl,dhl,dec_data,0,26,0,0.0
7,dhl,dhl,dec_data,1,27,0,0.0
8,dhl,dhl,dec_dwd,0,6,0,0.0
9,dhl,dhl,dec_dwd,1,9,0,0.0


In [25]:
# --- Export subsumed_summary with cleaned Labeling and renamed columns (CSV + LaTeX) ---
if 'subsumed_summary' not in globals() or not isinstance(subsumed_summary, pd.DataFrame):
    raise ValueError("subsumed_summary is not available as a DataFrame.")

# 1) Collect dataset tokens to strip from Labeling (case/whitespace-insensitive)
def _collect_prefixes(series: pd.Series) -> set:
    s = series.astype(str).str.strip().str.lower().str.replace(r"\s+", "", regex=True)
    prefixes = set(s.unique().tolist())
    # Common aliases you might encounter
    prefixes |= {"sepsis", "traffic", "bpi15a"}
    return {p for p in prefixes if p and p != "nan"}

# 2) Strip dataset prefix from Labeling while preserving the rest of the string's case
def _strip_dataset_prefix(label: str, prefixes: set) -> str:
    s = str(label)
    while True:
        if "_" not in s:
            return s
        head, tail = s.split("_", 1)
        head_norm = head.strip().lower().replace(" ", "")
        if head_norm in prefixes:
            s = tail
        else:
            return s

# 3) Prepare a detokenizer for LaTeX
def _fmt_detok(x):
    return r'\detokenize{' + str(x) + '}'

# Build export DataFrame
prefixes = _collect_prefixes(subsumed_summary["Dataset"]) if "Dataset" in subsumed_summary.columns else set()
subsumed_export = subsumed_summary.copy()

# Clean Labeling: remove dataset prefix
if "Labeling" in subsumed_export.columns:
    subsumed_export["Labeling"] = subsumed_export["Labeling"].map(lambda v: _strip_dataset_prefix(v, prefixes))

# Rename requested columns
rename_map = {
    "RHS_label": "RHS Label",
    "n_rules": "n rules",
    "n_subsumed": "n sumbsumed",   # (as requested)
    "pct_subsumed": "pct subsumed"
}
subsumed_export = subsumed_export.rename(columns=rename_map)

# Paths
out_dir = os.path.join("5_analysis")
os.makedirs(out_dir, exist_ok=True)
csv_path = os.path.join(out_dir, "redundancy_subsumed.csv")
tex_path = os.path.join(out_dir, "redundancy_subsumed.tex")

# CSV
subsumed_export.to_csv(csv_path, index=False)
print(f"✅ Saved redundancy_subsumed.csv → {csv_path}")

# LaTeX: make column headers LaTeX-safe and detokenize 'Labeling' values
latex_safe_map = {c: str(c).replace("#", r"\#").replace("%", r"\%").replace("_", r"\_")
                  for c in subsumed_export.columns}
subsumed_export_tex = subsumed_export.rename(columns=latex_safe_map)

formatters = {"Labeling": _fmt_detok} if "Labeling" in subsumed_export_tex.columns else None

subsumed_export_tex.to_latex(
    tex_path,
    index=False,
    escape=False,         # we handle escaping and use \detokenize for values
    longtable=True,
    float_format="%.2f",
    formatters=formatters
)
print(f"✅ Saved redundancy_subsumed.tex  → {tex_path}")

✅ Saved redundancy_subsumed.csv → 5_analysis/redundancy_subsumed.csv
✅ Saved redundancy_subsumed.tex  → 5_analysis/redundancy_subsumed.tex


### 6.2. Coverage-Based Redundancy

In [44]:
# ---------- CRM rules → expand LHS features and compute per-rule coverage (excluding MR/MRA/TR/TRA) ----------

# Fallback if not defined earlier
try:
    exclude_encodings
except NameError:
    exclude_encodings = ["mr", "mra", "tr", "tra"]

# ---------- 0) Start from all_rules_crm ----------
if 'all_rules_crm' not in globals() or not isinstance(all_rules_crm, pd.DataFrame):
    raise ValueError("all_rules_crm is not available as a DataFrame.")
crm_df = all_rules_crm.copy()

# Harmonize encoding column name
crm_df = crm_df.rename(columns={
    'Feature Encoding': 'Encoding',
    'Feature encoding': 'Encoding'
})

# Exclude encodings (case/whitespace-insensitive)
if 'Encoding' in crm_df.columns:
    _excl = {e.strip().lower() for e in exclude_encodings}
    enc_norm = crm_df['Encoding'].astype(str).str.strip().str.lower()
    crm_df = crm_df[~enc_norm.isin(_excl)].copy()

# Filter on LB odds ratio > 1
crm_df['LB odds ratio'] = pd.to_numeric(crm_df['LB odds ratio'], errors='coerce')
crm_df = crm_df[crm_df['LB odds ratio'] > 1].copy()

# ---------- NEW: normalize Labeling exactly like earlier ----------
def _build_known_prefixes(df_list):
    prefixes = set()
    for df in df_list:
        if isinstance(df, pd.DataFrame) and 'Dataset' in df.columns:
            s = pd.Series(df['Dataset']).astype(str).str.strip().str.lower()
            prefixes.update(s.str.replace(r'\s+', '', regex=True).unique().tolist())
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != 'nan'}

try:
    KNOWN_PREFIXES
except NameError:
    KNOWN_PREFIXES = _build_known_prefixes([crm_df])

def _canon_label(lbl: str) -> str:
    s = str(lbl).strip().lower()
    s = re.sub(r'(_features?)$', '', s)   # drop suffix
    s = s.replace(' ', '_')
    # strip dataset prefixes repeatedly
    changed = True
    while changed:
        changed = False
        for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
            if s.startswith(p + '_'):
                s = s[len(p) + 1:]
                changed = True
    if re.search(r'\bpayload(\d+)?\b', s):
        return 'payload'
    if re.fullmatch(r'(decl(are)?\d*)', s):
        return 'declare'
    if re.search(r'(^|[^a-z])mr([^a-z]|$)', s):
        return 'sequential'
    return s

if 'Labeling' in crm_df.columns:
    crm_df['Labeling'] = crm_df['Labeling'].apply(_canon_label)

# ---------- 1) Extract exact LHS and RHS ----------
def extract_lhs_exact(rule_str: str) -> str:
    m = re.search(r"^(.*?)(?=\s*-->)", str(rule_str))
    return m.group(1) if m else str(rule_str)

def parse_rhs_label(rule_str: str):
    m = re.search(r"-->\s*(Label|!Label)", str(rule_str))
    if not m:
        return None
    return 1 if m.group(1) == "Label" else 0

crm_df['LHS_features'] = crm_df['Rule'].apply(extract_lhs_exact)
crm_df['RHS_label']    = crm_df['Rule'].apply(parse_rhs_label)

# ---------- 2) Robustly split LHS into up to 3 features (paren-aware) ----------
def _find_outer_brackets_span(text: str):
    s = str(text)
    start = s.find('[')
    if start < 0:
        return None, None
    depth = 0; in_s = in_d = esc = False; end = None
    for i, ch in enumerate(s[start:], start):
        if esc: esc = False; continue
        if ch == '\\': esc = True; continue
        if in_s:
            if ch == "'": in_s = False
            continue
        if in_d:
            if ch == '"': in_d = False
            continue
        if ch == "'": in_s = True; continue
        if ch == '"': in_d = True; continue
        if ch == '[': depth += 1; continue
        if ch == ']':
            depth -= 1
            if depth == 0: end = i; break
    return (start, end)

def _split_commas_outside_quotes_and_parens(content: str):
    """Split on commas that are outside quotes AND outside parentheses."""
    parts, curr = [], ""
    in_s = in_d = esc = False
    paren_depth = 0
    for ch in content:
        if esc: curr += ch; esc = False; continue
        if ch == '\\': curr += ch; esc = True; continue

        if in_s:
            curr += ch
            if ch == "'": in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"': in_d = False
            continue

        if ch == "'": curr += ch; in_s = True; continue
        if ch == '"': curr += ch; in_d = True; continue

        if ch == '(':
            paren_depth += 1; curr += ch; continue
        if ch == ')':
            paren_depth = max(0, paren_depth - 1); curr += ch; continue

        if ch == ',' and paren_depth == 0:
            parts.append(curr.strip()); curr = ""
        else:
            curr += ch
    parts.append(curr.strip())
    return parts

def _strip_one_layer_quotes(s: str):
    s = s.strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

def split_lhs_items(lhs_text: str):
    s = str(lhs_text)
    start, end = _find_outer_brackets_span(s)
    if start is None or end is None:
        return []
    inner = s[start+1:end]          # inside [...]
    raw_items = _split_commas_outside_quotes_and_parens(inner)
    return [_strip_one_layer_quotes(x).strip() for x in raw_items if x != ""]

def _pad3(items):
    items = items[:3]
    return items + [""] * (3 - len(items))

lhs_split = crm_df['LHS_features'].apply(split_lhs_items).apply(_pad3)
lhs_df = pd.DataFrame(lhs_split.tolist(), columns=['feature_1_lhs','feature_2_lhs','feature_3_lhs'])

# ---------- 3) Final rules table (expanded) ----------
crm_rules_all_expanded = pd.concat(
    [crm_df[['Dataset','Labeling','Encoding','Rule','LHS_features','RHS_label']].reset_index(drop=True),
     lhs_df.reset_index(drop=True)],
    axis=1
).reset_index(drop=True)

# ---------- 4) Per-rule coverage over 3.2_binned_logs ----------
base_dir = "3.2_binned_features"

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _norm_numeric(col: pd.Series) -> pd.Series:
    if col.dtype == bool:
        return col.astype(int)
    out = pd.to_numeric(col, errors='coerce')
    if out.isna().all() and col.dtype == object:
        return col
    return out

NUM_SUFFIX_RE = re.compile(r"_(\-?\d+(?:\.\d+)?)$")  # _1, _1.0, _0, _-1, etc.

def _match_single_feature(df: pd.DataFrame, feat: str) -> pd.Series:
    feat = str(feat).strip().strip('"').strip("'")

    # A) exact one-hot column
    if feat in df.columns:
        col = _norm_numeric(df[feat])
        return (col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")

    # B) general numeric suffix at end: base_<num>
    m = NUM_SUFFIX_RE.search(feat)
    if m:
        base_col = feat[:m.start()]
        desired_str = m.group(1)
        desired = float(desired_str)
        if base_col in df.columns:
            col = _norm_numeric(df[base_col])
            if pd.api.types.is_numeric_dtype(col):
                return (col == desired).fillna(False)
            else:
                return (col.astype(str) == desired_str).fillna(False)
        # rare: indicator named with suffix
        if feat in df.columns:
            col = _norm_numeric(df[feat])
            return ((col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")).fillna(False)

    # C) binned: base_(...) or base_[...]
    pos1 = feat.rfind("_(")
    pos2 = feat.rfind("_[")
    split_pos = max(pos1, pos2)
    if split_pos != -1:
        base_col = feat[:split_pos]
        bin_val  = feat[split_pos+1:]  # drop underscore before bracket
        if base_col in df.columns:
            return (df[base_col].astype(str) == bin_val).fillna(False)

    # fallback: no matches
    return pd.Series(False, index=df.index)

def _match_rule(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:
            mask &= _match_single_feature(df, f)
            if not mask.any():
                break
    # enforce RHS label
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

# ---------- NEW: robust path resolution using canonicalized labeling ----------
def _canon_from_folder(name: str) -> str:
    s = str(name).strip().lower()
    s = re.sub(r'(_features?)$', '', s)
    s = s.replace(' ', '_')
    # strip dataset prefixes
    changed = True
    while changed:
        changed = False
        for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
            if s.startswith(p + '_'):
                s = s[len(p) + 1:]
                changed = True
    if re.search(r'\bpayload(\d+)?\b', s):
        return 'payload'
    if re.fullmatch(r'(decl(are)?\d*)', s):
        return 'declare'
    if re.search(r'(^|[^a-z])mr([^a-z]|$)', s):
        return 'sequential'
    return s

def _find_ci_subdir(parent: str, target: str) -> str | None:
    """Case-insensitive lookup of subdir 'target' inside 'parent'."""
    t = target.lower()
    try:
        for d in os.listdir(parent):
            full = os.path.join(parent, d)
            if os.path.isdir(full) and d.lower() == t:
                return full
    except FileNotFoundError:
        return None
    return None

def _resolve_enc_path(ds: str, lab_canon: str, enc: str, base_dir: str) -> str | None:
    ds_dir = os.path.join(base_dir, ds)
    if not os.path.isdir(ds_dir):
        return None
    candidates = []
    for d in sorted(os.listdir(ds_dir)):
        full = os.path.join(ds_dir, d)
        if not os.path.isdir(full):
            continue
        if not d.lower().endswith("_features"):
            continue
        if _canon_from_folder(d) == lab_canon:
            # exact encoding dir?
            enc_path = os.path.join(full, enc)
            if os.path.isdir(enc_path):
                return enc_path
            # try case-insensitive encoding match
            ci = _find_ci_subdir(full, enc)
            if ci:
                return ci
            candidates.append(full)
    # If we found labeling folder(s) but no encoding subdir, return None
    return None

crm_rules_all_with_coverage = crm_rules_all_expanded.copy()
crm_rules_all_with_coverage["covered_case_ids"] = [[] for _ in range(len(crm_rules_all_with_coverage))]
crm_rules_all_with_coverage["n_covered_cases"] = 0

# Iterate once per experiment; load CSV and evaluate each rule in that group
for (ds, lab, enc), g in crm_rules_all_with_coverage.groupby(["Dataset", "Labeling", "Encoding"]):
    enc_path = _resolve_enc_path(ds, lab, enc, base_dir)
    if enc_path is None:
        continue

    csv_files = [f for f in os.listdir(enc_path) if f.endswith(".csv")]
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])

    df_enc = pd.read_csv(csv_path)
    if "Label" not in df_enc.columns:
        raise KeyError(f"No 'Label' column in: {csv_path}")
    case_col = _infer_case_col(df_enc)

    for idx, row in g.iterrows():
        feats = [row.get("feature_1_lhs",""), row.get("feature_2_lhs",""), row.get("feature_3_lhs","")]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
        rhs   = row["RHS_label"]

        rule_mask = _match_rule(df_enc, feats, rhs)
        case_ids = df_enc.loc[rule_mask, case_col].dropna().astype(str).unique().tolist()

        crm_rules_all_with_coverage.at[idx, "covered_case_ids"] = case_ids
        crm_rules_all_with_coverage.at[idx, "n_covered_cases"] = len(case_ids)

# (optional) sort for readability
crm_rules_all_with_coverage = crm_rules_all_with_coverage.sort_values(
    by=["Dataset","Labeling","Encoding"]
).reset_index(drop=True)

crm_rules_all_with_coverage

Unnamed: 0,Dataset,Labeling,Encoding,Rule,LHS_features,RHS_label,feature_1_lhs,feature_2_lhs,feature_3_lhs,covered_case_ids,n_covered_cases
0,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...","['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]']",1,"ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]",,,"[Bu, D7, Et, Eu, Ev, Ew, Ey, Ez, E0, E5, FE, F...",1023
1,dhl,dhl,baseline,"['PALLET_CREATED_binned_(1.0, 199.0]'] --> Label","['PALLET_CREATED_binned_(1.0, 199.0]']",1,"PALLET_CREATED_binned_(1.0, 199.0]",,,"[Bu, D7, Et, Eu, Ev, Ew, Ey, Ez, E0, E5, FE, F...",841
2,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","PALLET_CREATED_binned_(-0.001, 1.0]",,"[F, H, K, L, N, R, T, b, f, GN, Hw, Lr, Ra, Vl...",306
3,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","SHIPUNIT_BUILT_binned_(-0.001, 1.0]",,"[F, H, K, L, N, R, T, b, f, BZ, Co, Cz, DH, DK...",374
4,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...",0,"ORDER_CLOSED_binned_(1.0, 200.0]","PACKED_binned_(-0.001, 1.0]",,"[F, H, K, L, N, R, T, f, GN, Hw, Ra, Vl, XQ, h...",205
...,...,...,...,...,...,...,...,...,...,...,...
824,dhl,dhl,seq_combined_data,"['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....","['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....",1,"mr[ORDER_CLOSED-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,"[B1P, B1T, B1U, B2c, B2e, B33, B38, B39, B3C, ...",181
825,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",1,"mr[ORDER_GATEWAY-complete, ORDER_OPENED-comple...","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,"[B1P, B1T, B1U, B2c, B2e, B33, B38, B39, B3C, ...",181
826,dhl,dhl,seq_combined_data,"['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...","['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...",1,"mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, 1.0]","mr[PALLET_CREATED-complete, SHIPUNIT_BUILT-com...",DEPARTURE_NO|first|continuous_binned_(535102.0...,"[B1P, B1T, B1U, B2c, B2e, B33, B38, B39, B3C, ...",181
827,dhl,dhl,seq_combined_data,"['mr[RELEASED_FOR_PICKING-complete, PICKED-com...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",1,"mr[RELEASED_FOR_PICKING-complete, PICKED-compl...","length_binned_(11.0, 2197.0]","mr[PACKED-complete, PALLET_CREATED-complete]_0.0","[0m, 0s, 11, 1H, 1s, 27, 34, 3B, 3C, 3m, 3x, 4...",334


In [45]:
# ---- Config ----
JACCARD_THR = 0.95  # e.g., 0.8 as suggested

def _to_case_set(x):
    if isinstance(x, list):
        try:
            return set(map(str, x))
        except Exception:
            return set()
    return set()

def _jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 0.0
    inter = len(a & b)
    union = len(a | b)
    return inter / union if union > 0 else 0.0

pairs_rows = []

# Pairwise Jaccard *within* each experiment & RHS_label (only compare rules for the same consequent)
group_cols = ["Dataset", "Labeling", "Encoding", "RHS_label"]
for gkey, g in crm_rules_all_with_coverage.groupby(group_cols, dropna=False):
    g = g.reset_index(drop=True)
    # Precompute sets
    sets = [ _to_case_set(x) for x in g["covered_case_ids"].tolist() ]
    rules = g["Rule"].tolist()

    for (i, j) in itertools.combinations(range(len(g)), 2):
        A, B = sets[i], sets[j]
        jac = _jaccard(A, B)
        pairs_rows.append({
            "Dataset":   gkey[0],
            "Labeling":  gkey[1],
            "Encoding":  gkey[2],
            "RHS_label": gkey[3],
            "Rule_i":    rules[i],
            "Rule_j":    rules[j],
            "n_i":       len(A),
            "n_j":       len(B),
            "n_inter":   len(A & B),
            "n_union":   len(A | B),
            "jaccard":   round(jac, 4),
            "redundant_pair": jac >= JACCARD_THR
        })

crm_cov_jaccard_pairs = pd.DataFrame(pairs_rows).sort_values(
    ["Dataset","Labeling","Encoding","RHS_label","jaccard"], ascending=[True,True,True,True,False]
).reset_index(drop=True)

# Pairs flagged as redundant (Jaccard >= threshold)
crm_cov_redundant_pairs = crm_cov_jaccard_pairs[crm_cov_jaccard_pairs["redundant_pair"]].reset_index(drop=True)

# ---- Per-rule summary: max overlap & count of redundant partners ----
def _summarize_for_rules(df_pairs: pd.DataFrame) -> pd.DataFrame:
    # Construct a long table of (rule, partner, jaccard)
    long_i = df_pairs.rename(columns={"Rule_i":"Rule", "Rule_j":"partner", "n_i":"n_rule", "n_j":"n_partner"})[
        ["Dataset","Labeling","Encoding","RHS_label","Rule","partner","jaccard","redundant_pair","n_rule","n_partner"]
    ]
    long_j = df_pairs.rename(columns={"Rule_j":"Rule", "Rule_i":"partner", "n_j":"n_rule", "n_i":"n_partner"})[
        ["Dataset","Labeling","Encoding","RHS_label","Rule","partner","jaccard","redundant_pair","n_rule","n_partner"]
    ]
    long_all = pd.concat([long_i, long_j], ignore_index=True)

    # Aggregate per rule
    summary = (
        long_all
        .groupby(["Dataset","Labeling","Encoding","RHS_label","Rule"], as_index=False)
        .agg(
            n_partners=("partner","nunique"),
            max_jaccard=("jaccard","max"),
            n_redundant_partners=("redundant_pair","sum")
        )
    )
    summary["is_redundant"] = summary["n_redundant_partners"] > 0
    return summary

crm_cov_redundancy_summary = _summarize_for_rules(crm_cov_jaccard_pairs).sort_values(
    ["Dataset","Labeling","Encoding","RHS_label","max_jaccard"], ascending=[True,True,True,True,False]
).reset_index(drop=True)

crm_cov_redundancy_summary = crm_cov_redundancy_summary.drop(columns=["RHS_label", "n_partners", "is_redundant"])
crm_cov_redundancy_summary = crm_cov_redundancy_summary[crm_cov_redundancy_summary["n_redundant_partners"] > 0]

In [46]:
display(crm_cov_jaccard_pairs)
display(crm_cov_redundant_pairs)
display(crm_cov_redundancy_summary)

Unnamed: 0,Dataset,Labeling,Encoding,RHS_label,Rule_i,Rule_j,n_i,n_j,n_inter,n_union,jaccard,redundant_pair
0,dhl,dhl,baseline,0,"['SHIPUNIT_BUILT_binned_(-0.001, 1.0]', 'PALLE...","['ORDER_ACKNOWLEDGED_binned_(-0.001, 1.0]'] --...",6176,6613,6081,6708,0.9065,False
1,dhl,dhl,baseline,0,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",306,374,306,374,0.8182,False
2,dhl,dhl,baseline,0,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PICKED_b...",205,162,162,205,0.7902,False
3,dhl,dhl,baseline,0,"['RELEASED_FOR_PICKING_binned_(1.0, 200.0]', '...","['PICKED_binned_(1.0, 200.0]', 'SHIPUNIT_BUILT...",278,212,212,278,0.7626,False
4,dhl,dhl,baseline,0,"['RELEASED_FOR_PICKING_binned_(1.0, 200.0]', '...","['RELEASED_FOR_PICKING_binned_(1.0, 200.0]', '...",278,210,210,278,0.7554,False
...,...,...,...,...,...,...,...,...,...,...,...,...
24599,dhl,dhl,seq_combined_data,1,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",181,334,0,515,0.0000,False
24600,dhl,dhl,seq_combined_data,1,"['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",181,334,0,515,0.0000,False
24601,dhl,dhl,seq_combined_data,1,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",181,334,0,515,0.0000,False
24602,dhl,dhl,seq_combined_data,1,"['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",181,334,0,515,0.0000,False


Unnamed: 0,Dataset,Labeling,Encoding,RHS_label,Rule_i,Rule_j,n_i,n_j,n_inter,n_union,jaccard,redundant_pair
0,dhl,dhl,baseline,1,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...","['PICKED_binned_(1.0, 200.0]'] --> Label",1023,1023,1023,1023,1.0000,True
1,dhl,dhl,baseline,1,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...","['PACKED_binned_(1.0, 200.0]'] --> Label",1023,1023,1023,1023,1.0000,True
2,dhl,dhl,baseline,1,"['PICKED_binned_(1.0, 200.0]'] --> Label","['PACKED_binned_(1.0, 200.0]'] --> Label",1023,1023,1023,1023,1.0000,True
3,dhl,dhl,bs_data,0,"['PICKED_binned_(-0.001, 1.0]', 'Length|first|...","['PACKED_binned_(-0.001, 1.0]', 'Length|first|...",2101,2120,2101,2120,0.9910,True
4,dhl,dhl,bs_data,0,"['PICKED_binned_(-0.001, 1.0]', 'Length|first|...","['RELEASED_FOR_PICKING_binned_(-0.001, 1.0]', ...",2101,2079,2079,2101,0.9895,True
...,...,...,...,...,...,...,...,...,...,...,...,...
7234,dhl,dhl,seq_combined_data,1,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...",186,181,181,186,0.9731,True
7235,dhl,dhl,seq_combined_data,1,"['mr[PACKED-complete, PALLET_CREATED-complete]...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",186,181,181,186,0.9731,True
7236,dhl,dhl,seq_combined_data,1,"['mr[PACKED-complete, PALLET_CREATED-complete]...","['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....",186,181,181,186,0.9731,True
7237,dhl,dhl,seq_combined_data,1,"['mr[PACKED-complete, PALLET_CREATED-complete]...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",186,181,181,186,0.9731,True


Unnamed: 0,Dataset,Labeling,Encoding,Rule,max_jaccard,n_redundant_partners
11,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...",1.0000,2
12,dhl,dhl,baseline,"['PACKED_binned_(1.0, 200.0]'] --> Label",1.0000,2
13,dhl,dhl,baseline,"['PICKED_binned_(1.0, 200.0]'] --> Label",1.0000,2
15,dhl,dhl,bs_data,"['PACKED_binned_(-0.001, 1.0]', 'Length|first|...",0.9910,2
16,dhl,dhl,bs_data,"['PICKED_binned_(-0.001, 1.0]', 'Length|first|...",0.9910,2
...,...,...,...,...,...,...
791,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",0.9960,2
792,dhl,dhl,seq_combined_data,"['mra[ORDER_OPENED-complete, ORDER_GATEWAY-com...",0.9960,2
793,dhl,dhl,seq_combined_data,"['Length|first|continuous_binned_(39.5, 258.0]...",0.9939,1
794,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",0.9939,1


In [47]:
# ---------- Coverage-based redundancy: per-experiment summary (no dropping) ----------

# Safety checks
if 'crm_rules_all_with_coverage' not in globals():
    raise RuntimeError("crm_rules_all_with_coverage not found. Run the previous cell first.")
if 'crm_cov_jaccard_pairs' not in globals():
    raise RuntimeError("crm_cov_jaccard_pairs not found. Run the Jaccard computation cell first.")
if 'crm_cov_redundant_pairs' not in globals():
    raise RuntimeError("crm_cov_redundant_pairs not found. Run the Jaccard computation cell first.")

# --- Base experiments and rule counts ---
exp_rules = (
    crm_rules_all_with_coverage
    .groupby(['Dataset','Labeling','Encoding'], as_index=False)
    .agg(n_rules_total=('Rule','nunique'),
         n_rules_z=('RHS_label', lambda s: (s==1).sum()),
         n_rules_notz=('RHS_label', lambda s: (s==0).sum()))
)

# --- Pairwise stats (all pairs) ---
if not crm_cov_jaccard_pairs.empty:
    exp_pairs_all = (
        crm_cov_jaccard_pairs
        .groupby(['Dataset','Labeling','Encoding'], as_index=False)
        .agg(n_pairs_total=('jaccard','size'),
             mean_jaccard_all=('jaccard','mean'),
             max_jaccard_all=('jaccard','max'))
    )
else:
    exp_pairs_all = pd.DataFrame(columns=['Dataset','Labeling','Encoding','n_pairs_total','mean_jaccard_all','max_jaccard_all'])

# --- Redundant pairs only ---
if not crm_cov_redundant_pairs.empty:
    exp_pairs_redundant = (
        crm_cov_redundant_pairs
        .groupby(['Dataset','Labeling','Encoding'], as_index=False)
        .agg(n_pairs_redundant=('jaccard','size'),
             mean_jaccard_redundant=('jaccard','mean'))
    )
else:
    exp_pairs_redundant = pd.DataFrame(columns=['Dataset','Labeling','Encoding','n_pairs_redundant','mean_jaccard_redundant'])

# --- Redundant rules (unique nodes that appear in any redundant pair) ---
if not crm_cov_redundant_pairs.empty:
    nodes_i = crm_cov_redundant_pairs[['Dataset','Labeling','Encoding','RHS_label','Rule_i']].rename(columns={'Rule_i':'Rule'})
    nodes_j = crm_cov_redundant_pairs[['Dataset','Labeling','Encoding','RHS_label','Rule_j']].rename(columns={'Rule_j':'Rule'})
    red_nodes = pd.concat([nodes_i, nodes_j], ignore_index=True).drop_duplicates()
    exp_rules_redundant = (
        red_nodes
        .groupby(['Dataset','Labeling','Encoding'], as_index=False)
        .agg(n_rules_redundant=('Rule','nunique'))
    )
else:
    exp_rules_redundant = pd.DataFrame(columns=['Dataset','Labeling','Encoding','n_rules_redundant'])

# --- Build redundancy clusters (connected components) within each (exp, RHS_label) ---
class DSU:
    def __init__(self): self.p={}; self.r={}
    def find(self,x):
        if self.p.get(x,x)!=x: self.p[x]=self.find(self.p[x])
        return self.p.get(x,x)
    def union(self,a,b):
        ra,rb=self.find(a),self.find(b)
        if ra==rb: return
        self.p.setdefault(ra,ra); self.p.setdefault(rb,rb)
        self.r.setdefault(ra,0);  self.r.setdefault(rb,0)
        if self.r[ra]<self.r[rb]: self.p[ra]=rb
        elif self.r[ra]>self.r[rb]: self.p[rb]=ra
        else: self.p[rb]=ra; self.r[ra]+=1

def cluster_counts_per_exp(pairs_df: pd.DataFrame) -> pd.DataFrame:
    rows=[]
    if pairs_df.empty:
        return pd.DataFrame(columns=['Dataset','Labeling','Encoding','n_clusters','avg_cluster_size','max_cluster_size'])
    for (ds,lab,enc,rhs), g in pairs_df.groupby(['Dataset','Labeling','Encoding','RHS_label'], dropna=False):
        nodes = set(g['Rule_i']).union(set(g['Rule_j']))
        if not nodes:
            continue
        dsu=DSU()
        for _,r in g.iterrows():
            dsu.union(('R',r['Rule_i']), ('R',r['Rule_j']))
        from collections import defaultdict as _dd
        parent_sizes=_dd(int)
        for ru in nodes:
            parent_sizes[dsu.find(('R',ru))]+=1
        rows.append({
            'Dataset': ds, 'Labeling': lab, 'Encoding': enc,
            'RHS_label': rhs,
            'n_clusters_rhs': len(parent_sizes),
            'avg_cluster_size_rhs': float(np.mean(list(parent_sizes.values()))) if parent_sizes else 0.0,
            'max_cluster_size_rhs': max(parent_sizes.values()) if parent_sizes else 0
        })
    if not rows:
        return pd.DataFrame(columns=['Dataset','Labeling','Encoding','n_clusters','avg_cluster_size','max_cluster_size'])
    df = pd.DataFrame(rows)
    out = (
        df.groupby(['Dataset','Labeling','Encoding'], as_index=False)
          .agg(n_clusters=('n_clusters_rhs','sum'),
               avg_cluster_size=('avg_cluster_size_rhs','mean'),
               max_cluster_size=('max_cluster_size_rhs','max'))
    )
    return out

exp_clusters = cluster_counts_per_exp(crm_cov_redundant_pairs)

# --- Assemble summary ---
summary = (
    exp_rules
    .merge(exp_pairs_all, on=['Dataset','Labeling','Encoding'], how='left')
    .merge(exp_pairs_redundant, on=['Dataset','Labeling','Encoding'], how='left')
    .merge(exp_rules_redundant, on=['Dataset','Labeling','Encoding'], how='left')
    .merge(exp_clusters, on=['Dataset','Labeling','Encoding'], how='left')
)

# Fill NaNs
for c in ['n_pairs_total','n_pairs_redundant','n_rules_redundant','n_clusters',
          'max_cluster_size']:
    if c in summary.columns:
        summary[c] = summary[c].fillna(0).astype(int)
for c in ['mean_jaccard_all','mean_jaccard_redundant','max_jaccard_all','avg_cluster_size']:
    if c in summary.columns:
        summary[c] = summary[c].fillna(0.0)

# Derived metrics (no keep/drop heuristic)
summary['pct_pairs_redundant']  = np.where(summary['n_pairs_total']>0,
                                           100.0*summary['n_pairs_redundant']/summary['n_pairs_total'], 0.0)
summary['pct_rules_redundant']  = np.where(summary['n_rules_total']>0,
                                           100.0*summary['n_rules_redundant']/summary['n_rules_total'], 0.0)

# Order columns
cols_order = [
    'Dataset','Labeling','Encoding',
    'n_rules_total','n_rules_z','n_rules_notz',
    'n_rules_redundant','pct_rules_redundant',
    'n_clusters','avg_cluster_size','max_cluster_size',
    'n_pairs_total','n_pairs_redundant','pct_pairs_redundant',
    'mean_jaccard_all','max_jaccard_all','mean_jaccard_redundant'
]
crm_cov_redundancy_exp_summary = summary[[c for c in cols_order if c in summary.columns]].sort_values(
    ['Dataset','Labeling','Encoding']
).reset_index(drop=True)

# ---------- Rename and slim the coverage redundancy summary (no "Rules dropped") ----------

if 'crm_cov_redundancy_exp_summary' not in globals():
    raise RuntimeError("crm_cov_redundancy_exp_summary not found. Run the previous summary cell first.")

keep_raw = [
    'Dataset', 'Labeling', 'Encoding',
    'n_rules_total', 'n_rules_redundant', 'n_clusters'
]
keep_present = [c for c in keep_raw if c in crm_cov_redundancy_exp_summary.columns]

crm_cov_redundancy_exp_summary_pretty = crm_cov_redundancy_exp_summary[keep_present].copy()

rename_map = {
    'n_rules_total': 'Total Rules',
    'n_rules_redundant': 'Number of Redundant Rules',
    'n_clusters': 'Number of Clusters'
}
crm_cov_redundancy_exp_summary_pretty = crm_cov_redundancy_exp_summary_pretty.rename(columns=rename_map)

final_order = [c for c in [
    'Dataset', 'Labeling', 'Encoding',
    'Total Rules', 'Number of Redundant Rules', 'Number of Clusters'
] if c in crm_cov_redundancy_exp_summary_pretty.columns]

crm_cov_redundancy_exp_summary = (
    crm_cov_redundancy_exp_summary_pretty[final_order]
    .sort_values(['Dataset', 'Labeling', 'Encoding'])
    .reset_index(drop=True)
)

crm_cov_redundancy_exp_summary


Unnamed: 0,Dataset,Labeling,Encoding,Total Rules,Number of Redundant Rules,Number of Clusters
0,dhl,dhl,baseline,15,3,1
1,dhl,dhl,bs_data,39,7,3
2,dhl,dhl,bs_dwd,34,32,3
3,dhl,dhl,dec_data,53,41,7
4,dhl,dhl,dec_dwd,15,9,3
5,dhl,dhl,dec_dwd_data,28,12,4
6,dhl,dhl,declare,13,7,2
7,dhl,dhl,dwd,9,6,1
8,dhl,dhl,hybrid,149,135,13
9,dhl,dhl,hybrid_data,88,51,12


In [30]:
#average rules number of redundant rules per dataset
# Calculate average redundant rules per dataset
# Sum rules redundant per dataset
sum_rules = (
    crm_cov_redundancy_exp_summary.groupby("Dataset")["Total Rules"]
      .sum()
      .reset_index(name="Total Rules")
)
print(sum_rules)

avg_redundant_rules = (
    crm_cov_redundancy_exp_summary.groupby("Dataset")["Number of Redundant Rules"]
      .mean()
      .reset_index(name="Avg Redundant Rules")
)
print(avg_redundant_rules)

# # Sum rules dropped per dataset
# sum_rules_dropped = (
#     crm_cov_redundancy_exp_summary.groupby("Dataset")["Rules dropped"]
#       .sum()
#       .reset_index(name="Total Rules Dropped")
# )
# print(sum_rules_dropped)

# Sum rules redundant per dataset
sum_rules_redundant = (
    crm_cov_redundancy_exp_summary.groupby("Dataset")["Number of Redundant Rules"]
      .sum()
      .reset_index(name="Total Redundant Rules")
)
print(sum_rules_redundant)

# rules_after = (
#     sum_rules
#     .merge(sum_rules_dropped, on="Dataset", how="outer")
#     .fillna(0)
# )
# rules_after["Rules After Dropping"] = (
#     rules_after["Total Rules"] - rules_after["Total Rules Dropped"]
# )

# # (optional) cast to int if these are counts
# cols = ["Total Rules", "Total Rules Dropped", "Rules After Dropping"]
# rules_after[cols] = rules_after[cols].astype(int)

# print(rules_after[["Dataset"] + cols])
# Total number of clusters per dataset
total_clusters = (
    crm_cov_redundancy_exp_summary.groupby("Dataset")["Number of Clusters"]
      .sum()
      .reset_index(name="Total Clusters")
)
print(total_clusters)

# (Optional) overall total across all datasets
overall_total_clusters = int(crm_cov_redundancy_exp_summary["Number of Clusters"].sum())
print(f"Overall total clusters across datasets: {overall_total_clusters}")


  Dataset  Total Rules
0     dhl          829
  Dataset  Avg Redundant Rules
0     dhl                  0.0
  Dataset  Total Redundant Rules
0     dhl                      0
  Dataset  Total Clusters
0     dhl               0
Overall total clusters across datasets: 0


In [48]:
out_dir = os.path.join('5_analysis')
os.makedirs(out_dir, exist_ok=True)

csv_path_exp_summary = os.path.join(out_dir, 'redundancy_coverage_experiment_summary.csv')
tex_path_exp_summary = os.path.join(out_dir, 'redundancy_coverage_experiment_summary.tex')
crm_cov_redundancy_exp_summary.to_csv(csv_path_exp_summary, index=False)

crm_cov_redundancy_exp_summary.to_latex(
    tex_path_exp_summary,
    index=False,
    escape=False,
    longtable=True,
    formatters={
        "Encoding": fmt_detok,
        "Labeling": fmt_detok
    }
)

csv_path_summary = os.path.join(out_dir, 'redundancy_coverage_rule_summary.csv')
tex_path_summary = os.path.join(out_dir, 'redundancy_coverage_rule_summary.tex')
crm_cov_redundancy_summary.to_csv(csv_path_summary, index=False)

crm_cov_redundancy_summary.to_latex(
    tex_path_summary,
    index=False,
    escape=False,
    longtable=True,
    formatters={
        "Encoding": fmt_detok,
        "Labeling": fmt_detok,
        "Rule": fmt_detok,
        "max_jaccard": lambda x: f"{x:.3f}"
    }
)


print(f"✅ Saved redundancy_coverage_redundant_pairs.csv → {csv_path_exp_summary}")
print(f"✅ Saved redundancy_coverage_rule_summary.csv → {csv_path_summary}")

✅ Saved redundancy_coverage_redundant_pairs.csv → 5_analysis/redundancy_coverage_experiment_summary.csv
✅ Saved redundancy_coverage_rule_summary.csv → 5_analysis/redundancy_coverage_rule_summary.csv


### Selecting rules to drop

In [32]:
# # === Pick a representative rule per redundant-coverage cluster and list rules to drop ===
# import pandas as pd
# import numpy as np
# import os
# from collections import defaultdict

# # Safety: keep only pairs explicitly flagged as redundant, if the CSV contains a column for that
# if 'redundant_pair' in crm_cov_redundant_pairs.columns:
#     crm_cov_redundant_pairs = crm_cov_redundant_pairs[crm_cov_redundant_pairs['redundant_pair'] == True].copy()

# # ---- 1) Collect all rules that appear in any redundant pair (nodes of the graph) ----
# key_cols = ['Dataset', 'Labeling', 'Encoding', 'RHS_label', 'Rule']
# nodes_i = crm_cov_redundant_pairs[['Dataset','Labeling','Encoding','RHS_label','Rule_i']].rename(columns={'Rule_i':'Rule'})
# nodes_j = crm_cov_redundant_pairs[['Dataset','Labeling','Encoding','RHS_label','Rule_j']].rename(columns={'Rule_j':'Rule'})
# redundant_nodes = pd.concat([nodes_i, nodes_j], ignore_index=True).drop_duplicates().reset_index(drop=True)

# # ---- 2) Pull metrics from all_rules_crm for ranking (LB OR, Support LHS, Lift) ----
# metrics_src = all_rules_crm.copy()
# # Harmonize encoding column name if needed
# if 'Encoding' not in metrics_src.columns and 'Feature Encoding' in metrics_src.columns:
#     metrics_src = metrics_src.rename(columns={'Feature Encoding':'Encoding'})

# # Helper: find a column by a set of candidate names (case/space/underscore-insensitive)
# def _find_col(df, candidates):
#     canon = { ''.join(c.lower().split()): c for c in candidates }
#     for c in df.columns:
#         k = ''.join(c.lower().split()).replace('_','')
#         if k in canon:
#             return c
#     # second pass allowing looser underscore removal
#     for c in df.columns:
#         k = ''.join(c.lower().replace('_','').split())
#         for cand in candidates:
#             if k == ''.join(cand.lower().replace('_','').split()):
#                 return c
#     return None

# lb_or_col     = _find_col(metrics_src, ['LB odds ratio','LB_OR','lower bound odds ratio'])
# support_lhs_c = _find_col(metrics_src, ['Support LHS','Support_LHS','LHS support'])
# lift_col      = _find_col(metrics_src, ['Lift','Causal lift','Causal Lift'])

# if lb_or_col is None:
#     raise ValueError(f"Could not find LB odds ratio column in all_rules_crm. Columns: {list(metrics_src.columns)}")

# # Keep only the needed columns
# need_cols = ['Dataset','Labeling','Encoding','Rule', lb_or_col]
# if support_lhs_c: need_cols.append(support_lhs_c)
# if lift_col:      need_cols.append(lift_col)
# metrics_keep = metrics_src[need_cols].copy()

# # Coerce to numeric for sorting
# metrics_keep[lb_or_col] = pd.to_numeric(metrics_keep[lb_or_col], errors='coerce')
# if support_lhs_c: metrics_keep[support_lhs_c] = pd.to_numeric(metrics_keep[support_lhs_c], errors='coerce')
# if lift_col:      metrics_keep[lift_col]      = pd.to_numeric(metrics_keep[lift_col], errors='coerce')

# # Merge metrics onto nodes
# nodes_with_metrics = redundant_nodes.merge(
#     metrics_keep,
#     on=['Dataset','Labeling','Encoding','Rule'],
#     how='left'
# )

# # ---- 3) Build redundant clusters (connected components) per (Dataset, Labeling, Encoding, RHS_label) ----
# # Simple Union-Find for components
# class DSU:
#     def __init__(self):
#         self.parent = {}
#         self.rank = {}
#     def find(self, x):
#         if self.parent.get(x, x) != x:
#             self.parent[x] = self.find(self.parent[x])
#         return self.parent.get(x, x)
#     def union(self, a, b):
#         ra, rb = self.find(a), self.find(b)
#         if ra == rb: return
#         self.parent.setdefault(ra, ra)
#         self.parent.setdefault(rb, rb)
#         self.rank.setdefault(ra, 0)
#         self.rank.setdefault(rb, 0)
#         if self.rank[ra] < self.rank[rb]:
#             self.parent[ra] = rb
#         elif self.rank[ra] > self.rank[rb]:
#             self.parent[rb] = ra
#         else:
#             self.parent[rb] = ra
#             self.rank[ra] += 1

# # Add a unique node key for UF (tuple)
# nodes_with_metrics['node_key'] = list(zip(
#     nodes_with_metrics['Dataset'],
#     nodes_with_metrics['Labeling'],
#     nodes_with_metrics['Encoding'],
#     nodes_with_metrics['RHS_label'],
#     nodes_with_metrics['Rule'],
# ))

# # Map for fast lookup
# node_set = set(nodes_with_metrics['node_key'])

# # Group pairs by experiment + RHS and union
# dsu = DSU()
# for (ds, lab, enc, rhs), g in crm_cov_redundant_pairs.groupby(['Dataset','Labeling','Encoding','RHS_label'], dropna=False):
#     for _, r in g.iterrows():
#         nk_i = (ds, lab, enc, rhs, r['Rule_i'])
#         nk_j = (ds, lab, enc, rhs, r['Rule_j'])
#         if nk_i in node_set and nk_j in node_set:
#             dsu.union(nk_i, nk_j)

# # Assign a cluster id to each node (its DSU root)
# nodes_with_metrics['cluster_id'] = nodes_with_metrics['node_key'].apply(dsu.find)

# # ---- 4) Choose representative per cluster using the ranking:
# #      LB OR (desc), then Support LHS (desc), then Lift (desc)
# def _sort_key(row):
#     lb  = row.get(lb_or_col, np.nan)
#     sup = row.get(support_lhs_c, np.nan) if support_lhs_c else np.nan
#     lif = row.get(lift_col, np.nan)      if lift_col      else np.nan
#     # Use very small numbers for NaNs so they sort at the end
#     lb  = -np.inf if pd.isna(lb)  else lb
#     sup = -np.inf if pd.isna(sup) else sup
#     lif = -np.inf if pd.isna(lif) else lif
#     return (lb, sup, lif)

# keepers = []
# drops = []

# for (ds, lab, enc, rhs), g in nodes_with_metrics.groupby(['Dataset','Labeling','Encoding','RHS_label'], dropna=False):
#     for cid, gg in g.groupby('cluster_id', dropna=False):
#         if len(gg) == 1:
#             # single-node cluster: not actually redundant, but appears in a pair? keep it
#             keep_row = gg.iloc[0]
#             keepers.append(keep_row)
#             continue

#         # pick best row by our sort key (descending)
#         # (We sort ascending=False by each key, so we can use argsort on a precomputed tuple)
#         order = sorted(gg.index, key=lambda ix: _sort_key(gg.loc[ix]), reverse=True)
#         keep_ix = order[0]
#         keep_row = gg.loc[keep_ix]
#         keepers.append(keep_row)

#         # Remaining rules in the cluster are drops; add info about the representative
#         rep_info = {
#             'rep_Dataset':   keep_row['Dataset'],
#             'rep_Labeling':  keep_row['Labeling'],
#             'rep_Encoding':  keep_row['Encoding'],
#             'rep_RHS_label': keep_row['RHS_label'],
#             'rep_Rule':      keep_row['Rule'],
#             'rep_'+lb_or_col: keep_row.get(lb_or_col, np.nan),
#         }
#         if support_lhs_c: rep_info['rep_'+support_lhs_c] = keep_row.get(support_lhs_c, np.nan)
#         if lift_col:      rep_info['rep_'+lift_col]      = keep_row.get(lift_col, np.nan)

#         for ix in order[1:]:
#             row = gg.loc[ix].to_dict()
#             row.update(rep_info)
#             drops.append(row)

# # ---- 5) Build final DataFrames ----
# crm_rules_keep = pd.DataFrame(keepers).drop_duplicates(subset=['node_key']).reset_index(drop=True)

# _out_cols = ['Dataset','Labeling','Encoding','RHS_label','Rule', lb_or_col]
# if support_lhs_c: _out_cols.append(support_lhs_c)
# if lift_col:      _out_cols.append(lift_col)
# crm_rules_keep = crm_rules_keep[_out_cols + ['cluster_id']]

# crm_rules_drop = pd.DataFrame(drops)
# if not crm_rules_drop.empty:
#     drop_cols = _out_cols + ['cluster_id', 'rep_Dataset','rep_Labeling','rep_Encoding','rep_RHS_label','rep_Rule',
#                              'rep_'+lb_or_col]
#     if support_lhs_c: drop_cols.append('rep_'+support_lhs_c)
#     if lift_col:      drop_cols.append('rep_'+lift_col)
#     # Ensure columns exist
#     crm_rules_drop = crm_rules_drop.reindex(columns=drop_cols)

# # One-row summary per cluster
# crm_redundant_groups_summary = (
#     crm_rules_keep
#     .groupby(['Dataset','Labeling','Encoding','RHS_label','cluster_id'], as_index=False)
#     .agg(
#         representative_rule = ('Rule','first'),
#         rep_lb_or          = (lb_or_col,'first'),
#         rep_support_lhs    = (support_lhs_c,'first') if support_lhs_c else (lb_or_col,'size'),
#         rep_lift           = (lift_col,'first') if lift_col else (lb_or_col,'size'),
#         cluster_size       = ('Rule','size')
#     )
#     .sort_values(['Dataset','Labeling','Encoding','RHS_label','cluster_size'], ascending=[True,True,True,True,False])
# )

# # Optional: sort outputs for readability
# crm_rules_keep = crm_rules_keep.sort_values(['Dataset','Labeling','Encoding','RHS_label', lb_or_col], ascending=[True,True,True,True,False]).reset_index(drop=True)
# crm_rules_drop = crm_rules_drop.sort_values(['Dataset','Labeling','Encoding','RHS_label', lb_or_col], ascending=[True,True,True,True,False]).reset_index(drop=True)

# # Show quick counts
# print(f"Clusters with redundancy: {crm_redundant_groups_summary.shape[0]}")
# print(f"Rules kept: {crm_rules_keep.shape[0]} | Rules to drop: {crm_rules_drop.shape[0]}")


In [33]:
# crm_rules_drop

In [34]:
# drop_rules_path = os.path.join(out_dir, "redundancy_coverage_dropped.csv")
# keep_rules_path = os.path.join(out_dir, "redundancy_coverage_kept.csv")

# crm_rules_drop.to_csv(drop_rules_path, index=False)
# crm_rules_keep.to_csv(keep_rules_path, index=False)

### fix Labeling of all_rules_crm (EXCLUDED: and keep only rules that made it through redundancy test)

In [35]:
# --- helpers (same logic as in your coverage cell) ---
def _build_known_prefixes_from_fs(base_dir: str = "3.2_binned_features") -> set:
    prefixes = set()
    try:
        for d in os.listdir(base_dir):
            full = os.path.join(base_dir, d)
            if os.path.isdir(full):
                prefixes.add(str(d).strip().lower().replace(" ", ""))
    except FileNotFoundError:
        pass
    for _name in ("dt_comparison", "ripperk_comparison", "all_rules_crm"):
        _df = globals().get(_name)
        if isinstance(_df, pd.DataFrame) and "Dataset" in _df.columns:
            prefixes |= set(
                _df["Dataset"].astype(str).str.strip().str.lower()
                  .str.replace(r"\s+", "", regex=True).unique().tolist()
            )
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != "nan"}

try:
    KNOWN_PREFIXES
except NameError:
    KNOWN_PREFIXES = _build_known_prefixes_from_fs("3.2_binned_features")

def _strip_label_suffix(s: pd.Series) -> pd.Series:
    return s.astype(str).str.replace(r"_features?$", "", regex=True)

# --- helpers (same logic as in your coverage cell) ---
import os, re

def _build_known_prefixes_from_fs(base_dir: str = "3.2_binned_features") -> set:
    prefixes = set()
    try:
        for d in os.listdir(base_dir):
            full = os.path.join(base_dir, d)
            if os.path.isdir(full):
                prefixes.add(str(d).strip().lower().replace(" ", ""))
    except FileNotFoundError:
        pass
    for _name in ("dt_comparison", "ripperk_comparison", "all_rules_crm"):
        _df = globals().get(_name)
        if isinstance(_df, pd.DataFrame) and "Dataset" in _df.columns:
            prefixes |= set(
                _df["Dataset"].astype(str).str.strip().str.lower()
                  .str.replace(r"\s+", "", regex=True).unique().tolist()
            )
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != "nan"}

try:
    KNOWN_PREFIXES
except NameError:
    KNOWN_PREFIXES = _build_known_prefixes_from_fs("3.2_binned_features")

def _strip_label_suffix(s: pd.Series) -> pd.Series:
    return s.astype(str).str.replace(r"_features?$", "", regex=True)

def _canon_label(name: str) -> str:
    # Robust canonicalizer that mirrors your other DF logic (contains-based mapping)
    if pd.isna(name):
        return name
    s = str(name).strip().lower()
    s = re.sub(r'(_features?)$', '', s, flags=re.I)   # drop trailing "_features"
    s = s.replace(' ', '_')

    # strip known dataset prefixes repeatedly
    changed = True
    while changed:
        changed = False
        for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
            if s.startswith(p + "_"):
                s = s[len(p) + 1:]
                changed = True

    # final mapping (contains-based; catches "payload_pay36", "decl3", etc.)
    if "decl" in s:
        return "declare"
    if "payload" in s:
        return "payload"
    if "mr" in s:
        return "sequential"
    return s

# --- normalize Labeling in all_rules_crm ---
if "Labeling" in all_rules_crm.columns:
    all_rules_crm = all_rules_crm.copy()
    all_rules_crm["Labeling"] = all_rules_crm["Labeling"].map(_canon_label)
    # idempotent; safe even if _canon_label already removed the suffix
    all_rules_crm["Labeling"] = _strip_label_suffix(all_rules_crm["Labeling"])
else:
    raise KeyError("all_rules_crm has no 'Labeling' column")

In [36]:
key_cols = ["Dataset", "Labeling", "Encoding", "Rule"]

def _harmonize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={
        "Feature Encoding": "Encoding",
        "Feature encoding": "Encoding",
    }).copy()
    # exact match, but trim stray whitespace to avoid false mismatches
    for c in key_cols:
        if c not in df.columns:
            raise KeyError(f"Missing column '{c}' in dataframe")
        df[c] = df[c].astype(str).str.strip()
    return df

all_rules_norm = _harmonize_cols(all_rules_crm)
# drop_norm      = _harmonize_cols(crm_rules_drop)

# # Anti-join: keep rows from all_rules_crm not present in crm_rules_drop on the 4 keys
# rules_crm_not_dropped = (
#     all_rules_norm
#       .merge(drop_norm[key_cols].drop_duplicates(),
#              on=key_cols, how="left", indicator=True)
#       .query('_merge == "left_only"')
#       .drop(columns='_merge')
# )

# # (optional) restore original dtypes where possible
# rules_crm_not_dropped = rules_crm_not_dropped.astype(
#     {c: all_rules_crm[c].dtype for c in all_rules_crm.columns if c in rules_crm_not_dropped},
#     errors="ignore"
# )

# def _norm_enc(s: pd.Series) -> pd.Series:
#     return s.astype(str).str.strip().str.lower()

# rules_crm_not_dropped = rules_crm_not_dropped[_norm_enc(rules_crm_not_dropped['Encoding']).isin(set(map(str.lower, exclude_encodings))) == False]
# rules_crm_not_dropped
all_rules_norm

Unnamed: 0,Dataset,Labeling,Encoding,Rule,Odds ratio,LB odds ratio,Support LHS,Confidence,Lift,Conviction,n12,n21,Fair set count,Stratified,UB odds ratio
0,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...",465.00,65.361,0.077,0.903,1.656,4.683,465.0,1.0,952,True,3308.148
1,dhl,dhl,baseline,"['PALLET_CREATED_binned_(1.0, 199.0]'] --> Label",23.75,8.733,0.094,0.606,1.111,1.154,95.0,4.0,578,True,64.586
2,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...",53.00,7.329,0.033,0.627,1.379,1.462,53.0,1.0,220,True,383.253
3,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",53.00,7.329,0.038,0.671,1.477,1.660,53.0,1.0,324,True,383.253
4,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...",52.00,7.189,0.014,1.000,2.199,inf,52.0,1.0,212,True,376.155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824,dhl,dhl,seq_combined_data,"['mr[ORDER_CLOSED-complete]_binned_(-0.001, 1....",8.00,1.001,0.020,0.618,1.133,1.189,8.0,1.0,412,True,63.965
825,dhl,dhl,seq_combined_data,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",8.00,1.001,0.020,0.618,1.133,1.189,8.0,1.0,412,True,63.965
826,dhl,dhl,seq_combined_data,"['mr[SHIPUNIT_BUILT-complete]_binned_(-0.001, ...",8.00,1.001,0.020,0.618,1.133,1.189,8.0,1.0,412,True,63.965
827,dhl,dhl,seq_combined_data,"['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",8.00,1.001,0.034,0.669,1.227,1.375,8.0,1.0,632,True,63.965


## 7. Calculating Coverage CRM rules

In [37]:
try:
    exclude_encodings
except NameError:
    exclude_encodings = ["mr", "mra", "tr", "tra"]

base_dir = "3.2_binned_features"  # root folder

def _norm_enc(s: pd.Series) -> pd.Series:
    return s.astype(str).str.strip().str.lower()

def _strip_label_suffix(s: pd.Series) -> pd.Series:
    # drop trailing "_features" if present
    return s.astype(str).str.replace(r"_features$", "", regex=True)

# --- Canonicalize Labeling helpers (same logic as earlier cells) ---
def _build_known_prefixes_from_fs(base_dir: str) -> set:
    prefixes = set()
    # dataset folder names
    try:
        for d in os.listdir(base_dir):
            full = os.path.join(base_dir, d)
            if os.path.isdir(full):
                prefixes.add(str(d).strip().lower().replace(" ", ""))
    except FileNotFoundError:
        pass
    # also take from any existing comparison frames if present
    for _name in ("dt_comparison", "ripperk_comparison"):
        _df = globals().get(_name)
        if isinstance(_df, pd.DataFrame) and "Dataset" in _df.columns:
            prefixes |= set(
                _df["Dataset"].astype(str).str.strip().str.lower()
                   .str.replace(r"\s+", "", regex=True).unique().tolist()
            )
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != "nan"}

KNOWN_PREFIXES = _build_known_prefixes_from_fs(base_dir)

def _canon_label(name: str) -> str:
    s = str(name).strip().lower()
    s = re.sub(r'(_features?)$', '', s)   # drop suffix
    s = s.replace(' ', '_')
    # strip dataset prefixes repeatedly (e.g., "sepsis_", "bpi15a_")
    changed = True
    while changed:
        changed = False
        for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
            if s.startswith(p + "_"):
                s = s[len(p) + 1:]
                changed = True
    # family mapping
    if re.search(r'\bpayload(\d+)?\b', s):
        return 'payload'
    if re.fullmatch(r'(decl(are)?\d*)', s):
        return 'declare'
    if re.search(r'(^|[^a-z])mr([^a-z]|$)', s):
        return 'sequential'
    return s

# 1) Walk the directory tree and compute coverage per (Dataset, Labeling, Encoding)
coverage_records = []
for dataset in os.listdir(base_dir):
    dataset_path = os.path.join(base_dir, dataset)
    if not os.path.isdir(dataset_path):
        continue

    for labeling in os.listdir(dataset_path):
        labeling_path = os.path.join(dataset_path, labeling)
        if not os.path.isdir(labeling_path):
            continue

        for encoding in os.listdir(labeling_path):
            encoding_path = os.path.join(labeling_path, encoding)
            if not os.path.isdir(encoding_path):
                continue

            # Skip excluded encodings (case-insensitive)
            if _norm_enc(pd.Series([encoding])).iloc[0] in set(map(str.lower, exclude_encodings)):
                continue

            # Expect a single CSV file in this folder
            csv_files = [f for f in os.listdir(encoding_path) if f.endswith(".csv")]
            if not csv_files:
                continue

            csv_path = os.path.join(encoding_path, csv_files[0])
            try:
                df = pd.read_csv(csv_path)
            except Exception:
                continue

            if "Label" not in df.columns:
                continue

            # Make sure Label is numeric
            df["Label"] = pd.to_numeric(df["Label"], errors="coerce")

            total = len(df)
            num_normal  = int((df["Label"] == 0).sum())
            num_deviant = int((df["Label"] == 1).sum())

            if total == 0:
                pct_normal = pct_deviant = 0.0
            else:
                pct_normal  = num_normal / total * 100.0
                pct_deviant = num_deviant / total * 100.0

            # Canonicalize labeling right here so downstream merges match
            labeling_canon = _canon_label(labeling)

            coverage_records.append({
                "Dataset": dataset,
                "Labeling": labeling_canon,
                "Encoding": encoding,
                "#_normal": num_normal,
                "#_deviant": num_deviant,
                "%_normal": pct_normal,
                "%_deviant": pct_deviant
            })

crm_coverage = pd.DataFrame(coverage_records)

# 2) Normalize columns: (keep for idempotency) and round percentages
if not crm_coverage.empty:
    crm_coverage["Labeling"] = _strip_label_suffix(crm_coverage["Labeling"])
    crm_coverage["%_normal"]  = pd.to_numeric(crm_coverage["%_normal"], errors="coerce").round(2)
    crm_coverage["%_deviant"] = pd.to_numeric(crm_coverage["%_deviant"], errors="coerce").round(2)
    crm_coverage["#_normal"]  = pd.to_numeric(crm_coverage["#_normal"], errors="coerce").astype("Int64")
    crm_coverage["#_deviant"] = pd.to_numeric(crm_coverage["#_deviant"], errors="coerce").astype("Int64")

# 3) Compute CRM Rules from ALL rules (NO drop list)
key_cols = ["Dataset", "Labeling", "Encoding"]
rule_key_cols = key_cols + ["Rule"]

# --- Harmonize & canonicalize base rules (all_rules_crm) ---
if 'all_rules_crm' in globals() and isinstance(all_rules_crm, pd.DataFrame) and 'Rule' in all_rules_crm.columns:
    base = all_rules_crm.copy()

    # Normalize column names
    if 'Encoding' not in base.columns and 'Feature Encoding' in base.columns:
        base = base.rename(columns={'Feature Encoding': 'Encoding'})

    # Canonicalize labeling like earlier steps
    if 'Labeling' in base.columns:
        base['Labeling'] = base['Labeling'].apply(_canon_label)
        base['Labeling'] = _strip_label_suffix(base['Labeling'])

    # OPTIONAL: exclude encodings (keep this if you want to match coverage’s skip logic)
    if 'exclude_encodings' in globals():
        base = base[_norm_enc(base['Encoding']).isin(set(map(str.lower, exclude_encodings))) == False]

    # Count ALL unique rules per (Dataset, Labeling, Encoding) — no reference to crm_rules_drop
    cc_rules = (
        base.groupby(key_cols, dropna=False)['Rule']
            .nunique()
            .reset_index(name='CRM Rules')
    )
else:
    # Fallback: try existing comparison frames
    cc_frames = []
    for _name in ("dt_comparison", "ripperk_comparison"):
        _df = globals().get(_name)
        if isinstance(_df, pd.DataFrame) and all(c in _df.columns for c in key_cols + ["CRM Rules"]):
            tmp = _df.loc[:, key_cols + ["CRM Rules"]].copy()
            tmp["Labeling"] = tmp["Labeling"].apply(_canon_label)
            tmp["Labeling"] = _strip_label_suffix(tmp["Labeling"])
            if 'exclude_encodings' in globals():
                tmp = tmp[_norm_enc(tmp["Encoding"]).isin(set(map(str.lower, exclude_encodings))) == False]
            cc_frames.append(tmp)
    if cc_frames:
        cc_rules = (
            pd.concat(cc_frames, ignore_index=True)
              .groupby(key_cols, as_index=False)["CRM Rules"].max()
        )
    else:
        cc_rules = pd.DataFrame(columns=key_cols + ["CRM Rules"])

# 4) Merge CRM Rules into coverage (fill missing with 0)
crm_coverage = crm_coverage.merge(cc_rules, on=key_cols, how="left")
crm_coverage["CRM Rules"] = pd.to_numeric(crm_coverage["CRM Rules"], errors="coerce").fillna(0).astype("Int64")

# 5) Sort for readability
if not crm_coverage.empty:
    crm_coverage = crm_coverage.sort_values(key_cols).reset_index(drop=True)

crm_coverage

Unnamed: 0,Dataset,Labeling,Encoding,#_normal,#_deviant,%_normal,%_deviant,CRM Rules
0,BPI15A,declare,IMPresseD,404,793,33.75,66.25,0
1,BPI15A,declare,baseline,405,794,33.78,66.22,0
2,BPI15A,declare,bs_data,405,794,33.78,66.22,0
3,BPI15A,declare,bs_dwd,405,794,33.78,66.22,0
4,BPI15A,declare,dec_data,405,794,33.78,66.22,0
...,...,...,...,...,...,...,...,...
136,traffic,sequential,hybrid_dwd,131357,19739,86.94,13.06,0
137,traffic,sequential,hybrid_dwd_data,132815,19739,87.06,12.94,0
138,traffic,sequential,payload,130631,19739,86.87,13.13,0
139,traffic,sequential,seq_combined,130709,19739,86.88,13.12,0


In [38]:
# sum CRM Rules
sum_crm_rules = (
    crm_coverage["CRM Rules"]
      .sum()
)

print(sum_crm_rules)


0


In [39]:
# 1) Harmonize column names
crm_df = all_rules_crm.copy()
if 'Encoding' not in crm_df.columns and 'Feature Encoding' in crm_df.columns:
    crm_df = crm_df.rename(columns={'Feature Encoding': 'Encoding'})

# Ensure numeric types we sort/filter on
crm_df['LB odds ratio'] = pd.to_numeric(crm_df['LB odds ratio'], errors='coerce')
if 'Confidence' in crm_df.columns:
    crm_df['Confidence'] = pd.to_numeric(crm_df['Confidence'], errors='coerce')

# --- Canonicalize Labeling like earlier (payload*→payload, decl/declare*→declare, mr*→sequential)
def _build_known_prefixes(df_list):
    prefixes = set()
    for df in df_list:
        if isinstance(df, pd.DataFrame) and 'Dataset' in df.columns:
            s = pd.Series(df['Dataset']).astype(str).str.strip().str.lower()
            prefixes.update(s.str.replace(r'\s+', '', regex=True).unique().tolist())
    prefixes |= {'sepsis', 'traffic', 'bpi15a', 'bpic15a', 'bpic2015', 'bpi2015', 'bpi15'}
    return {p for p in prefixes if p and p != 'nan'}

try:
    KNOWN_PREFIXES  # reuse if already defined
except NameError:
    KNOWN_PREFIXES = _build_known_prefixes([crm_df])

def _canon_label(lbl: str) -> str:
    s = str(lbl).strip().lower()
    s = re.sub(r'(_features?)$', '', s)   # drop _feature/_features if present
    s = s.replace(' ', '_')
    # strip dataset prefixes repeatedly
    changed = True
    while changed:
        changed = False
        for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
            if s.startswith(p + '_'):
                s = s[len(p) + 1:]
                changed = True
    if re.search(r'\bpayload(\d+)?\b', s):
        return 'payload'
    if re.fullmatch(r'(decl(are)?\d*)', s):
        return 'declare'
    if re.search(r'(^|[^a-z])mr([^a-z]|$)', s):
        return 'sequential'
    return s

if 'Labeling' in crm_df.columns:
    crm_df['Labeling'] = crm_df['Labeling'].apply(_canon_label)

# 2) Filter for LB odds ratio > 1
crm_filtered = crm_df[crm_df['LB odds ratio'] > 1].copy()

# 3) Sort & select top-5 per experiment
crm_filtered = crm_filtered.sort_values(
    by=['Dataset', 'Labeling', 'Encoding', 'LB odds ratio', 'Confidence'],
    ascending=[True, True, True, False, False]
)
top5_rules_df = (
    crm_filtered
    .groupby(['Dataset', 'Labeling', 'Encoding'], group_keys=False)
    .head(5)
    .reset_index(drop=True)
)

# 4) Exact LHS and RHS extraction (no processing of LHS)
def extract_lhs_exact(rule_str: str) -> str:
    s = str(rule_str)
    m = re.search(r"^(.*?)(?=\s*-->)", s)
    return m.group(1) if m else s

def parse_rhs_label(rule_str: str):
    s = str(rule_str)
    m = re.search(r"-->\s*(Label|!Label)", s)
    if not m:
        return None
    return 1 if m.group(1) == "Label" else 0

top5_rules_df['LHS_features'] = top5_rules_df['Rule'].apply(extract_lhs_exact)
top5_rules_df['RHS_label']    = top5_rules_df['Rule'].apply(parse_rhs_label)

# 5) Final table
top5_crm_rules_expanded = top5_rules_df[
    ['Dataset', 'Labeling', 'Encoding', 'Rule', 'LHS_features', 'RHS_label', 'LB odds ratio']
].reset_index(drop=True)

top5_crm_rules_expanded.sort_values(
    by="LB odds ratio",
    ascending=False
).reset_index(drop=True)


Unnamed: 0,Dataset,Labeling,Encoding,Rule,LHS_features,RHS_label,LB odds ratio
0,dhl,dhl,payload,['trace:Label|first|discrete_0.0'] --> !Label,['trace:Label|first|discrete_0.0'],0,209.037
1,dhl,dhl,hybrid,"[""responded_existence:('PICKED', 'ORDER_ACKNOW...","[""responded_existence:('PICKED', 'ORDER_ACKNOW...",1,144.383
2,dhl,dhl,payload,['trace:Label|first|discrete_1.0'] --> Label,['trace:Label|first|discrete_1.0'],1,137.340
3,dhl,dhl,declare,"[""responded_existence:('RELEASED_FOR_PICKING',...","[""responded_existence:('RELEASED_FOR_PICKING',...",1,134.241
4,dhl,dhl,declare,"[""alternate_response:('ORDER_CLOSED', 'ORDER_A...","[""alternate_response:('ORDER_CLOSED', 'ORDER_A...",1,134.241
...,...,...,...,...,...,...,...
67,dhl,dhl,hybrid_dwd_data,['DEPARTURE_NO|first|continuous_binned_(535102...,['DEPARTURE_NO|first|continuous_binned_(535102...,1,5.217
68,dhl,dhl,hybrid_dwd_data,"['response:(RELEASED_FOR_PICKING,PICKED):Data_...","['response:(RELEASED_FOR_PICKING,PICKED):Data_...",1,5.217
69,dhl,dhl,seq_combined,['mr[ORDER_ACKNOWLEDGED-complete]_binned_(-0.0...,['mr[ORDER_ACKNOWLEDGED-complete]_binned_(-0.0...,0,3.387
70,dhl,dhl,seq_combined,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",0,1.841


In [40]:
# --- helpers (derived from your approach) ---

def _find_outer_brackets_span(text: str):
    """Return (start_idx, end_idx) of the outermost [...] in `text`."""
    start = text.find('[')
    if start < 0:
        return None, None

    depth = 0
    in_s = in_d = esc = False
    end = None
    for i, ch in enumerate(text[start:], start):
        if esc:
            esc = False
            continue
        if ch == '\\':
            esc = True
            continue

        if in_s:
            if ch == "'":
                in_s = False
            continue
        if in_d:
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            in_s = True
            continue
        if ch == '"':
            in_d = True
            continue

        if ch == '[':
            depth += 1
            continue
        if ch == ']':
            depth -= 1
            if depth == 0:
                end = i
                break
    return (start, end)

def _split_top_level_commas(content: str):
    """Split `content` on commas that are outside quotes."""
    parts, curr = [], ""
    in_s = in_d = esc = False
    for ch in content:
        if esc:
            curr += ch
            esc = False
            continue
        if ch == '\\':
            curr += ch
            esc = True
            continue

        if in_s:
            curr += ch
            if ch == "'":
                in_s = False
            continue
        if in_d:
            curr += ch
            if ch == '"':
                in_d = False
            continue

        if ch == "'":
            curr += ch
            in_s = True
            continue
        if ch == '"':
            curr += ch
            in_d = True
            continue

        if ch == ',':
            parts.append(curr.strip())
            curr = ""
        else:
            curr += ch
    parts.append(curr.strip())
    return parts

def _strip_one_layer_quotes(s: str):
    s = s.strip()
    if len(s) >= 2 and ((s[0] == s[-1] == "'") or (s[0] == s[-1] == '"')):
        return s[1:-1]
    return s

def split_lhs_items(lhs_text: str):
    """
    lhs_text is exactly what's before '-->', e.g. "['A', 'B', 'C']" or "['A']".
    Return a list like ['A','B','C'] (no outer quotes/brackets).
    """
    if not isinstance(lhs_text, str):
        return []
    start, end = _find_outer_brackets_span(lhs_text)
    if start is None or end is None:
        return []

    inner = lhs_text[start+1:end]  # content inside [...]
    raw_items = _split_top_level_commas(inner)
    # remove one layer of quotes from each item, keep everything else intact
    return [_strip_one_layer_quotes(x).strip() for x in raw_items if x != ""]

def _pad3(items):
    items = items[:3]
    return items + [""] * (3 - len(items))

# --- apply to your dataframe ---

# top5_crm_rules_expanded['LHS_features'] contains strings like "['A', 'B']"
lhs_split_series = top5_crm_rules_expanded['LHS_features'].apply(split_lhs_items).apply(_pad3)
lhs_df = pd.DataFrame(lhs_split_series.tolist(), columns=['feature_1_lhs','feature_2_lhs','feature_3_lhs'])

top5_crm_rules_expanded = pd.concat([top5_crm_rules_expanded, lhs_df], axis=1)

top5_crm_rules_expanded.sort_values(
    by="LB odds ratio",
    ascending=False
).reset_index(drop=True)


Unnamed: 0,Dataset,Labeling,Encoding,Rule,LHS_features,RHS_label,LB odds ratio,feature_1_lhs,feature_2_lhs,feature_3_lhs
0,dhl,dhl,payload,['trace:Label|first|discrete_0.0'] --> !Label,['trace:Label|first|discrete_0.0'],0,209.037,trace:Label|first|discrete_0.0,,
1,dhl,dhl,hybrid,"[""responded_existence:('PICKED', 'ORDER_ACKNOW...","[""responded_existence:('PICKED', 'ORDER_ACKNOW...",1,144.383,"responded_existence:('PICKED', 'ORDER_ACKNOWLE...",,
2,dhl,dhl,payload,['trace:Label|first|discrete_1.0'] --> Label,['trace:Label|first|discrete_1.0'],1,137.340,trace:Label|first|discrete_1.0,,
3,dhl,dhl,declare,"[""responded_existence:('RELEASED_FOR_PICKING',...","[""responded_existence:('RELEASED_FOR_PICKING',...",1,134.241,"responded_existence:('RELEASED_FOR_PICKING', '...",,
4,dhl,dhl,declare,"[""alternate_response:('ORDER_CLOSED', 'ORDER_A...","[""alternate_response:('ORDER_CLOSED', 'ORDER_A...",1,134.241,"alternate_response:('ORDER_CLOSED', 'ORDER_ACK...",,
...,...,...,...,...,...,...,...,...,...,...
67,dhl,dhl,hybrid_dwd_data,['DEPARTURE_NO|first|continuous_binned_(535102...,['DEPARTURE_NO|first|continuous_binned_(535102...,1,5.217,DEPARTURE_NO|first|continuous_binned_(535102.0...,"not_chain_response:(PICKED,ORDER_ACKNOWLEDGED)...",
68,dhl,dhl,hybrid_dwd_data,"['response:(RELEASED_FOR_PICKING,PICKED):Data_...","['response:(RELEASED_FOR_PICKING,PICKED):Data_...",1,5.217,"response:(RELEASED_FOR_PICKING,PICKED):Data_bi...",DEPARTURE_NO|first|continuous_binned_(535102.0...,
69,dhl,dhl,seq_combined,['mr[ORDER_ACKNOWLEDGED-complete]_binned_(-0.0...,['mr[ORDER_ACKNOWLEDGED-complete]_binned_(-0.0...,0,3.387,mr[ORDER_ACKNOWLEDGED-complete]_binned_(-0.001...,,
70,dhl,dhl,seq_combined,"['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...","['mr[ORDER_GATEWAY-complete, ORDER_OPENED-comp...",0,1.841,"mr[ORDER_GATEWAY-complete, ORDER_OPENED-comple...",,


In [41]:
NUM_SUFFIX_RE = re.compile(r"_(\-?\d+(?:\.\d+)?)$")  # matches _1, _1.0, _0, _0.0, _-1, _-1.0 at the end

LABEL_FOLDER_MAP = {
    "sepsis": {
        "declare":    "sepsis_decl_features",
        "sequential": "sepsis_mr_tr_features",
        "payload":    "sepsis_payload2_features",
    },
    "BPI15A": {
        "declare":    "BPI15A_decl2_features",
        "sequential": "BPI15A_mr_tr_features",
        "payload":    "BPI15A_payload_560925_features",
    },
    "traffic": {
        "declare":    "traffic_decl3_features",
        "sequential": "traffic_mr_tr_features",
        "payload":    "traffic_payload_Pay36_features",
    },
}

base_dir = "3.2_binned_features"

# --- matching helpers ---

def _norm_numeric(col: pd.Series) -> pd.Series:
    """
    Normalize a column for reliable numeric comparisons:
    - bool -> 0/1
    - strings like '1','0','-1.0' -> numeric (where possible)
    - if coercion yields all NaN and original is object, return original for string compares
    """
    if col.dtype == bool:
        return col.astype(int)
    out = pd.to_numeric(col, errors='coerce')
    if out.isna().all() and col.dtype == object:
        return col  # keep as strings for string equality checks
    return out

def _infer_case_col(df: pd.DataFrame) -> str:
    for c in ["Case_ID", "case:concept:name", "Case ID", "case_id"]:
        if c in df.columns:
            return c
    raise KeyError("No Case ID column found (tried: Case_ID, case:concept:name, Case ID, case_id)")

def _match_single_feature(df: pd.DataFrame, feat: str) -> pd.Series:
    """
    Return a boolean mask of rows matching a *single* LHS feature.

    Cases handled:
      1) Exact one-hot column: feat equals a column name -> column == 1
      2) Binned value: base_col_(...) or base_col_[...] -> df[base_col] == "(...]" or "[...]"
      3) Suffix _0/_1: base column exists with 0/1 value
    Fallback -> no rows match.
    """
    # 1) Exact column match (treat as one-hot)
    if feat in df.columns:
        col = _norm_numeric(df[feat])
        return (col == 1) if pd.api.types.is_numeric_dtype(col) else (col.astype(str) == "1")

    # 2) Binned pattern: split at last "_(" or "_["
    pos1 = feat.rfind("_(")
    pos2 = feat.rfind("_[")
    split_pos = max(pos1, pos2)
    if split_pos != -1:
        base_col = feat[:split_pos]
        bin_val  = feat[split_pos+1:]  # drop the underscore before bracket
        if base_col in df.columns:
            return (df[base_col].astype(str) == bin_val)

    # 3) General numeric suffix: ..._<number> at the END (e.g., _1.0, _0, _-1.0)
    m = NUM_SUFFIX_RE.search(feat)
    if m:
        base_col = feat[:m.start()]
        desired_str = m.group(1)        # e.g., "1.0", "0", "-1.0"
        desired = float(desired_str)    # numeric compare works for 1 vs 1.0, -1 vs -1.0

        # Case: base column exists -> compare its value to the numeric suffix
        if base_col in df.columns:
            col = _norm_numeric(df[base_col])
            if pd.api.types.is_numeric_dtype(col):
                mask = (col == desired)
            else:
                # fallback to string comparison if column is non-numeric
                mask = (col.astype(str) == desired_str)
            return mask.fillna(False)

        # Rare fallback: indicator column named with the full suffix exists
        # Treat as one-hot (==1)
        if feat in df.columns:
            col = _norm_numeric(df[feat])
            if pd.api.types.is_numeric_dtype(col):
                mask = (col == 1)
            else:
                mask = (col.astype(str) == "1")
            return mask.fillna(False)

    # No match strategy -> False
    return pd.Series(False, index=df.index)

def _match_rule(df: pd.DataFrame, features: list, rhs_label: int) -> pd.Series:
    mask = pd.Series(True, index=df.index)
    for f in features:
        if f:  # skip empty slots
            mask &= _match_single_feature(df, f)
            if not mask.any():
                break
    if rhs_label in (0, 1):
        mask &= (pd.to_numeric(df["Label"], errors="coerce") == rhs_label)
    else:
        mask &= False
    return mask

# --- helper: map free-form labeling -> {'declare','sequential','payload'} ---
def _label_category(labeling: str) -> str | None:
    s = str(labeling).lower().strip()
    if "mr_tr" in s or "mrtr" in s or s.startswith("seq") or "sequen" in s:
        return "sequential"
    if s.startswith("decl") or "declare" in s:
        return "declare"
    if "payload" in s:
        return "payload"
    if s in {"declare","sequential","payload"}:
        return s
    return None

# --- helper: find full path using explicit per-dataset mapping with light fallbacks ---
def find_encoding_path(base_dir: str, dataset: str, labeling: str, encoding: str):
    cat = _label_category(labeling)
    # 1) explicit mapping
    folder = LABEL_FOLDER_MAP.get(dataset, {}).get(cat, None)
    if folder:
        path = os.path.join(base_dir, dataset, folder, encoding)
        if os.path.isdir(path):
            return path
    # 2) common literal fallbacks
    dataset_dir = os.path.join(base_dir, dataset)
    candidates = []
    if cat:
        candidates += [f"{dataset}_{cat}_features", f"{cat}_features"]
    candidates += [f"{dataset}_{labeling}_features", f"{labeling}_features"]
    for lf in candidates:
        lf_path = os.path.join(dataset_dir, lf)
        if os.path.isdir(lf_path):
            enc_path = os.path.join(lf_path, encoding)
            if os.path.isdir(enc_path):
                return enc_path
    return None

# ---------- PASS 1: per-rule covered cases ----------
# Ensure output columns exist
if "covered_case_ids" not in top5_crm_rules_expanded.columns:
    top5_crm_rules_expanded["covered_case_ids"] = [[] for _ in range(len(top5_crm_rules_expanded))]
if "n_covered_cases" not in top5_crm_rules_expanded.columns:
    top5_crm_rules_expanded["n_covered_cases"] = 0

for (dataset, labeling, encoding), rules_df in top5_crm_rules_expanded.groupby(["Dataset", "Labeling", "Encoding"]):
    enc_path = find_encoding_path(base_dir, dataset, labeling, encoding)
    if enc_path is None:
        continue

    csv_files = [f for f in os.listdir(enc_path) if f.endswith(".csv")]
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])

    df_enc = pd.read_csv(csv_path)
    if "Label" not in df_enc.columns:
        raise KeyError(f"No 'Label' column found in: {csv_path}")
    case_col = _infer_case_col(df_enc)

    for idx, row in rules_df.iterrows():
        feats = [row.get("feature_1_lhs",""), row.get("feature_2_lhs",""), row.get("feature_3_lhs","")]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
        rhs   = row["RHS_label"]

        rule_mask = _match_rule(df_enc, feats, rhs)
        case_ids = df_enc.loc[rule_mask, case_col].dropna().astype(str).unique().tolist()

        top5_crm_rules_expanded.at[idx, "covered_case_ids"] = case_ids
        top5_crm_rules_expanded.at[idx, "n_covered_cases"]  = len(case_ids)

# ---------- PASS 2: percentage of class covered ----------
if "pct_of_class_covered" not in top5_crm_rules_expanded.columns:
    top5_crm_rules_expanded["pct_of_class_covered"] = 0.0

for (dataset, labeling, encoding), rules_df in top5_crm_rules_expanded.groupby(["Dataset", "Labeling", "Encoding"]):
    enc_path = find_encoding_path(base_dir, dataset, labeling, encoding)
    if enc_path is None:
        continue

    csv_files = [f for f in os.listdir(enc_path) if f.endswith(".csv")]
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])

    df_enc = pd.read_csv(csv_path)
    if "Label" not in df_enc.columns:
        raise KeyError(f"No 'Label' column found in: {csv_path}")

    labels_num = pd.to_numeric(df_enc["Label"], errors="coerce")
    total_normal  = int((labels_num == 0).sum())
    total_deviant = int((labels_num == 1).sum())

    case_col = _infer_case_col(df_enc)

    for idx, row in rules_df.iterrows():
        feats = [row.get("feature_1_lhs",""), row.get("feature_2_lhs",""), row.get("feature_3_lhs","")]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
        rhs   = row["RHS_label"]

        rule_mask = _match_rule(df_enc, feats, rhs)
        case_ids = df_enc.loc[rule_mask, case_col].dropna().astype(str).unique().tolist()

        top5_crm_rules_expanded.at[idx, "covered_case_ids"] = case_ids
        top5_crm_rules_expanded.at[idx, "n_covered_cases"]  = len(case_ids)

        denom = total_deviant if rhs == 1 else total_normal
        pct = (len(case_ids) / denom * 100.0) if denom > 0 else 0.0
        top5_crm_rules_expanded.at[idx, "pct_of_class_covered"] = round(pct, 2)

top5_crm_rules_expanded

Unnamed: 0,Dataset,Labeling,Encoding,Rule,LHS_features,RHS_label,LB odds ratio,feature_1_lhs,feature_2_lhs,feature_3_lhs,covered_case_ids,n_covered_cases,pct_of_class_covered
0,dhl,dhl,baseline,"['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]'] -->...","['ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]']",1,65.361,"ORDER_ACKNOWLEDGED_binned_(1.0, 200.0]",,,[],0,0.0
1,dhl,dhl,baseline,"['PALLET_CREATED_binned_(1.0, 199.0]'] --> Label","['PALLET_CREATED_binned_(1.0, 199.0]']",1,8.733,"PALLET_CREATED_binned_(1.0, 199.0]",,,[],0,0.0
2,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'SHIPUNIT...",0,7.329,"ORDER_CLOSED_binned_(1.0, 200.0]","SHIPUNIT_BUILT_binned_(-0.001, 1.0]",,[],0,0.0
3,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PALLET_C...",0,7.329,"ORDER_CLOSED_binned_(1.0, 200.0]","PALLET_CREATED_binned_(-0.001, 1.0]",,[],0,0.0
4,dhl,dhl,baseline,"['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...","['ORDER_CLOSED_binned_(1.0, 200.0]', 'PACKED_b...",0,7.189,"ORDER_CLOSED_binned_(1.0, 200.0]","PACKED_binned_(-0.001, 1.0]",,[],0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,dhl,dhl,seq_combined_data,"['mra[SHIPUNIT_BUILT-complete, SCAN_TO_GATE-co...","['mra[SHIPUNIT_BUILT-complete, SCAN_TO_GATE-co...",0,43.810,"mra[SHIPUNIT_BUILT-complete, SCAN_TO_GATE-comp...",,,[],0,0.0
68,dhl,dhl,seq_combined_data,"['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...","['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...",1,37.472,"mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMESTAM...",,,[],0,0.0
69,dhl,dhl,seq_combined_data,"['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...","['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...",0,34.654,"mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMESTAM...",,,[],0,0.0
70,dhl,dhl,seq_combined_data,"['mr[RELEASED_FOR_PICKING-complete, PICKED-com...","['mr[RELEASED_FOR_PICKING-complete, PICKED-com...",1,19.460,"mr[RELEASED_FOR_PICKING-complete, PICKED-compl...","LUID|first|continuous_binned_(-0.001, 40026172...",,[],0,0.0


In [42]:
# --- Build the export frame ---
required_cols = ['Dataset', 'Labeling', 'Encoding', 'Rule', 'LB odds ratio', 'pct_of_class_covered']
missing = [c for c in required_cols if c not in top5_crm_rules_expanded.columns]
if missing:
    raise KeyError(f"top5_crm_rules_expanded is missing columns: {missing}")

export_df = top5_crm_rules_expanded[required_cols].copy()

# --- Output paths ---
out_dir = os.path.join('5_analysis')
os.makedirs(out_dir, exist_ok=True)

csv_path = os.path.join(out_dir, 'top5_rule_coverage.csv')
tex_path = os.path.join(out_dir, 'top5_rule_coverage.tex')

# --- Save CSV (raw values) ---
export_df.to_csv(csv_path, index=False)

export_df.to_latex(
    tex_path,
    index=False,
    escape=False,
    longtable=True,
    formatters={
        "Encoding": fmt_detok,
        "Rule": fmt_detok,
        "pct_of_class_covered": lambda x: f"{x:.2f}" if pd.notna(x) else "",
        "LB odds ratio": lambda x: f"{x:.2f}" if pd.notna(x) else ""
    }
)

print(f"✅ Saved top5_rule_coverage.csv → {csv_path}")
print(f"✅ Saved top5_rule_coverage.tex → {tex_path}")

✅ Saved top5_rule_coverage.csv → 5_analysis/top5_rule_coverage.csv
✅ Saved top5_rule_coverage.tex → 5_analysis/top5_rule_coverage.tex


In [43]:
# --- Top-5 union coverage per experiment (normal vs deviant) ---

# 1) Build union of covered cases per class for each (Dataset, Labeling, Encoding)
try:
    LABEL_FOLDER_MAP
except NameError:
    LABEL_FOLDER_MAP = {"mr_tr":"sequential", "decl":"declare", "payload":"payload"}

def _strip_dataset_prefix(label: str, dataset: str) -> str:
    if not isinstance(label, str) or not isinstance(dataset, str):
        return label
    pref = f"{dataset}_"
    return label[len(pref):] if label.lower().startswith(pref.lower()) else label

def _label_key(label: str, dataset: str) -> str:
    """Normalize a labeling to the folder key used for joining."""
    if not isinstance(label, str):
        return label
    s = label.strip()
    s = re.sub(r"_features$", "", s, flags=re.I)   # drop trailing _features
    s = _strip_dataset_prefix(s, dataset)          # drop leading '<dataset>_'
    key = s.lower().replace(" ", "")
    # dataset-specific mapping has priority
    if isinstance(LABEL_FOLDER_MAP.get(dataset), dict):
        return LABEL_FOLDER_MAP[dataset].get(key, key)
    return LABEL_FOLDER_MAP.get(key, key)

# --- Top-5 union coverage per experiment (normal vs deviant) ---

# 1) Build union of covered cases per class for each (Dataset, Labeling, Encoding)
union_rows = []

for (ds, lab, enc), g in top5_crm_rules_expanded.groupby(["Dataset", "Labeling", "Encoding"]):
    # Resolve path exactly like earlier code
    enc_path = find_encoding_path(base_dir, ds, lab, enc)
    if enc_path is None:
        # couldn't resolve; skip this (Dataset, Labeling, Encoding)
        continue

    # Load the encoded CSV used for this experiment
    csv_files = [f for f in os.listdir(enc_path) if f.endswith(".csv")]
    if not csv_files:
        continue
    csv_path = os.path.join(enc_path, csv_files[0])
    df_enc = pd.read_csv(csv_path)

    if "Label" not in df_enc.columns:
        raise KeyError(f"No 'Label' column found in: {csv_path}")

    # Denominators from the same file (consistent with earlier passes)
    labels_num = pd.to_numeric(df_enc["Label"], errors="coerce")
    total_normal  = int((labels_num == 0).sum())
    total_deviant = int((labels_num == 1).sum())

    # Case ID column detection identical to earlier code
    case_col = _infer_case_col(df_enc)

    # Build unions by recomputing coverage with the SAME matcher
    normal_union  = set()
    deviant_union = set()

    for _, r in g.iterrows():
        feats = [r.get("feature_1_lhs",""), r.get("feature_2_lhs",""), r.get("feature_3_lhs","")]
        feats = [f for f in feats if isinstance(f, str) and f.strip() != ""]
        rhs   = r.get("RHS_label", None)

        rule_mask = _match_rule(df_enc, feats, rhs)
        case_ids = df_enc.loc[rule_mask, case_col].dropna().astype(str).tolist()

        if rhs == 0:
            normal_union.update(case_ids)
        elif rhs == 1:
            deviant_union.update(case_ids)

    # Row for this experiment
    n_normal_union  = len(normal_union)
    n_deviant_union = len(deviant_union)

    union_rows.append({
        "Dataset": ds,
        "Labeling": lab,
        "Encoding": enc,
        "#_normal": total_normal,
        "#_deviant": total_deviant,
        "top5_cov_normal_n":  n_normal_union,
        "top5_cov_deviant_n": n_deviant_union,
        "top5_cov_normal_pct":  round((n_normal_union  / total_normal)  * 100.0, 2) if total_normal  > 0 else 0.0,
        "top5_cov_deviant_pct": round((n_deviant_union / total_deviant) * 100.0, 2) if total_deviant > 0 else 0.0,
    })

crm_top5_union_coverage = pd.DataFrame(union_rows)

# (Optional) If you still want these percentages visible in crm_coverage, do a straight key merge:
# NOTE: This assumes Dataset/Labeling/Encoding strings in crm_coverage match those in top5_crm_rules_expanded.
# If your Labeling strings differ (e.g., with prefixes/suffixes), consider joining on a normalized key.
try:
    crm_coverage = crm_coverage.merge(
        crm_top5_union_coverage[["Dataset", "Labeling", "Encoding",
                                 "top5_cov_normal_pct", "top5_cov_deviant_pct"]],
        on=["Dataset", "Labeling", "Encoding"],
        how="left"
    )
except NameError:
    # crm_coverage not defined in this notebook; nothing to merge
    pass

# Inspect
crm_top5_union_coverage

KeyError: "None of [Index(['Dataset', 'Labeling', 'Encoding', 'top5_cov_normal_pct',\n       'top5_cov_deviant_pct'],\n      dtype='object')] are in the [columns]"

In [None]:
# ---------- Final coverage + top-5 composition per experiment (robust join) ----------

import pandas as pd
import re

# Safety checks
if 'crm_top5_union_coverage' not in globals():
    raise RuntimeError("crm_top5_union_coverage not found. Run the union-coverage step first.")
if 'top5_crm_rules_expanded' not in globals():
    raise RuntimeError("top5_crm_rules_expanded not found. Run the top-5 extraction step first.")

# --- helpers consistent with earlier cells ---
def _strip_dataset_prefix(label: str, dataset: str) -> str:
    if not isinstance(label, str) or not isinstance(dataset, str):
        return str(label)
    pref = f"{dataset}_"
    s = label
    if s.lower().startswith(pref.lower()):
        s = s[len(pref):]
    return s

def _normalize_label_for_join(label: str, dataset: str) -> str:
    """Normalize labeling to a stable category key for joining."""
    s = str(label).strip()
    s = re.sub(r"_features$", "", s, flags=re.I)
    s = _strip_dataset_prefix(s, dataset)
    s_low = s.lower()

    # same categorization as earlier
    if ("mr_tr" in s_low) or ("mrtr" in s_low) or s_low.startswith("seq") or ("sequen" in s_low):
        return "sequential"
    if s_low.startswith("decl") or ("declare" in s_low):
        return "declare"
    if "payload" in s_low:
        return "payload"
    # fallback: return cleaned token
    return s_low.replace(" ", "")

# 1) Start from the union-coverage table (computed with find_encoding_path + _match_rule)
need_cols = ['Dataset', 'Labeling', 'Encoding', 'top5_cov_normal_pct', 'top5_cov_deviant_pct']
missing = [c for c in need_cols if c not in crm_top5_union_coverage.columns]
if missing:
    raise KeyError(f"crm_top5_union_coverage is missing columns: {missing}")

final_df = crm_top5_union_coverage[need_cols].copy()

# Build a normalized join key
final_df["LabelKey"] = final_df.apply(lambda r: _normalize_label_for_join(r["Labeling"], r["Dataset"]), axis=1)

# 2) Count how many of the top-5 rules per (Dataset, Labeling, Encoding) target deviant vs normal
top5_tmp = top5_crm_rules_expanded.copy()
top5_tmp["LabelKey"] = top5_tmp.apply(lambda r: _normalize_label_for_join(r["Labeling"], r["Dataset"]), axis=1)

top5_counts = (
    top5_tmp
    .groupby(['Dataset', 'Encoding', 'LabelKey'], as_index=False)
    .agg(
        top5_rules_deviant=('RHS_label', lambda s: int((s == 1).sum())),
        top5_rules_normal =('RHS_label', lambda s: int((s == 0).sum())),
        top5_rules_total  =('RHS_label', 'size')
    )
)

# 3) Merge counts into coverage percentages on the robust key
final_df = final_df.merge(
    top5_counts[['Dataset','Encoding','LabelKey','top5_rules_deviant','top5_rules_normal']],
    on=['Dataset','Encoding','LabelKey'],
    how='left'
)

# 4) Fill missing with zeros and set integer dtype; also replace Labeling with the normalized category
for c in ['top5_rules_deviant','top5_rules_normal']:
    if c in final_df.columns:
        final_df[c] = pd.to_numeric(final_df[c], errors='coerce').fillna(0).astype('Int64')

# Use the normalized category as the reporting label
final_df["Labeling"] = final_df["LabelKey"]
final_df = final_df.drop(columns=["LabelKey"])

# 5) Pretty names + ordering
rename_map = {
    'top5_cov_normal_pct': 'Top5 !Z Coverage',
    'top5_cov_deviant_pct': 'Top5 Z Coverage',
    'top5_rules_deviant':  'Top5 Z Rules',
    'top5_rules_normal':   'Top5 !Z Rules',
}
final_pretty = final_df.rename(columns=rename_map)

desired_order = [
    'Dataset', 'Labeling', 'Encoding',
    'Top5 Z Rules', 'Top5 Z Coverage',
    'Top5 !Z Rules', 'Top5 !Z Coverage'
]
final_pretty = (
    final_pretty[[c for c in desired_order if c in final_pretty.columns]]
    if (set(desired_order) - set(final_pretty.columns))
    else final_pretty[desired_order]
)

# 6) Sort for readability and expose as coverage_crm
coverage_crm = final_pretty.sort_values(['Dataset','Labeling','Encoding']).reset_index(drop=True)

# Optional: if you want percentages as strings like "40.00%", uncomment:
# for col in ['Top5 Z Coverage','Top5 !Z Coverage']:
#     if col in coverage_crm.columns:
#         coverage_crm[col] = coverage_crm[col].map(lambda v: f"{v:.2f}%" if pd.notna(v) else "")

coverage_crm


In [None]:
out_dir = os.path.join('5_analysis')
os.makedirs(out_dir, exist_ok=True)

csv_path_coverage_crm = os.path.join(out_dir, 'coverage_crm_summary.csv')
tex_path_coverage_crm = os.path.join(out_dir, 'coverage_crm_summary.tex')
coverage_crm.to_csv(csv_path_coverage_crm, index=False)

coverage_crm.to_latex(
    tex_path_coverage_crm,
    index=False,
    escape=False,
    longtable=True,
    formatters={
        "Encoding": fmt_detok,
        "Labeling": fmt_detok,
        "Top5 Z Coverage":    lambda x: f"{x:.2f}%",
        "Top5 !Z Coverage":   lambda x: f"{x:.2f}%",
        "Top5 Z Coverage":    fmt_detok,
        "Top5 !Z Coverage":   fmt_detok
    }
)

print(f"✅ Saved coverage_crm_summary.csv → {csv_path_coverage_crm}")

In [None]:
# # --- Append CRM top-5 union coverage to comparison_counts ---

# # Safety: normalize "Labeling" suffix just in case
# cov_subset = crm_coverage.copy()
# cov_subset["Labeling"] = cov_subset["Labeling"].str.replace(r"_features$", "", regex=True)

# # Keep only what's needed and rename to requested column names
# cov_subset = (
#     cov_subset[["Dataset", "Labeling", "Encoding", "top5_cov_normal_pct", "top5_cov_deviant_pct"]]
#     .rename(columns={
#         "top5_cov_normal_pct": "% CRM !Z Coverage",   # normal (= !Label)
#         "top5_cov_deviant_pct": "% CRM Z Coverage",   # deviant (= Label)
#     })
# )

# # Merge into comparison_counts
# crm_coverage = crm_coverage.merge(
#     cov_subset,
#     on=["Dataset", "Labeling", "Encoding"],
#     how="left"
# )

# # Clean up types and rounding
# for col in ["% CRM !Z Coverage", "% CRM Z Coverage"]:
#     if col in crm_coverage.columns:
#         crm_coverage[col] = pd.to_numeric(crm_coverage[col], errors="coerce").fillna(0.0).round(2)

# crm_coverage


## 80% coverage

In [None]:
# --- Minimal rule set to reach target coverage (greedy set cover) ---
from collections import defaultdict

TARGET_COVERAGE = 0.80  # 80% threshold
MODES = ("overall", "pos", "neg")  # overall = all cases, pos = RHS=1, neg = RHS=0

# Sanity checks
needed_cols = {"Dataset","Labeling","Encoding","RHS_label","covered_case_ids"}
missing = needed_cols - set(top5_crm_rules_expanded.columns)
if missing:
    raise ValueError(f"top5_crm_rules_expanded is missing columns: {missing}. "
                     f"Run your pass that computes 'covered_case_ids' first.")

def _tie_break_score(row):
    """
    Higher is better. Used when two rules add the same # of new cases.
    Prefers LB odds ratio, then Precision, then Recall if present.
    """
    def _g(col): 
        return float(row[col]) if col in row and pd.notna(row[col]) else float("-inf")
    return (_g("LB odds ratio"), _g("Precision"), _g("Recall"))

def _greedy_cover_for_mode(rules_subset_df, target_case_ids, prefer_cols=True):
    """
    Greedy set cover:
    - rules_subset_df must have index (carry it through to reference selected rows)
    - 'covered_case_ids' column contains list of case ids per rule
    - target_case_ids: set of case ids we aim to cover
    Returns dict with selection and coverage stats.
    """
    target = set(map(str, target_case_ids))
    if not target:
        return {
            "selected_rule_indices": [],
            "selected_rules_step_gain": [],
            "covered_cases": set(),
            "coverage_frac": 0.0,
            "achievable_frac": 0.0,
        }

    # Precompute each rule's case set (as strings)
    rule_cases = {}
    for idx, row in rules_subset_df.iterrows():
        rule_cases[idx] = set(map(str, row.get("covered_case_ids", []) or []))

    # Upper bound (achievable) coverage if we use all rules
    achievable = set().union(*rule_cases.values()) if rule_cases else set()
    achievable_on_target = achievable & target
    achievable_frac = len(achievable_on_target) / len(target)

    selected = []
    step_gain = []
    covered = set()

    # Working pool of candidates
    remaining = set(rule_cases.keys())

    while len(covered) / len(target) < TARGET_COVERAGE and remaining:
        # Pick rule with max marginal gain; break ties by _tie_break_score
        best_idx = None
        best_gain = -1
        best_score = tuple()

        for idx in remaining:
            new_gain = len((rule_cases[idx] & target) - covered)
            if new_gain > best_gain:
                best_idx = idx
                best_gain = new_gain
                best_score = _tie_break_score(rules_subset_df.loc[idx])
            elif new_gain == best_gain and best_gain > 0:
                # Tie-break on quality metrics
                score = _tie_break_score(rules_subset_df.loc[idx])
                if score > best_score:
                    best_idx = idx
                    best_score = score

        # No rule adds anything new -> stop
        if best_idx is None or best_gain <= 0:
            break

        selected.append(best_idx)
        step_gain.append(best_gain)
        covered |= (rule_cases[best_idx] & target)
        remaining.remove(best_idx)

    coverage_frac = len(covered) / len(target)

    return {
        "selected_rule_indices": selected,
        "selected_rules_step_gain": step_gain,
        "covered_cases": covered,
        "coverage_frac": coverage_frac,
        "achievable_frac": achievable_frac,
    }

# Containers for outputs
min_cover_rows = []
# Optional markers on the original DF
for col in ["in_min_cover_overall","in_min_cover_pos","in_min_cover_neg"]:
    top5_crm_rules_expanded[col] = False

# Iterate per experiment
for (dataset, labeling, encoding), rules_df in top5_crm_rules_expanded.groupby(["Dataset","Labeling","Encoding"]):
    # Load the encoding CSV once to get the universe of cases + per-class targets
    enc_path = find_encoding_path(base_dir, dataset, labeling, encoding)
    if enc_path is None:
        # Could not resolve path; skip but record a row
        min_cover_rows.append({
            "Dataset": dataset, "Labeling": labeling, "Encoding": encoding,
            "Mode": "overall", "Threshold": TARGET_COVERAGE,
            "Selected Rules": 0, "Coverage (%)": 0.0, "Achievable (%)": 0.0,
            "Covered / Total": "0 / 0", "Selected Indexes": [],
        })
        continue

    csv_files = [f for f in os.listdir(enc_path) if f.endswith(".csv")]
    if not csv_files:
        continue
    df_enc = pd.read_csv(os.path.join(enc_path, csv_files[0]))
    case_col = _infer_case_col(df_enc)

    # Targets
    total_cases_all = set(df_enc[case_col].dropna().astype(str).unique().tolist())
    labels_num = pd.to_numeric(df_enc["Label"], errors="coerce").fillna(-999).astype(int)
    total_cases_pos = set(df_enc.loc[labels_num==1, case_col].dropna().astype(str).unique().tolist())
    total_cases_neg = set(df_enc.loc[labels_num==0, case_col].dropna().astype(str).unique().tolist())

    # Modes: overall / pos / neg
    for mode in MODES:
        if mode == "overall":
            sub = rules_df
            target = total_cases_all
        elif mode == "pos":
            sub = rules_df[rules_df["RHS_label"] == 1]
            target = total_cases_pos
        else:  # "neg"
            sub = rules_df[rules_df["RHS_label"] == 0]
            target = total_cases_neg

        if sub.empty:
            min_cover_rows.append({
                "Dataset": dataset, "Labeling": labeling, "Encoding": encoding,
                "Mode": mode, "Threshold": TARGET_COVERAGE,
                "Selected Rules": 0, "Coverage (%)": 0.0, "Achievable (%)": 0.0,
                "Covered / Total": f"0 / {len(target)}", "Selected Indexes": [],
            })
            continue

        res = _greedy_cover_for_mode(sub, target)
        selected = res["selected_rule_indices"]
        coverage = res["coverage_frac"]
        achievable = res["achievable_frac"]

        # Mark selections on the original DF
        mark_col = {
            "overall": "in_min_cover_overall",
            "pos": "in_min_cover_pos",
            "neg": "in_min_cover_neg",
        }[mode]
        top5_crm_rules_expanded.loc[selected, mark_col] = True

        min_cover_rows.append({
            "Dataset": dataset,
            "Labeling": labeling,
            "Encoding": encoding,
            "Mode": mode,
            "Threshold": TARGET_COVERAGE,
            "Selected Rules": len(selected),
            "Coverage (%)": round(coverage*100, 2),
            "Achievable (%)": round(achievable*100, 2),
            "Covered / Total": f"{len(res['covered_cases'])} / {len(target)}",
            "Selected Indexes": selected,
        })

# Summary DataFrame with one row per (experiment, mode)
min_cover_summary = pd.DataFrame(min_cover_rows).sort_values(
    ["Dataset","Labeling","Encoding","Mode"]
).reset_index(drop=True)

# (Optional) Nice view of the actually selected rules per experiment/mode
def selected_rules_view(df_rules, summary_row):
    """
    Return a small DataFrame showing the selected rules for a summary row.
    """
    dataset, labeling, encoding = summary_row["Dataset"], summary_row["Labeling"], summary_row["Encoding"]
    mode = summary_row["Mode"]
    selected = summary_row["Selected Indexes"]

    sub = df_rules[
        (df_rules["Dataset"]==dataset) &
        (df_rules["Labeling"]==labeling) &
        (df_rules["Encoding"]==encoding)
    ].loc[selected]

    keep_cols = [c for c in [
        "Rule","RHS_label","feature_1_lhs","feature_2_lhs","feature_3_lhs",
        "n_covered_cases","pct_of_class_covered","LB odds ratio","Precision","Recall"
    ] if c in sub.columns]

    return sub[keep_cols].copy()

min_cover_summary


In [None]:
# --- 1) Keep only Mode == 'neg' ---
neg = min_cover_summary[min_cover_summary["Mode"] == "neg"].copy()

# --- 2) Drop requested columns (only if present) ---
to_drop = ["Mode", "Threshold", "Achievable (%)", "Covered / Total", "Selected Indexes"]
neg = neg.drop(columns=[c for c in to_drop if c in neg.columns])

# --- 3) Build formatters like in your example (detokenize only selected columns) ---
def fmt_detok(x):
    return r'\detokenize{' + str(x) + '}'

formatters = {}
for col in ("Encoding", "Rule"):  # add other string cols here if you want the same treatment
    if col in neg.columns:
        formatters[col] = fmt_detok

# --- 4) Save to LaTeX in the same style as your example ---
root_dir = locals().get("root_dir", ".")
out_fp_tex = os.path.join(root_dir, "min_cover_summary_neg.tex")

neg.to_latex(
    out_fp_tex,
    index=False,
    escape=False,     # preserve \detokenize
    longtable=True,   # match your example
    float_format="%.2f",
    formatters=formatters
)

print(f"✅ Saved min_cover_summary_neg.tex → {out_fp_tex}")
neg

### Metric Comparison

In [None]:
# 6. Comparison of Baseline with CRM (Top 5 rules)
try:
    _ = all_rules_crm
except NameError:
    import os
    import pandas as pd
    all_rules_crm_path = os.path.join('5_analysis', 'random', 'combined_sorted_all.csv')
    all_rules_crm = pd.read_csv(all_rules_crm_path, sep=',')

# Harmonize/filter CRM rules like before (if helper exists)
try:
    rules_crm_f = _harmonize_and_filter(all_rules_crm)
except NameError:
    rules_crm_f = all_rules_crm.copy()

required_cols = ['Dataset', 'Labeling', 'Encoding']
missing = [c for c in required_cols if c not in rules_crm_f.columns]
if missing:
    raise ValueError(f"Missing columns in CRM rules: {missing}")

# Select the Top 5 rules per (Dataset, Labeling, Encoding) based on existing sorted order
rules_crm_top5 = (
    rules_crm_f
    .groupby(required_cols, as_index=False, sort=False)
    .head(5)
 )

# Count rules per group using the same helper (keep 'crm' prefix so merge util works unchanged)
crm_counts_top5 = _rule_counts(rules_crm_top5, 'crm')

# Build comparisons analogous to earlier ones, but with CRM limited to Top 5
dt_comparison_top5 = _merge_with_crm(dt_counts, crm_counts_top5)
ripperk_comparison_top5 = _merge_with_crm(ripperk_counts, crm_counts_top5)

# Order columns like before if the templates are available
try:
    dt_comparison_top5 = dt_comparison_top5.reindex(columns=[c for c in _dt_cols if c in dt_comparison_top5.columns])
except NameError:
    pass
try:
    ripperk_comparison_top5 = ripperk_comparison_top5.reindex(columns=[c for c in _rk_cols if c in ripperk_comparison_top5.columns])
except NameError:
    pass

# Optionally attach metrics if present
try:
    dt_comparison_top5 = _attach_metrics(dt_comparison_top5, dt_metrics, prefix='dt')
except NameError:
    pass
try:
    ripperk_comparison_top5 = _attach_metrics(ripperk_comparison_top5, rk_metrics, prefix='ripperk')
except NameError:
    pass

print('dt_comparison_top5 shape:', dt_comparison_top5.shape)
print('ripperk_comparison_top5 shape:', ripperk_comparison_top5.shape)
display(dt_comparison_top5)
display(ripperk_comparison_top5)

In [None]:
# 7. Mean/Median Confidence of Top-5 CRM rules, joined next to precision
import pandas as pd
import os

# Ensure we have the Top-5 CRM rules available
try:
    _ = rules_crm_top5
except NameError:
    try:
        _ = rules_crm_f
    except NameError:
        # Load all CRM rules if not present
        all_rules_crm_path = os.path.join('5_analysis', 'random', 'combined_sorted_all.csv')
        all_rules_crm = pd.read_csv(all_rules_crm_path, sep=',')
        try:
            rules_crm_f = _harmonize_and_filter(all_rules_crm)
        except NameError:
            rules_crm_f = all_rules_crm.copy()
    # Derive Top-5 per (Dataset, Labeling, Encoding) assuming input is already sorted by quality
    rules_crm_top5 = (
        rules_crm_f.groupby(['Dataset','Labeling','Encoding'], as_index=False, sort=False)
                    .head(5)
    )

# Identify the confidence column in CRM rules
possible_conf_cols = [
    'Confidence','confidence','conf','confidence_score','Conf',
    'Rule Confidence','rule_confidence'
 ]
conf_col = next((c for c in possible_conf_cols if c in rules_crm_top5.columns), None)
if conf_col is None:
    raise ValueError(f"Could not find a CRM confidence column among {possible_conf_cols}. Available: {list(rules_crm_top5.columns)}")

# Aggregate mean/median confidence for Top-5 per group
crm_conf_stats = (
    rules_crm_top5
    .groupby(['Dataset','Labeling','Encoding'])[conf_col]
    .agg(['mean','median'])
    .reset_index()
    .rename(columns={'mean':'crm_conf_mean_top5','median':'crm_conf_median_top5'})
 )

# Helper to place new columns right after an anchor column, if present
def _insert_after(df: pd.DataFrame, anchor: str, cols_to_place: list[str]) -> pd.DataFrame:
    if anchor not in df.columns:
        return df
    cols = list(df.columns)
    # Remove to reinsert
    for c in cols_to_place:
        if c in cols:
            cols.remove(c)
    idx = cols.index(anchor) + 1
    for i, c in enumerate(cols_to_place):
        if c in df.columns:
            cols.insert(idx + i, c)
    return df.reindex(columns=cols)

# Merge into dt_comparison_top5 and ripperk_comparison_top5
try:
    dt_comparison_top5 = dt_comparison_top5.merge(crm_conf_stats, on=['Dataset','Labeling','Encoding'], how='left')
    # Try to place next to dt precision if present
    for anchor_name in ['dt_precision','precision_dt','dt_prec','precision']:
        if anchor_name in dt_comparison_top5.columns:
            dt_comparison_top5 = _insert_after(dt_comparison_top5, anchor_name, ['crm_conf_mean_top5','crm_conf_median_top5'])
            break
except NameError:
    pass

try:
    ripperk_comparison_top5 = ripperk_comparison_top5.merge(crm_conf_stats, on=['Dataset','Labeling','Encoding'], how='left')
    # Try to place next to ripperk precision if present
    for anchor_name in ['ripperk_precision','precision_ripperk','rk_precision','precision']:
        if anchor_name in ripperk_comparison_top5.columns:
            ripperk_comparison_top5 = _insert_after(ripperk_comparison_top5, anchor_name, ['crm_conf_mean_top5','crm_conf_median_top5'])
            break
except NameError:
    pass

print(f"Used CRM confidence column: {conf_col}")
try:
    print('dt_comparison_top5 with CRM confidence stats:', dt_comparison_top5.shape)
    display(dt_comparison_top5)
except NameError:
    print('dt_comparison_top5 not found in scope.')
try:
    print('ripperk_comparison_top5 with CRM confidence stats:', ripperk_comparison_top5.shape)
    display(ripperk_comparison_top5)
except NameError:
    print('ripperk_comparison_top5 not found in scope.')

### Rule comparison

In [None]:
# 8. Expand DT and RIPPERK rules to per-rule rows with LHS feature parts (1..7)
import re
import pandas as pd

FEATURE_SLOTS = 10  # how many feature_*_lhs columns to emit

# Helper: find rule text column in a given DF
def _detect_rule_col(df: pd.DataFrame) -> str | None:
    candidates = ['Rule','rule','Rule_Text','rule_text','RuleString','Antecedent','rule_str']
    for c in candidates:
        if c in df.columns:
            return c
    # fallback: find first column whose sample values look like rule strings
    for c in df.columns:
        try:
            sample = df[c].dropna().astype(str).head(20)
        except Exception:
            continue
        if not len(sample):
            continue
        if sample.str.contains(r"\[", regex=True).any() or sample.str.contains('-->', regex=True).any():
            return c
    return None

# Helper: extract antecedent inner text and RHS label, robust to nested [...] inside features
def _split_rule(rule_str: str) -> tuple[str, str | None, str]:
    s = str(rule_str)

    # RHS label: anything after -->
    m_rhs = re.search(r"-->\s*(.+)$", s)
    rhs_label = m_rhs.group(1).strip() if m_rhs else None

    # LHS full: everything before -->
    lhs_full = s.split('-->', 1)[0].strip()

    # Extract inner content of the OUTERMOST [...] only (handle nested brackets inside feature names)
    antecedent_inner = lhs_full
    if lhs_full.startswith('['):
        depth = 0
        end_idx = None
        for i, ch in enumerate(lhs_full):
            if ch == '[':
                depth += 1
            elif ch == ']':
                depth -= 1
                if depth == 0:
                    end_idx = i
                    break
        if end_idx is not None and end_idx == len(lhs_full) - 1:
            antecedent_inner = lhs_full[1:end_idx].strip()
        else:
            if lhs_full.startswith('[') and lhs_full.endswith(']'):
                antecedent_inner = lhs_full[1:-1].strip()
            else:
                antecedent_inner = lhs_full

    return antecedent_inner, rhs_label, lhs_full

# Helper: split LHS into full clause strings by the ∧ sign (only)
def _split_lhs_clauses(antecedent_inner: str) -> list[str]:
    if not antecedent_inner:
        return []
    parts = [p.strip() for p in re.split(r"\s*∧\s*", antecedent_inner) if p.strip()]
    return parts

# Build a per-rule dataframe with up to the first seven LHS clauses as feature_1..7
def _expand_rules(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    if df is None or len(df) == 0:
        return pd.DataFrame()

    rule_col = _detect_rule_col(df)
    if rule_col is None:
        raise ValueError(f"Could not detect rule column in {model_name} rules. Available columns: {list(df.columns)}")

    # keys to keep if present
    meta_cols = [c for c in ['Dataset','Labeling','Encoding'] if c in df.columns]
    # carry common quality columns if present
    keep_qual_cols = [c for c in ['Confidence','confidence','Precision','precision','Support','support','Lift','lift'] if c in df.columns]

    feature_cols = [f'feature_{i}_lhs' for i in range(1, FEATURE_SLOTS + 1)]

    out_rows = []
    for _, row in df.iterrows():
        rule_str = row[rule_col]
        antecedent_inner, rhs_label_raw, lhs_full = _split_rule(rule_str)

        # Map RHS_label: 'Label'->1, '!Label'->0; otherwise keep as-is
        rhs_label_norm = None if rhs_label_raw is None else rhs_label_raw.strip()
        rhs_label_mapped = {'Label': 1, '!Label': 0}.get(rhs_label_norm, rhs_label_norm)

        # Split the antecedent into full clause strings (no parsing)
        lhs_clauses = _split_lhs_clauses(antecedent_inner)

        base = {c: row[c] for c in meta_cols}
        for qc in keep_qual_cols:
            base[qc] = row[qc]

        base.update({
            'Rule': rule_str,
            'LHS_features': lhs_full,           # full text before -->
            'RHS_label': rhs_label_mapped,
        })
        # Fill feature_1_lhs .. feature_7_lhs
        for i, col in enumerate(feature_cols):
            base[col] = lhs_clauses[i] if i < len(lhs_clauses) else None

        out_rows.append(base)

    core_cols = ['Rule','LHS_features','RHS_label'] + feature_cols
    cols_order = meta_cols + core_cols + keep_qual_cols
    return pd.DataFrame(out_rows)[cols_order]

# Acquire DT/RIPPERK rules DFs
dt_source = None
rk_source = None
try:
    dt_source = rules_dt_f
except NameError:
    try:
        dt_source = rules_dt
    except NameError:
        dt_source = None
try:
    rk_source = rules_ripperk_f
except NameError:
    try:
        rk_source = rules_ripperk
    except NameError:
        rk_source = None

if dt_source is None and rk_source is None:
    raise NameError("Neither DT nor RIPPERK rules were found in the current scope. Please run the earlier cells that load 'rules_dt_f' and 'rules_ripperk_f'.")

# Build expanded DataFrames (one row per rule)
dt_rules_all_expanded = _expand_rules(dt_source, 'dt') if dt_source is not None else pd.DataFrame()
ripperk_rules_all_expanded = _expand_rules(rk_source, 'ripperk') if rk_source is not None else pd.DataFrame()

print('dt_rules_all_expanded shape:', dt_rules_all_expanded.shape)
print('ripperk_rules_all_expanded shape:', ripperk_rules_all_expanded.shape)
display(dt_rules_all_expanded)
display(ripperk_rules_all_expanded)


In [None]:
crm_rules_all_expanded

In [None]:
# --- Fix CRM feature/value extraction by parsing from LHS_features (robust to "_binned_(...]" etc.) ---
import ast
import numpy as np
import pandas as pd

def _parse_lhs_list(s: str) -> list[str]:
    """Parse LHS_features like "['A_1','B_(0,1]']" -> ['A_1','B_(0,1]']."""
    if pd.isna(s):
        return []
    txt = str(s).strip()
    try:
        out = ast.literal_eval(txt)
        return [str(x) for x in out] if isinstance(out, list) else [str(out)]
    except Exception:
        # Fallback splitter (best effort)
        inner = txt.strip().strip("[]")
        parts = []
        buff, in_q = [], None
        for ch in inner:
            if ch in ("'", '"'):
                in_q = ch if in_q is None else (None if in_q == ch else in_q)
                buff.append(ch)
            elif ch == "," and in_q is None:
                parts.append("".join(buff).strip().strip("'\""))
                buff = []
            else:
                buff.append(ch)
        if buff:
            parts.append("".join(buff).strip().strip("'\""))
        return parts

def _split_clause(clause: str) -> tuple[str, str | None]:
    """
    Split a clause into (base, value) at the LAST underscore:
      'IV Antibiotics_1'                             -> ('IV Antibiotics', '1')
      'LacticAcid_binned_(-0.001, 1.0]'             -> ('LacticAcid_binned', '(-0.001, 1.0]')
      "responded_existence:('A','B')_0"             -> ("responded_existence:('A','B')", '0')
      'org:group_U|count|literal_0.0'               -> ('org:group_U|count|literal', '0.0')
    If no underscore: value = None.
    """
    if clause is None:
        return (None, None)
    cs = str(clause).strip()
    base, sep, val = cs.rpartition('_')  # last underscore
    if sep == '':
        return (cs, None)
    return (base, val)

# Rebuild feature_{i}_lhs and feature_{i}_value for i = 1..3 from LHS_features
for i in (1, 2, 3):
    lhs_out, val_out = [], []
    for s in crm_rules_all_expanded['LHS_features']:
        lst = _parse_lhs_list(s)
        if len(lst) >= i:
            b, v = _split_clause(lst[i-1])
        else:
            b, v = (np.nan, np.nan)
        lhs_out.append(b)
        val_out.append(v)
    crm_rules_all_expanded[f'feature_{i}_lhs'] = lhs_out
    crm_rules_all_expanded[f'feature_{i}_value'] = val_out

# (Optional) quick spot-check
preview_cols = [
    'Rule', 'LHS_features',
    'feature_1_lhs','feature_1_value',
    'feature_2_lhs','feature_2_value',
    'feature_3_lhs','feature_3_value',
]
display(crm_rules_all_expanded[[c for c in preview_cols if c in crm_rules_all_expanded.columns]])


In [None]:
# --- DT & RIPPERK: split feature_i_lhs into base (feature_i_lhs) and value (feature_i_value) for i=1..3 ---
import re
import numpy as np
import pandas as pd

def _split_dt_rk_clause(clause: str):
    """
    Split a DT/RIPPERK clause into (base, value).
    Rules:
      - If it looks like "<...>_(<interval>) <op> <rhs>", return:
            base = part before the last "_", value = "(<interval>) <op> <rhs>"
        e.g., "X_binned_(0,1] = 0" -> ("X_binned", "(0,1] = 0")
      - Else, if it looks like "<lhs> <op> <rhs>", return:
            base = <lhs>, value = <rhs>   (operator is dropped, matches your examples)
        e.g., "mr[... ] = 1" -> ("mr[...]", "1")
      - Else (no operator), try last "_" split; if interval suffix, value = that suffix; else value=None.
    """
    if clause is None or (isinstance(clause, float) and pd.isna(clause)):
        return (np.nan, np.nan)

    s = str(clause).strip()

    # Extract LHS / operator / RHS if present
    m = re.match(r"^(.*?)(?:\s*)(<=|>=|=|<|>)(?:\s*)(.+)$", s)
    if m:
        lhs = m.group(1).strip()
        op  = m.group(2).strip()
        rhs = m.group(3).strip()

        # Look for interval suffix after the LAST underscore on the LHS
        u = lhs.rfind("_")
        if u != -1:
            suffix = lhs[u+1:].strip()
            # interval if it starts with '(' or '['
            if suffix.startswith("(") or suffix.startswith("["):
                base = lhs[:u].strip()
                value = f"{suffix} {op} {rhs}"   # include operator, e.g. "(0,1] = 0"
                return (base, value)

        # No interval suffix: keep base as full LHS, value is RHS only (operator dropped)
        return (lhs, rhs)

    # If there's no explicit operator, try to peel off an interval suffix by last underscore
    u = s.rfind("_")
    if u != -1:
        suffix = s[u+1:].strip()
        if suffix.startswith("(") or suffix.startswith("["):
            return (s[:u].strip(), suffix)
    # Fallback: no split
    return (s, np.nan)

def _apply_split_base_value(df: pd.DataFrame, name: str):
    if df is None or len(df) == 0:
        return
    for i in (1, 2, 3):
        col = f"feature_{i}_lhs"
        if col not in df.columns:
            continue
        bases, vals = [], []
        for v in df[col]:
            b, val = _split_dt_rk_clause(v)
            bases.append(b)
            vals.append(val)
        df[col] = bases
        df[f"feature_{i}_value"] = vals

# Apply to DT and RIPPERK (these DataFrames should already exist in your notebook)
_apply_split_base_value(dt_rules_all_expanded, "dt")
_apply_split_base_value(ripperk_rules_all_expanded, "ripperk")

# Optional: quick sanity check
def _preview(df, title):
    if df is None or len(df) == 0: 
        return
    show = ['Rule','LHS_features',
            'feature_1_lhs','feature_1_value',
            'feature_2_lhs','feature_2_value',
            'feature_3_lhs','feature_3_value']
    print(f"\n=== Preview: {title} ===")
    display(df[[c for c in show if c in df.columns]])




In [None]:
_preview(dt_rules_all_expanded, "DT")
_preview(ripperk_rules_all_expanded, "RIPPERK")

## Similarity based on 80% similarity

In [None]:
# --- Similar DT ↔ CRM rules with variable feature counts, positional matching, per-feature similarity ≥ 80% ---
import re
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

THRESHOLD = 0.80  # 80%

def _norm_text(x):
    """Normalize text for fair string similarity: lower, trim, collapse spaces."""
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def _sim_ratio(a, b):
    """SequenceMatcher ratio on normalized strings; returns 0.0 if any empty."""
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()

# Columns to carry & compare
meta_cols = [c for c in ['Dataset','Labeling','Encoding'] if c in dt_rules_all_expanded.columns and c in crm_rules_all_expanded.columns]
compare_cols = ['feature_1_lhs','feature_2_lhs','feature_3_lhs']

# Select and normalize
sel_cols = meta_cols + ['RHS_label','Rule'] + compare_cols
dt = dt_rules_all_expanded[sel_cols].copy()
crm = crm_rules_all_expanded[sel_cols].copy()

for i in range(1, 4):
    dt[f'f{i}_norm'] = dt[f'feature_{i}_lhs'].map(_norm_text)
    crm[f'f{i}_norm'] = crm[f'feature_{i}_lhs'].map(_norm_text)

# Build a presence pattern (positional) like "110" meaning f1,f2 present; f3 absent
def _presence_pattern(df, side_suffix=""):
    pres = []
    for i in range(1, 4):
        pres.append((df[f'f{i}_norm{side_suffix}'] != "").astype(int).astype(str))
    pat = pres[0] + pres[1] + pres[2]
    return pat

# Merge inside same group (Dataset, Labeling, Encoding, RHS_label)
pairs = dt.merge(crm, on=meta_cols + ['RHS_label'], how='inner', suffixes=('_dt', '_crm'))

# Presence patterns per side
pairs['pattern_dt']  = _presence_pattern(pairs, side_suffix="_dt")
pairs['pattern_crm'] = _presence_pattern(pairs, side_suffix="_crm")

# Require identical presence pattern and at least one feature present
mask_same_pattern = pairs['pattern_dt'] == pairs['pattern_crm']
mask_nonempty     = pairs['pattern_dt'].str.contains('1')  # at least one '1'

# Similarity per feature (only meaningful when present on both sides)
for i in range(1, 4):
    pairs[f'sim_f{i}'] = [
        _sim_ratio(a, b) for a, b in zip(pairs[f'f{i}_norm_dt'], pairs[f'f{i}_norm_crm'])
    ]
    # If either side is empty at this position, mark sim as NaN (excluded from avg and checks)
    empties = (pairs[f'f{i}_norm_dt'] == "") | (pairs[f'f{i}_norm_crm'] == "")
    pairs.loc[empties, f'sim_f{i}'] = np.nan

# For each present feature position, require sim_fi ≥ THRESHOLD
def _meets_threshold(row):
    sims = []
    ok = True
    for i in (1,2,3):
        a_present = row[f'f{i}_norm_dt']  != ""
        b_present = row[f'f{i}_norm_crm'] != ""
        if a_present and b_present:
            sims.append(row[f'sim_f{i}'])
            if row[f'sim_f{i}'] < THRESHOLD:
                ok = False
    # If no positions present (shouldn’t happen due to mask_nonempty), mark False
    if not sims:
        return False
    return ok

pairs['meets_threshold'] = pairs.apply(_meets_threshold, axis=1)

# Average similarity across present positions only (for sorting)
pairs['avg_sim'] = pairs[['sim_f1','sim_f2','sim_f3']].mean(axis=1, skipna=True)

# Keep only good matches
similar_rules = (
    pairs.loc[mask_same_pattern & mask_nonempty & pairs['meets_threshold'],
              meta_cols +
              ['RHS_label',
               'pattern_dt', 'avg_sim',
               'Rule_dt','feature_1_lhs_dt','feature_2_lhs_dt','feature_3_lhs_dt',
               'Rule_crm','feature_1_lhs_crm','feature_2_lhs_crm','feature_3_lhs_crm',
               'sim_f1','sim_f2','sim_f3']]
    .sort_values(by=meta_cols + ['avg_sim'], ascending=[True, True, True, False])
    .reset_index(drop=True)
)

print("Similar DT↔CRM rule pairs found:", similar_rules.shape[0])
display(similar_rules)


## Visualizing IMpresseD Features

In [None]:
# === Build pattern-attribute table using the repository functions (no guessing) ===
# Input  : 3_extracted_features/BPI15A/BPI15A_decl2_features/IMPresseD/IMPresseD.csv
# Assumes: every column except 'Case_ID' and 'Label' is a 0/1 pattern indicator
# Output : pattern_attributes.csv saved next to the input

import os
import numpy as np
import pandas as pd

from helper_functions.IMPresseD.IMIPD import (
    create_pattern_attributes,
)

# ----- paths -----
ENCODED_CSV = "3_extracted_features/BPI15A/BPI15A_payload_560925_features/IMPresseD/IMPresseD.csv"
OUT_CSV = "IMPresseD_patterns/BPI15A/BPI15A_payload_560925_features/pattern_attributes.csv"

# ----- load -----
df = pd.read_csv(ENCODED_CSV)

# sanity checks
assert "Case_ID" in df.columns, "Expected a 'Case_ID' column."
assert "Outcome"   in df.columns, "Expected a 'Outcome' column (binary 0/1)."

# identify pattern columns (everything except Case_ID & Outcome)
pattern_cols = [c for c in df.columns if c not in ("Case_ID", "Outcome")]
if not pattern_cols:
    raise RuntimeError("No pattern columns found. Expected all non-Case_ID/Outcome columns to be patterns.")

# ensure binary ints for patterns and label
df[pattern_cols] = df[pattern_cols].fillna(0).astype(int)
df["Outcome"] = df["Outcome"].fillna(0).astype(int)

# IMIPD expects a "patient_data" table with pattern columns + the label
patient_data = df[["Case_ID", "Outcome"] + pattern_cols].reset_index(drop=True)

# ----- similarity inputs (required by the API) -----
# You indicated there are no extra case-level covariates here,
# so we pass a zero vector for all pairwise distances.
n = len(patient_data)
pair_cases = []
start_search_points = {}
k = 0
for i in range(n - 1):
    start_search_points[i] = k
    for j in range(i + 1, n):
        pair_cases.append((i, j))
        k += 1

pairwise = np.zeros(len(pair_cases), dtype=float)  # similarity will become 0 for all patterns

# ----- compute attributes via the repo function -----
attrs = create_pattern_attributes(
    patient_data=patient_data,
    label_class="Outcome",
    pattern_list=pattern_cols,
    pairwise_distances_array=pairwise,
    pair_cases=pair_cases,
    start_search_points=start_search_points,
    outcome_type="binary",
)

# nice ordering
attrs = attrs.sort_values(
    by=["Outcome_Interest", "Frequency_Interest"],
    ascending=[False, False]
).reset_index(drop=True)

# save & preview
attrs.to_csv(OUT_CSV, index=False)
print(f"✅ Saved pattern attributes → {OUT_CSV}")
display(attrs.head(10))


In [None]:
# === Make Case_Distance_Interest informative by adding real case-level covariates ===
# Uses the repo functions from IMPresseD.IMIPD

import os, numpy as np, pandas as pd
from helper_functions.IMPresseD.IMIPD import (
    create_pattern_attributes,
    calculate_pairwise_case_distance,
)

# ---------- PATHS (adjust if needed) ----------
ENCODED_CSV = "3_extracted_features/BPI15A/BPI15A_payload_560925_features/IMPresseD/IMPresseD.csv"
RAW_LOG_CSV = "2_labelled_logs/BPI15A/bpi15A_payload_560925.csv"  # <-- the event log passed to --log_path
OUT_CSV     = "IMPresseD_patterns/BPI15A/BPI15A_payload_560925_features/pattern_attributes_with_similarity.csv"

# ---------- COLUMN NAMES (as in your earlier CLI args) ----------
CASE_ID_COL = "case:concept:name"
TS_COL      = "time:timestamp"
# LABEL_COL = "case:Label"         # not needed below, we use the encoded Label
ENC_CASE_ID = "Case_ID"            # in the encoded file
ENC_LABEL   = "Outcome"              # in the encoded file (0/1)

# Optional: attributes you listed earlier (only those present in RAW_LOG_CSV will be used)
NUMERIC_ATTRS_HINT = ["case:SUMleges"]
CATEGORICAL_ATTRS_HINT = [
    "question", "dateFinished", "action_code", "activityNameEN", "activityNameNL",
    "lifecycle:transition", "case:caseStatus", "case:last_phase", "case:case_type",
    "case:Responsible_actor", "case:parts", "case:termName", "case:requestComplete",
    "case:IDofConceptCase", "case:Label", "case:landRegisterID", "case:caseProcedure",
    "case:Includes_subCases", "monitoringResource", "org:resource", "dateStop"
]

# ---------- 1) LOAD DATA ----------
enc = pd.read_csv(ENCODED_CSV)
log = pd.read_csv(RAW_LOG_CSV)

# Basic checks
assert ENC_CASE_ID in enc.columns and ENC_LABEL in enc.columns, "Encoded CSV must have Case_ID and Label."
assert CASE_ID_COL in log.columns, "Raw log must have the case id column."

# Keep only the cases that appear in the encoded set (order matters later)
keep_cases = enc[ENC_CASE_ID].dropna().astype(str).unique().tolist()
log[CASE_ID_COL] = log[CASE_ID_COL].astype(str)
log = log[log[CASE_ID_COL].isin(keep_cases)].copy()

# Parse timestamps for duration features
if TS_COL in log.columns:
    log[TS_COL] = pd.to_datetime(log[TS_COL], errors="coerce")

# ---------- 2) BUILD CASE-LEVEL COVARIATES ----------
# Helpers for aggregations
def first_valid(s):
    s = s.dropna()
    return s.iloc[0] if len(s) else np.nan

def mode_or_last(s):
    s = s.dropna()
    if s.empty:
        return np.nan
    m = s.mode(dropna=True)
    return m.iloc[0] if not m.empty else s.iloc[-1]

# Which hints are actually present in the raw log?
present_numeric = [c for c in NUMERIC_ATTRS_HINT if c in log.columns]
present_cats    = [c for c in CATEGORICAL_ATTRS_HINT if c in log.columns]

# Group by case
gb = log.groupby(CASE_ID_COL, sort=False)

# Start with core derived covariates per case
case_df = pd.DataFrame(index=gb.size().index)
case_df.index.name = CASE_ID_COL
case_df["event_count"] = gb.size()
if TS_COL in log.columns:
    case_df["duration_hours"] = (gb[TS_COL].max() - gb[TS_COL].min()).dt.total_seconds() / 3600.0
else:
    case_df["duration_hours"] = np.nan

# Add numeric “case:*” columns (first_valid is safe for true case-level attributes)
for c in present_numeric:
    case_df[c] = gb[c].agg(first_valid)

# Add categorical columns
for c in present_cats:
    # If it's a case-level attribute (starts with "case:"), take first_valid.
    # Otherwise (event-level, e.g., org:resource), take the mode (fallback to last).
    agg_fn = first_valid if c.startswith("case:") else mode_or_last
    case_df[c] = gb[c].agg(agg_fn)

# ---------- 3) ALIGN COVARIATES TO ENCODED ORDER (robustly) ----------
# Normalize ids to string for matching (AFTER dropping NaNs)
enc_ids = enc[ENC_CASE_ID].dropna().astype(str)
raw_index = case_df.index.astype(str)

def try_direct_merge():
    overlap = set(enc_ids).intersection(set(raw_index))
    # If most IDs overlap by value, do a direct value-based merge
    if len(overlap) >= max(1, int(0.8 * len(enc_ids))):
        merged = (
            enc[[ENC_CASE_ID]]
            .astype({ENC_CASE_ID: str})
            .merge(
                case_df.reset_index().rename(columns={CASE_ID_COL: ENC_CASE_ID}).astype({ENC_CASE_ID: str}),
                on=ENC_CASE_ID,
                how="left"
            )
        )
        print(f"Alignment strategy: direct id match (overlap={len(overlap)}/{len(enc_ids)})")
        return merged
    return None

def try_enumeration(order_list, tag):
    # If encoded IDs are 0..n-1 as strings, map them to an enumeration of raw cases
    if not enc_ids.str.fullmatch(r"\d+").all():
        return None
    mapping = pd.Series(order_list, index=[str(i) for i in range(len(order_list))])
    mapped = enc_ids.map(mapping)
    if mapped.notna().all():
        tmp = enc[[ENC_CASE_ID]].copy()
        tmp["_RAW_ID_"] = mapped
        merged = (
            tmp.merge(
                case_df.reset_index().rename(columns={CASE_ID_COL: "_RAW_ID_"}),
                on="_RAW_ID_",
                how="left"
            ).drop(columns=["_RAW_ID_"])
        )
        print(f"Alignment strategy: enumeration ({tag})")
        return merged
    return None

# Try strategies in order
case_df_aligned = try_direct_merge()
if case_df_aligned is None:
    # First-occurrence enumeration (order as they appear in raw log)
    raw_first = log[CASE_ID_COL].dropna().astype(str).drop_duplicates().tolist()
    case_df_aligned = try_enumeration(raw_first, "first-occurrence")

if case_df_aligned is None:
    # Sorted enumeration
    raw_sorted = sorted(log[CASE_ID_COL].dropna().astype(str).unique().tolist())
    case_df_aligned = try_enumeration(raw_sorted, "sorted")

USE_PATTERN_SPACE_FALLBACK = False
if case_df_aligned is None:
    print("⚠️ Could not align encoded Case_ID to raw cases by value or enumeration. "
          "Falling back to pattern-based distances for Case_Distance_Interest.")
    USE_PATTERN_SPACE_FALLBACK = True
else:
    case_df = case_df_aligned  # proceed with aligned covariates

# ---------- 4) COMPUTE PAIRWISE DISTANCES ----------
# Prepare covariate matrix (numeric + one-hot categoricals)
if not USE_PATTERN_SPACE_FALLBACK:
    # Numeric columns available in the aligned frame
    num_cols = ["event_count", "duration_hours"] + [c for c in present_numeric if c in case_df.columns]
    num_cols = [c for c in num_cols if c in case_df.columns]  # guard

    cat_cols = [c for c in present_cats if c in case_df.columns]
    if cat_cols:
        cat_onehot = pd.get_dummies(case_df[cat_cols].astype("category"), dummy_na=False)
    else:
        cat_onehot = pd.DataFrame(index=case_df.index)

    X_features = pd.concat([case_df[num_cols], cat_onehot], axis=1).fillna(0)

    # If there are literally no covariates, fall back to pattern space
    if X_features.shape[1] == 0:
        USE_PATTERN_SPACE_FALLBACK = True

# If alignment failed or no covariates exist, compute distances in pattern space (binary)
if USE_PATTERN_SPACE_FALLBACK:
    pattern_cols = [c for c in enc.columns if c not in (ENC_CASE_ID, ENC_LABEL)]
    X_features = enc[pattern_cols].astype(int)
    num_for_distance = []  # all binary → Jaccard by the repo function
else:
    num_for_distance = [c for c in X_features.columns if c in num_cols]

pairwise = calculate_pairwise_case_distance(X_features, num_col=num_for_distance)

# Build (i,j) index pairs in condensed pdist order (must match the row order of enc/patient_data)
n = len(enc)
pair_cases = []
start_search_points = {}
k = 0
for i in range(n - 1):
    start_search_points[i] = k
    for j in range(i + 1, n):
        pair_cases.append((i, j))
        k += 1

# ---------- 5) RECOMPUTE PATTERN ATTRIBUTES (NOW WITH SIMILARITY) ----------
pattern_cols = [c for c in enc.columns if c not in (ENC_CASE_ID, ENC_LABEL)]
enc[pattern_cols] = enc[pattern_cols].fillna(0).astype(int)
enc[ENC_LABEL] = enc[ENC_LABEL].fillna(0).astype(int)

patient_data = enc[[ENC_CASE_ID, ENC_LABEL] + pattern_cols].reset_index(drop=True)

attrs = create_pattern_attributes(
    patient_data=patient_data,
    label_class=ENC_LABEL,
    pattern_list=pattern_cols,
    pairwise_distances_array=pairwise,
    pair_cases=pair_cases,
    start_search_points=start_search_points,
    outcome_type="binary",
)

attrs = attrs.sort_values(
    by=["Outcome_Interest", "Frequency_Interest", "Case_Distance_Interest"],
    ascending=[False, False, False]
).reset_index(drop=True)

os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)
attrs.to_csv(OUT_CSV, index=False)
print(f"✅ Saved pattern attributes with similarity → {OUT_CSV}")

# Quick sanity check: distribution of the similarity score
print(attrs["Case_Distance_Interest"].describe())
attrs.head(10)


In [None]:
from helper_functions.IMPresseD import tools

target_patterns = ["01-HOOFD-100_160", "01-HOOFD-101"]
df3 = attrs[attrs["patterns"].isin(target_patterns)]

# Ensure the save directory exists (tools.threeD_ploting saves to "<folder>/pattern/...")
out_dir = "IMPresseD_patterns/sepsis/sepsis_decl_features/"
os.makedirs(os.path.join(out_dir, "pattern"), exist_ok=True)

# choose three axes that actually vary in your data
tools.threeD_ploting(
    df3,
    "Frequency_Interest", "Outcome_Interest", 'Case_Distance_Interest',
    folder_address=out_dir,
    activity_level=False
)

### Phase 3: comparing runtimes

In [None]:
# --- Collect and average runtimes across labelings for two experiment sets ---
from pathlib import Path
import pandas as pd
import re

def _pick_root(candidates=("4.output", "4_output")) -> Path:
    for c in candidates:
        p = Path(c)
        if p.exists():
            return p
    raise FileNotFoundError("Could not find either '4.output' or '4_output'.")

def _list_subdirs(path: Path):
    return sorted([d for d in path.iterdir() if d.is_dir()])

def _choose_txt_file(enc_dir: Path, priority=("runtime_seconds.txt", "runtime.txt")) -> Path:
    # Prefer a known filename if present; otherwise take the first *.txt lexicographically
    files = sorted(enc_dir.glob("*.txt"))
    if not files:
        return None
    by_name = {f.name.lower(): f for f in files}
    for name in priority:
        if name.lower() in by_name:
            return by_name[name.lower()]
    return files[0]

_float_re = re.compile(r"([-+]?\d+(?:\.\d+)?)")

def _read_seconds(txt_file: Path):
    try:
        s = txt_file.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        s = txt_file.read_text(errors="ignore")
    m = _float_re.search(s)
    return float(m.group(1)) if m else None

def collect_runtime_averages(
    dataset="traffic",
    experiment_sets=("random", "ipweights"),
    root_candidates=("4.output", "4_output"),
):
    root = _pick_root(root_candidates)

    rows = []
    for exp in experiment_sets:
        exp_dir = root / exp / dataset
        if not exp_dir.exists():
            # Skip missing experiment set or dataset
            continue

        # Labelings = subfolders under /<exp>/<dataset>/
        labelings = _list_subdirs(exp_dir)

        # Gather the union of encodings across all labelings
        encodings = set()
        for lab in labelings:
            for enc_dir in _list_subdirs(lab):
                encodings.add(enc_dir.name)

        # For each encoding, try to read the runtime for each labeling
        for enc in sorted(encodings):
            for lab in labelings:
                enc_dir = lab / enc
                if not enc_dir.exists():
                    continue
                txt = _choose_txt_file(enc_dir)
                if txt is None:
                    continue
                secs = _read_seconds(txt)
                if secs is None:
                    continue
                rows.append({
                    "experiment": exp,
                    "labeling": lab.name,
                    "encoding": enc,
                    "seconds": secs,
                    "txt_path": str(txt),
                })

    df_runs = pd.DataFrame(rows).sort_values(["experiment", "labeling", "encoding"]).reset_index(drop=True)

    # Average over labelings for each (experiment, encoding)
    if df_runs.empty:
        print("No runtimes found. Check the folder structure and .txt files.")
        return df_runs, pd.DataFrame(), pd.DataFrame()

    agg = (
        df_runs
        .groupby(["experiment", "encoding"])["seconds"]
        .agg(count="count", mean="mean", std="std", min="min", max="max")
        .reset_index()
        .rename(columns={
            "count": "n_labelings",
            "mean": "avg_seconds",
            "std": "std_seconds",
            "min": "min_seconds",
            "max": "max_seconds",
        })
    )
    # Convenience: also add hours
    agg["avg_hours"] = agg["avg_seconds"] / 3600.0

    # Wide comparison: rows=encoding, cols=experiment, values=avg_seconds
    df_avg_wide = agg.pivot(index="encoding", columns="experiment", values="avg_seconds").sort_index()

    # Optional: show quick previews
    display_cols = ["experiment", "encoding", "n_labelings", "avg_seconds", "avg_hours"]
    try:
        from IPython.display import display
        print("Per-run entries (df_runs):")
        display(df_runs)
        print("\nAverages per (experiment, encoding) (df_avg):")
        display(agg[display_cols].sort_values(["experiment", "encoding"]))
        print("\nWide comparison of avg seconds by experiment (df_avg_wide):")
        display(df_avg_wide)
    except Exception:
        # Fallback if display is not available
        print(df_runs)
        print(agg[display_cols].sort_values(["experiment", "encoding"]))
        print(df_avg_wide)

    return df_runs, agg, df_avg_wide

# ---- Run it ----
df_runs, df_avg, df_avg_wide = collect_runtime_averages()


In [None]:
# --- Convert df_avg_wide (avg seconds) → minutes ---
if 'df_avg_wide' not in globals():
    raise NameError("df_avg_wide is not defined. Run the previous cell first.")

df_avg_wide_min = (df_avg_wide / 60).rename_axis("encoding").copy()
df_avg_wide_min.columns = [f"{c} (min)" for c in df_avg_wide_min.columns]

from IPython.display import display
display(df_avg_wide_min.round(2))

# Optional: persist
# df_avg_wide_min.to_csv("5_analysis/runtime_averages_wide_minutes.csv")
# To overwrite the original variable:
# df_avg_wide = df_avg_wide_min.copy()


## Comparing rules of IPW and Random

In [None]:
all_rules_random_path = os.path.join('5_analysis', 'random', 'combined_sorted_all.csv')
all_rules_random = pd.read_csv(all_rules_random_path, sep=',')

all_rules_ipw_path = os.path.join('5_analysis', 'ipweights', 'combined_sorted_all.csv')
all_rules_ipw = pd.read_csv(all_rules_ipw_path, sep=',')

In [None]:
def expand_rules(df_in: pd.DataFrame, exclude_list=None) -> pd.DataFrame:
    df = df_in.copy()

    # Harmonize encoding column name
    if 'Encoding' not in df.columns and 'Feature Encoding' in df.columns:
        df = df.rename(columns={'Feature Encoding': 'Encoding'})

    # Exclude encodings (case/whitespace-insensitive)
    if 'Encoding' in df.columns:
        excl = {e.lower().strip() for e in (exclude_list or ["mr", "mra", "tr", "tra"])}
        enc_norm = df['Encoding'].astype(str).str.strip().str.lower()
        df = df[~enc_norm.isin(excl)].copy()

    # Ensure Odds ratio is numeric and filter OR > 1
    if 'Odds ratio' in df.columns:
        df['Odds ratio'] = pd.to_numeric(df['Odds ratio'], errors='coerce')
        df = df[df['Odds ratio'] > 1].copy()

    # ---------- Normalize Labeling ----------
    if 'Labeling' in df.columns:
        try:
            KNOWN_PREFIXES
        except NameError:
            KNOWN_PREFIXES = set(
                df.get('Dataset', pd.Series([], dtype=str))
                  .astype(str).str.strip().str.lower()
                  .str.replace(r'\s+', '', regex=True)
                  .unique().tolist()
            ) | {'sepsis','traffic','bpi15a','bpic15a','bpic2015','bpi2015','bpi15'}

        def _strip_prefix_suffix(x: str) -> str:
            s = str(x)
            s = re.sub(r'(_features?)$', '', s, flags=re.I)
            s = s.strip().lower().replace(' ', '_')
            changed = True
            while changed:
                changed = False
                for p in sorted(KNOWN_PREFIXES, key=len, reverse=True):
                    if s.startswith(p + '_'):
                        s = s[len(p) + 1:]
                        changed = True
            return s

        base_series = df['Labeling']
        if '_normalize_side' in globals() and callable(globals()['_normalize_side']):
            try:
                tmp = _normalize_side(df)
                if isinstance(tmp, pd.DataFrame):
                    if 'Labeling_norm' in tmp.columns:
                        base_series = tmp['Labeling_norm']
                    elif 'Labeling' in tmp.columns:
                        base_series = tmp['Labeling']
            except Exception:
                pass

        df['Labeling'] = base_series.apply(_strip_prefix_suffix)
        low = df['Labeling'].astype(str).str.lower()
        df.loc[low.str.contains('decl', na=False),    'Labeling'] = 'declare'
        df.loc[low.str.contains('payload', na=False), 'Labeling'] = 'payload'
        df.loc[low.str.contains('mr', na=False),      'Labeling'] = 'sequential'

    # ---------- Extract exact LHS and RHS label ----------
    def extract_lhs_exact(rule_str: str) -> str:
        m = re.search(r"^(.*?)(?=\s*-->)", str(rule_str))
        return m.group(1) if m else str(rule_str)

    def parse_rhs_label(rule_str: str):
        m = re.search(r"-->\s*(Label|!Label)", str(rule_str))
        if not m:
            return None
        return 1 if m.group(1) == "Label" else 0

    df['LHS_features'] = df['Rule'].apply(extract_lhs_exact)
    df['RHS_label']    = df['Rule'].apply(parse_rhs_label)

    # ---------- Split LHS into up to 3 features ----------
    def _find_outer_brackets_span(text: str):
        s = str(text); start = s.find('[')
        if start < 0: return (None, None)
        depth = 0; in_s = in_d = esc = False; end = None
        for i, ch in enumerate(s[start:], start):
            if esc: esc = False; continue
            if ch == '\\': esc = True; continue
            if in_s: 
                if ch == "'": in_s = False
                continue
            if in_d:
                if ch == '"': in_d = False
                continue
            if ch == "'": in_s = True; continue
            if ch == '"': in_d = True; continue
            if ch == '[': depth += 1; continue
            if ch == ']':
                depth -= 1
                if depth == 0: end = i; break
        return (start, end)

    def _split_top_level_commas(content: str):
        parts, curr = [], ""
        in_s = in_d = esc = False
        for ch in content:
            if esc:
                curr += ch; esc = False; continue
            if ch == '\\':
                curr += ch; esc = True; continue
            if in_s:
                curr += ch
                if ch == "'": in_s = False
                continue
            if in_d:
                curr += ch
                if ch == '"': in_d = False
                continue
            if ch == "'":
                curr += ch; in_s = True; continue
            if ch == '"':
                curr += ch; in_d = True; continue
            if ch == ',':
                parts.append(curr.strip()); curr = ""
            else:
                curr += ch
        parts.append(curr.strip())
        return parts

    def _strip_one_layer_quotes(s_: str):
        s_ = s_.strip()
        if len(s_) >= 2 and ((s_[0] == s_[-1] == "'") or (s_[0] == s_[-1] == '"')):
            return s_[1:-1]
        return s_

    def split_lhs_items(lhs_text: str):
        s = str(lhs_text)
        start, end = _find_outer_brackets_span(s)
        if start is None or end is None: return []
        inner = s[start+1:end]
        raw_items = _split_top_level_commas(inner)
        return [_strip_one_layer_quotes(x).strip() for x in raw_items if x != ""]

    def _pad3(items):
        items = items[:3]
        return items + [""] * (3 - len(items))

    lhs_split = df['LHS_features'].apply(split_lhs_items).apply(_pad3)
    lhs_df = pd.DataFrame(lhs_split.tolist(), columns=['feature_1_lhs','feature_2_lhs','feature_3_lhs'])

    # ---------- Final table (now appending LB odds ratio, Support LHS, Confidence if present) ----------
    # Coerce to numeric if they exist (safe no-ops if not present)
    for col in ['LB odds ratio', 'Support LHS', 'Confidence']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    base_cols = ['Dataset','Labeling','Encoding','Rule','LHS_features','RHS_label']
    extra_cols = [c for c in ['LB odds ratio','Support LHS','Confidence'] if c in df.columns]
    select_cols = base_cols + extra_cols

    have_cols = [c for c in select_cols if c in (list(df.columns) + ['LHS_features','RHS_label'])]
    out = pd.concat(
        [df[have_cols].reset_index(drop=True), lhs_df.reset_index(drop=True)],
        axis=1
    ).reset_index(drop=True)

    sort_cols = [c for c in ['Dataset','Labeling','Encoding'] if c in out.columns]
    if sort_cols:
        out = out.sort_values(by=sort_cols, ascending=True).reset_index(drop=True)

    return out

# ---------- Build the two expanded DataFrames ----------
if 'all_rules_random' not in globals() or not isinstance(all_rules_random, pd.DataFrame):
    raise ValueError("all_rules_random is not available as a DataFrame.")
if 'all_rules_ipw' not in globals() or not isinstance(all_rules_ipw, pd.DataFrame):
    raise ValueError("all_rules_ipw is not available as a DataFrame.")

random_rules_all_expanded = expand_rules(all_rules_random, exclude_encodings)
ipw_rules_all_expanded    = expand_rules(all_rules_ipw,    exclude_encodings)

# (Optional) peek
display(random_rules_all_expanded)
display(ipw_rules_all_expanded)

In [None]:
# ---------- Find rules unique to each DF, with extra (RHS_label, feature_1_lhs) check ----------
import pandas as pd

def _prepare_sets(df):
    # Make comparable representations
    rules = set(df['Rule'].astype(str))
    pairs = set(zip(df['RHS_label'], df['feature_1_lhs'].astype(str).str.strip()))
    return rules, pairs

def _unique_after_pair_check(dfA, dfB, source_name):
    """
    Keep rows from dfA whose Rule is not in dfB.
    Then drop rows whose (RHS_label, feature_1_lhs) pair appears in dfB.
    Return Dataset, Labeling, Encoding, Rule + requested metrics (if present) + Source.
    """
    # Ensure necessary columns exist
    needed = {'Dataset','Labeling','Encoding','Rule','RHS_label','feature_1_lhs'}
    missingA = needed - set(dfA.columns)
    missingB = {'Rule','RHS_label','feature_1_lhs'} - set(dfB.columns)
    if missingA:
        raise KeyError(f"{source_name}: missing columns in dfA: {sorted(missingA)}")
    if missingB:
        raise KeyError(f"{source_name}: missing columns in dfB: {sorted(missingB)}")

    rules_B, pairs_B = _prepare_sets(dfB)

    # Step 1: unique by full Rule string
    mask_rule_unique = ~dfA['Rule'].astype(str).isin(rules_B)
    cand = dfA.loc[mask_rule_unique].copy()

    # Step 2: remove those whose (RHS_label, feature_1_lhs) pair appears in dfB
    cand['_pair'] = list(zip(cand['RHS_label'], cand['feature_1_lhs'].astype(str).str.strip()))
    cand = cand[~cand['_pair'].isin(pairs_B)].drop(columns=['_pair'])

    # Choose metric cols if they exist
    metric_cols = [c for c in ['LB odds ratio', 'Support LHS', 'Confidence'] if c in cand.columns]
    # Optional: ensure numeric
    for c in metric_cols:
        cand[c] = pd.to_numeric(cand[c], errors='coerce')

    base_cols = ['Dataset','Labeling','Encoding','Rule']
    select_cols = base_cols + metric_cols

    # Keep only requested columns, dedupe by base rule identity
    out = (
        cand[select_cols]
        .drop_duplicates(subset=base_cols)
        .copy()
    )
    out.insert(0, 'Variant', source_name)
    return out

# Compute unique sets
unique_random_rules = _unique_after_pair_check( random_rules_all_expanded, ipw_rules_all_expanded, source_name='random' )
unique_ipw_rules = _unique_after_pair_check( ipw_rules_all_expanded, random_rules_all_expanded, source_name='ipw' )

# Combined (optional)
combined_unique_rules = (
    pd.concat([unique_random_rules, unique_ipw_rules], ignore_index=True)
      .sort_values(['Variant','Dataset','Labeling', 'LB odds ratio','Encoding'], ascending=[True,True,True,False,True])
      .reset_index(drop=True)
)

combined_unique_rules
#unique_ipw_rules.sort_values(['Dataset','Labeling', 'LB odds ratio','Encoding'], ascending=[True,True,False,True])


In [None]:
    # save LaTeX
    out_fp_tex = os.path.join(root_dir, 'combined_unique_rules_ipw.tex')

    def fmt_rule(x):
        return r'\detokenize{' + str(x) + '}'

    combined_unique_rules.to_latex(
        out_fp_tex,
        index=False,
        escape=False,
        longtable=True,
        float_format="%.2f",
        formatters={
            "Rule": fmt_rule,
            "Encoding": fmt_rule
        }
    )
    print(f"✅ Saved combined_sorted_all.tex → {out_fp_tex}")

In [None]:
# make a summarized version of combined_unique_rules for each variant with total number of rules per labeling, en encoding
summary_random = (
    unique_random_rules
    .groupby(['Labeling'])
    .size()
    .reset_index(name='num_unique_rules')
    .sort_values(['Labeling','num_unique_rules'], ascending=[True,False])
    .reset_index(drop=True)
)
summary_ipw = (
    unique_ipw_rules
    .groupby(['Labeling'])
    .size()
    .reset_index(name='num_unique_rules')
    .sort_values(['Labeling','num_unique_rules'], ascending=[True,False])
    .reset_index(drop=True)
)
display(summary_random)
display(summary_ipw)

### Boxplots