### DHL qualitative evaluation

In [1]:
import pandas as pd
from pathlib import Path

# --- 1) Define path ---
csv_path = Path(r"5_analysis\random\DHL\dhl_features\k3\combined_sorted.csv")

# --- 2) Load CSV ---
df = pd.read_csv(csv_path)

# --- 3) Filter on LB odds ratio > 1 AND Conviction > 1 ---
filtered = df[(df["LB odds ratio"] > 1) & (df["Conviction"] > 1)].copy()

print(f"Loaded {len(df)} rules, kept {len(filtered)} after LB OR > 1 and Conviction > 1 filter.")
filtered.head()


Loaded 829 rules, kept 829 after LB OR > 1 and Conviction > 1 filter.


Unnamed: 0,Rule,Feature Encoding,Odds ratio,LB odds ratio,UB odds ratio,Support LHS,Confidence,Lift,Conviction,n12,n21,Fair set count,Stratified
0,['trace:Label|first|discrete_0.0'] --> !Label,payload,1485.0,209.037,10549.459,0.455,1.0,2.199,inf,1485.0,1.0,2970,True
1,"[""responded_existence:('PICKED', 'ORDER_ACKNOW...",hybrid,1026.0,144.383,7290.869,0.563,0.846,1.551,2.947,1026.0,1.0,2182,True
2,['trace:Label|first|discrete_1.0'] --> Label,payload,976.0,137.34,6935.902,0.545,1.0,1.834,inf,976.0,1.0,1952,True
3,"[""responded_existence:('RELEASED_FOR_PICKING',...",declare,954.0,134.241,6779.717,0.563,0.846,1.551,2.948,954.0,1.0,2238,True
4,"[""alternate_response:('ORDER_CLOSED', 'ORDER_A...",declare,954.0,134.241,6779.717,0.563,0.846,1.551,2.948,954.0,1.0,2238,True


### Ranking per encoding

In [2]:
import pandas as pd
import numpy as np

# Use the filtered DataFrame from the previous cell if available; else fall back to df
_base = globals().get("filtered", globals().get("df"))
if _base is None:
    raise RuntimeError("No DataFrame named 'filtered' or 'df' found. Run the previous cell first.")
data = _base.copy()

# --- Configuration ---
ENC_COL = "Feature Encoding"
METRICS = ["LB odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]
TOP_K = 2  # change to 3 if you want top-3
TIE_BREAKERS = ["Support LHS", "LB odds ratio"]  # higher is better

# --- Sanity checks ---
missing = [c for c in [ENC_COL] + METRICS if c not in data.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

def _rank_within_group(g: pd.DataFrame, col: str) -> pd.Series:
    """
    Rank a single metric within an encoding group (higher is better).
    NaNs get worst rank (group_size + 1).
    """
    r = g[col].rank(method="dense", ascending=False)
    return r.fillna(len(g) + 1)

# --- Compute per-metric ranks within each encoding ---
rank_cols = []
for m in METRICS:
    rcol = f"rank::{m}"
    data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)
    rank_cols.append(rcol)

# --- Aggregate ranks: lower is better (since rank 1 is best) ---
data["rank_agg"] = data[rank_cols].mean(axis=1)

# --- Sort for selection: primary by aggregated rank; tie-breakers by higher support/LB OR ---
sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in data.columns]
ascending = [True] + [False] * (len(sort_cols) - 1)

scored = data.sort_values(sort_cols, ascending=ascending)

# --- Pick top-k per encoding ---
topk_per_encoding = (
    scored.groupby(ENC_COL, group_keys=False)
          .head(TOP_K)
          .reset_index(drop=True)
)

print(f"Selected top-{TOP_K} rules per encoding using rank aggregation over: {', '.join(METRICS)}")
topk_per_encoding


Selected top-2 rules per encoding using rank aggregation over: LB odds ratio, Support LHS, Confidence, Lift, Conviction


  data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)
  data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)
  data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)
  data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)
  data[rcol] = data.groupby(ENC_COL, group_keys=False).apply(_rank_within_group, col=m)


Unnamed: 0,Rule,Feature Encoding,Odds ratio,LB odds ratio,UB odds ratio,Support LHS,Confidence,Lift,Conviction,n12,n21,Fair set count,Stratified,rank::LB odds ratio,rank::Support LHS,rank::Confidence,rank::Lift,rank::Conviction,rank_agg
0,['trace:Label|first|discrete_0.0'] --> !Label,payload,1485.0,209.037,10549.459,0.455,1.0,2.199,inf,1485.0,1.0,2970,True,1.0,2.0,1.0,1.0,1.0,1.2
1,['trace:Label|first|discrete_1.0'] --> Label,payload,976.0,137.34,6935.902,0.545,1.0,1.834,inf,976.0,1.0,1952,True,2.0,1.0,1.0,2.0,1.0,1.4
2,"['responded_existence:(PICKED,ORDER_ACKNOWLEDG...",bs_dwd,99.0,13.808,709.808,0.447,0.988,2.172,44.456,99.0,1.0,276,True,1.0,4.0,1.0,1.0,1.0,1.6
3,"['not_chain_response:(RELEASED_FOR_PICKING,PAC...",dwd,25.0,3.387,184.508,0.367,0.983,2.161,31.526,25.0,1.0,636,True,3.0,4.0,1.0,1.0,1.0,2.0
4,"['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...",seq_combined,244.0,34.232,1739.2,0.36,1.0,2.199,inf,244.0,1.0,716,True,2.0,5.0,1.0,1.0,1.0,2.0
5,"['mr[ORDER_ACKNOWLEDGED-complete, ORDER_TIMEST...",seq_combined,296.0,41.556,2108.364,0.64,0.853,1.563,3.085,296.0,1.0,832,True,1.0,4.0,2.0,2.0,2.0,2.2
6,"[""responded_existence:('RELEASED_FOR_PICKING',...",dec_dwd,134.0,18.738,958.28,0.36,1.0,2.199,inf,134.0,1.0,2286,True,2.0,6.0,1.0,1.0,1.0,2.2
7,"[""responded_existence:('PACKED', 'ORDER_ACKNOW...",dec_dwd,134.0,18.738,958.28,0.36,1.0,2.199,inf,134.0,1.0,2286,True,2.0,6.0,1.0,1.0,1.0,2.2
8,"[""responded_existence:('RELEASED_FOR_PICKING',...",declare,954.0,134.241,6779.717,0.563,0.846,1.551,2.948,954.0,1.0,2238,True,1.0,5.0,2.0,2.0,2.0,2.4
9,"[""alternate_response:('ORDER_CLOSED', 'ORDER_A...",declare,954.0,134.241,6779.717,0.563,0.846,1.551,2.948,954.0,1.0,2238,True,1.0,5.0,2.0,2.0,2.0,2.4


### Ranking over all encodings

In [3]:
import pandas as pd
import numpy as np

# Use the filtered DataFrame from previous cell if available; else fall back to df
_base = globals().get("filtered", globals().get("df"))
if _base is None:
    raise RuntimeError("No DataFrame named 'filtered' or 'df' found. Run the previous cell first.")
data = _base.copy()

# --- Configuration ---
METRICS = ["Odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]
TOP_K = 5  # set to 3 if you want top-3 overall
TIE_BREAKERS = ["Support LHS", "Odds ratio"]  # higher is better
ENC_COL = "Feature Encoding"
RULE_COL = "Rule"

# --- Sanity checks ---
missing = [c for c in METRICS if c not in data.columns]
if missing:
    raise ValueError(f"Missing expected metric columns: {missing}")

# --- 1) Rank each metric globally (higher is better -> rank 1 is best) ---
rank_cols = []
for m in METRICS:
    rcol = f"rank::{m}"
    data[rcol] = data[m].rank(method="dense", ascending=False)
    rank_cols.append(rcol)

# --- 2) Aggregate ranks (lower is better) ---
data["rank_agg"] = data[rank_cols].mean(axis=1)

# --- 3) Sort by aggregated rank, then tie-breakers ---
sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in data.columns]
ascending = [True] + [False] * (len(sort_cols) - 1)

scored_overall = data.sort_values(sort_cols, ascending=ascending).reset_index(drop=True)

# --- 4) Pick top-k overall ---
topk_overall = scored_overall.head(TOP_K).copy()

print(f"Selected top-{TOP_K} rules overall using rank aggregation over: {', '.join(METRICS)}")

# --- 5) Display with 'Rule' first ---
display_cols = []
if RULE_COL in topk_overall.columns:
    display_cols.append(RULE_COL)
if ENC_COL in topk_overall.columns:
    display_cols.append(ENC_COL)
display_cols += [c for c in METRICS if c in topk_overall.columns] + ["rank_agg"]

topk_overall.loc[:, display_cols]


Selected top-5 rules overall using rank aggregation over: Odds ratio, Support LHS, Confidence, Lift, Conviction


Unnamed: 0,Rule,Feature Encoding,Odds ratio,Support LHS,Confidence,Lift,Conviction,rank_agg
0,['trace:Label|first|discrete_0.0'] --> !Label,payload,1485.0,0.455,1.0,2.199,inf,11.6
1,['trace:Label|first|discrete_1.0'] --> Label,payload,976.0,0.545,1.0,1.834,inf,14.2
2,"[""alternate_precedence:('PACKED', 'ORDER_ACKNO...",dec_data,905.0,0.36,1.0,2.199,inf,14.8
3,"[""responded_existence:('RELEASED_FOR_PICKING',...",dec_data,905.0,0.36,1.0,2.199,inf,14.8
4,"[""responded_existence:('PICKED', 'ORDER_ACKNOW...",hybrid,304.0,0.361,0.999,2.198,969.258,18.2


### Global but then only positive deviance

In [4]:
import pandas as pd
import numpy as np
import re

# Use the filtered DataFrame from previous cell if available; else fall back to df
_base = globals().get("filtered", globals().get("df"))
if _base is None:
    raise RuntimeError("No DataFrame named 'filtered' or 'df' found. Run the previous cell first.")
data = _base.copy()

# --- 0) Keep only rules with RHS == 'Label' (exclude '!Label') ---
# This matches "... --> Label" at the end of the Rule string, ignoring trailing spaces.
rhs_label_mask = data["Rule"].astype(str).str.contains(r"-->\s*Label\s*$", regex=True, na=False)
data = data[rhs_label_mask].copy()

# --- Configuration ---
METRICS = ["Odds ratio", "Support LHS", "Confidence", "Lift", "Conviction"]
TOP_K = 50  # set to 3 if you want top-3 overall
TIE_BREAKERS = ["Support LHS", "Odds ratio"]  # higher is better
ENC_COL = "Feature Encoding"
RULE_COL = "Rule"

# --- Sanity checks ---
missing = [c for c in METRICS if c not in data.columns]
if missing:
    raise ValueError(f"Missing expected metric columns: {missing}")

# --- 1) Rank each metric globally (higher is better -> rank 1 is best) ---
rank_cols = []
for m in METRICS:
    rcol = f"rank::{m}"
    data[rcol] = data[m].rank(method="dense", ascending=False)
    rank_cols.append(rcol)

# --- 2) Aggregate ranks (lower is better) ---
data["rank_agg"] = data[rank_cols].mean(axis=1)

# --- 3) Sort by aggregated rank, then tie-breakers ---
sort_cols = ["rank_agg"] + [c for c in TIE_BREAKERS if c in data.columns]
ascending = [True] + [False] * (len(sort_cols) - 1)

scored_overall = data.sort_values(sort_cols, ascending=ascending).reset_index(drop=True)

# --- 4) Pick top-k overall ---
topk_overall = scored_overall.head(TOP_K).copy()

print(
    f"Selected top-{TOP_K} rules overall with RHS == 'Label', "
    f"using rank aggregation over: {', '.join(METRICS)} "
    f"(kept {len(data)} rules after RHS filter)."
)

# --- 5) Display with 'Rule' first ---
display_cols = []
if RULE_COL in topk_overall.columns:
    display_cols.append(RULE_COL)
if ENC_COL in topk_overall.columns:
    display_cols.append(ENC_COL)
display_cols += [c for c in METRICS if c in topk_overall.columns] + ["rank_agg"]

topk_overall.loc[:, display_cols]


Selected top-50 rules overall with RHS == 'Label', using rank aggregation over: Odds ratio, Support LHS, Confidence, Lift, Conviction (kept 420 rules after RHS filter).


Unnamed: 0,Rule,Feature Encoding,Odds ratio,Support LHS,Confidence,Lift,Conviction,rank_agg
0,['trace:Label|first|discrete_1.0'] --> Label,payload,976.0,0.545,1.0,1.834,inf,4.6
1,"['responded_existence:(PACKED,ORDER_ACKNOWLEDG...",bs_dwd,97.0,0.483,0.975,1.788,18.343,16.4
2,"['responded_existence:(RELEASED_FOR_PICKING,OR...",bs_dwd,97.0,0.483,0.975,1.788,18.343,16.4
3,"['responded_existence:(PACKED,ORDER_ACKNOWLEDG...",hybrid_dwd,91.0,0.483,0.975,1.788,18.343,16.6
4,"['responded_existence:(RELEASED_FOR_PICKING,OR...",hybrid_dwd,91.0,0.483,0.975,1.788,18.343,16.6
5,"['mra[ORDER_OPENED-complete, SHIPUNIT_BUILT-co...",seq_combined_data,8.0,0.257,0.988,1.812,37.588,24.8
6,"['not_chain_response:(PACKED,PICKED):Data_binn...",dec_dwd,15.029,0.482,0.965,1.77,13.122,27.4
7,"['not_chain_response:(RELEASED_FOR_PICKING,PAC...",bs_dwd,10.0,0.477,0.966,1.772,13.528,29.2
8,"['chain_precedence:(RELEASED_FOR_PICKING,PICKE...",hybrid_dwd_data,14.2,0.403,0.953,1.748,9.734,29.6
9,"['mr[PACKED-complete, ORDER_ACKNOWLEDGED-compl...",seq_combined_data,64.0,0.017,0.992,1.819,57.059,29.8


In [5]:
# --- Config ---
OUT_DIR = Path("5_analysis")
OUT_NAME = "top_deviant_rules_dhl.tex"   # change if you like
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_PATH = OUT_DIR / OUT_NAME

# --- Columns to export (exclude rank_agg if present) ---
wanted_cols = [
    "Rule",
    "Feature Encoding",
    "Odds ratio",
    "Support LHS",
    "Confidence",
    "Lift",
    "Conviction",
]
available_cols = [c for c in wanted_cols if c in topk_overall.columns]
df_export = topk_overall[available_cols].copy()

# --- Detokenize helper: wrap strings so LaTeX won't choke on special chars ---
def detok(series: pd.Series) -> pd.Series:
    return (
        series.astype(str)
        .fillna("")
        .map(lambda x: rf"\detokenize{{{x}}}" if x != "" else "")
    )

# --- Apply detokenize to text columns ---
for col in ["Rule", "Feature Encoding"]:
    if col in df_export.columns:
        df_export[col] = detok(df_export[col])

# --- Write LaTeX (no escaping, because we use \detokenize) ---
latex_str = df_export.to_latex(
    index=False,
    escape=False,        # keep \detokenize intact
    longtable=False,     # set True if you want a longtable
    float_format="%.3f", # tweak as needed
)

OUT_PATH.write_text(latex_str, encoding="utf-8")
print(f"Wrote LaTeX table to: {OUT_PATH}")








Wrote LaTeX table to: 5_analysis\top_deviant_rules_dhl.tex
