### 0. Initialization

In [None]:
import os
import sys
import shutil
from typing import Callable, Dict, List, Tuple

import numpy as np
import pandas as pd
import yaml
from skfeature.function.similarity_based import fisher_score

# Add repository root to PYTHONPATH for local imports.
# The root is taken from the current working directory, so the notebook has to
# be started from the repository root. The membership guard keeps sys.path
# from growing every time this cell is re-run.
PROJECT_ROOT = os.path.abspath(os.getcwd())
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Load configuration. Decode explicitly as UTF-8 instead of relying on the
# platform's locale encoding, which differs between operating systems.
with open(
    os.path.join(PROJECT_ROOT, "config", "config_feature_selection.yaml"),
    "r",
    encoding="utf-8",
) as f:
    cfg = yaml.safe_load(f)

# Configured dataset identifiers (both keys are required in the YAML file)
EVENT_LOG = cfg["event_log"]
EXP_NAME  = cfg["experiment_name"]


### Preprocessing for IMPresseD
1. Renames training_encoded_log.csv → IMPresseD.csv
2. Renames column Outcome → Label inside IMPresseD.csv
3. Moves all other files from each IMPresseD encoding folder into PROJECT_ROOT/IMPresseD_patterns/<dataset>/<labeling>/ (IMPresseD.csv stays in place)

In [None]:
# Normalise every <root>/3_extracted_features/<dataset>/<labeling>/IMPresseD/
# folder: canonical CSV name, canonical label column, auxiliary files moved out.
processed_root = os.path.join(PROJECT_ROOT, "3_extracted_features")
target_root    = os.path.join(PROJECT_ROOT, "IMPresseD_patterns")
os.makedirs(target_root, exist_ok=True)

# Walk: <root>/3_extracted_features/<dataset>/<labeling>/IMPresseD/
for dataset in sorted(os.listdir(processed_root)):
    dataset_dir = os.path.join(processed_root, dataset)
    if not os.path.isdir(dataset_dir):
        continue

    for labeling in sorted(os.listdir(dataset_dir)):
        labeling_dir = os.path.join(dataset_dir, labeling)
        if not os.path.isdir(labeling_dir):
            continue

        imp_dir = os.path.join(labeling_dir, "IMPresseD")
        if not os.path.isdir(imp_dir):
            continue

        old_csv = os.path.join(imp_dir, "training_encoded_log.csv")
        new_csv = os.path.join(imp_dir, "IMPresseD.csv")

        # 1) Ensure canonical CSV filename (never overwrite an existing one)
        try:
            if os.path.exists(old_csv) and not os.path.exists(new_csv):
                os.rename(old_csv, new_csv)
                print(f"[RENAME] {old_csv} -> {new_csv}")
            elif os.path.exists(new_csv):
                print(f"[OK] Canonical file present: {new_csv}")
            else:
                print(f"[WARN] Missing training file in {imp_dir} (skip rename)")
        except Exception as e:
            print(f"[ERROR] Rename in {imp_dir}: {e}")

        # 2) Normalize column name: Outcome -> Label
        if os.path.exists(new_csv):
            try:
                df = pd.read_csv(new_csv)
                if "Outcome" in df.columns:
                    df = df.rename(columns={"Outcome": "Label"})
                    df.to_csv(new_csv, index=False)
                    print(f"[COLUMNS] Renamed 'Outcome' → 'Label' in {new_csv}")
                else:
                    print(f"[COLUMNS] No 'Outcome' column in {new_csv}")
            except Exception as e:
                print(f"[ERROR] Read/write {new_csv}: {e}")

        # 3) Move auxiliary files to <root>/IMPresseD_patterns/<dataset>/<labeling>/
        out_dir = os.path.join(target_root, dataset, labeling)
        os.makedirs(out_dir, exist_ok=True)

        try:
            # sorted() snapshots the listing, so moving files does not affect
            # the iteration and the log order is reproducible.
            for fname in sorted(os.listdir(imp_dir)):
                if fname == "IMPresseD.csv":
                    continue  # keep canonical CSV in place

                fpath = os.path.join(imp_dir, fname)
                if not os.path.isfile(fpath):
                    continue

                dest = os.path.join(out_dir, fname)

                # Overwrite destination if necessary. Decide by inspecting the
                # path: the exception os.remove() raises for a directory is
                # platform-dependent, so it cannot be used to detect one.
                if os.path.isdir(dest) and not os.path.islink(dest):
                    shutil.rmtree(dest)
                elif os.path.lexists(dest):
                    os.remove(dest)

                shutil.move(fpath, dest)
                print(f"[MOVE] {dataset}/{labeling}: {fname} -> {dest}")
        except Exception as e:
            print(f"[ERROR] Moving extras from {imp_dir}: {e}")

        # 4) Remove IMPresseD directory if no files remain.
        #    os.rmdir() only succeeds on a completely empty directory; leftover
        #    sub-directories end up in the [WARN] branch below.
        try:
            remaining = [
                f for f in os.listdir(imp_dir)
                if os.path.isfile(os.path.join(imp_dir, f))
            ]
            if not remaining:
                os.rmdir(imp_dir)
                print(f"[CLEAN] Removed empty directory {imp_dir}")
        except Exception as e:
            print(f"[WARN] Cleanup {imp_dir} skipped: {e}")

### 1. Define functions for Fisher scoring (coverage and top-k)

In [None]:
# Fisher scoring for binary classification (standalone)
def _fisher_scores_binary(X: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Fisher score of every column of ``X`` for a 0/1 label vector ``y``.

    Each score is the size-weighted squared distance of the class means from
    the overall mean, divided by the size-weighted sum of the per-class
    (population, ddof=0) variances. Columns whose denominator is not
    positive are scored 0.

    Raises:
        ValueError: if ``y`` holds any value other than 0 or 1.
    """
    labels = np.asarray(y).ravel()
    if not set(np.unique(labels)).issubset({0, 1}):
        raise ValueError("Labels must be binary (0/1).")

    def _class_stats(member_mask):
        # (size, mean, variance) of one class; an empty class contributes zeros.
        size = member_mask.sum()
        if not size:
            return size, np.zeros(X.shape[1]), np.zeros(X.shape[1])
        rows = X[member_mask]
        return size, rows.mean(axis=0), rows.var(axis=0, ddof=0)

    grand_mean = X.mean(axis=0)
    n_one, mean_one, var_one = _class_stats(labels == 1)
    n_zero, mean_zero, var_zero = _class_stats(labels == 0)

    numerator = n_one * (mean_one - grand_mean) ** 2 + n_zero * (mean_zero - grand_mean) ** 2
    denominator = n_one * var_one + n_zero * var_zero

    # The division runs over every column; np.where then discards the entries
    # whose denominator is not positive, so the warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = numerator / denominator
        result = np.where(denominator > 0, ratio, 0.0).astype(float)
    return result


In [None]:
# Fisher helpers (prefer scikit-feature; fallback to local implementation)
def _rank_by_fisher(X: np.ndarray, y: np.ndarray):
    """
    Rank the columns of ``X`` by Fisher score, best first.

    Returns a pair ``(rank_idx, scores)``. ``scores`` is ``None`` when
    scikit-feature produced the ranking directly via ``mode="rank"``;
    otherwise it is the raw score vector the ranking was derived from.

    Strategy:
      1. scikit-feature called with ``mode="rank"``;
      2. on ``TypeError`` (keyword rejected), scikit-feature raw scores
         sorted in descending order;
      3. on any other failure, including a failed import, the local
         ``_fisher_scores_binary`` implementation.
    """
    try:
        from skfeature.function.similarity_based import fisher_score as sk_fisher

        try:
            ranking = sk_fisher.fisher_score(X, y, mode="rank")
        except TypeError:
            raw_scores = sk_fisher.fisher_score(X, y)
            return np.argsort(-raw_scores), raw_scores
        return ranking, None
    except Exception:
        local_scores = _fisher_scores_binary(X, y)
        return np.argsort(-local_scores), local_scores


# Top-k Fisher selector (single DataFrame)
def fisher_topk_select(
    df: pd.DataFrame,
    label_col: str = "Label",
    case_id_col: str = "Case_ID",
    k: int = 100,
    drop_constant: bool = True,
    drop_duplicate_columns: bool = True,
    fillna_value: float = 0.0,
    return_features_only: bool = False,
):
    """
    Rank numeric features by Fisher score and keep the top ``k``.

    Pipeline: binarise the label, take the numeric columns (excluding the
    label and case-id columns), fill missing values, drop duplicate and
    constant columns, rank what is left with ``_rank_by_fisher`` and keep
    the first ``k`` ranked columns.

    Args:
        df: input table containing ``label_col``.
        label_col: label column; bool, 0/1, or any two-valued column
            (two-valued columns are mapped to 0/1 by category code).
        case_id_col: identifier column, kept in the output if present.
        k: number of features to keep; values above the number of
            available features are capped, values below 0 count as 0.
        drop_constant: drop columns equal to their first row everywhere.
        drop_duplicate_columns: drop columns identical to an earlier one.
        fillna_value: replacement for missing feature values.
        return_features_only: if True the output holds only the selected
            feature columns, otherwise ``case_id_col``/``label_col`` too.

    Returns:
        ``(selected_df, selected_cols, info)`` where ``info`` is a dict with
        the method name, ``k``, counts and the full ranking with scores.

    Raises:
        KeyError: ``label_col`` is missing.
        ValueError: the label column is not binary.
    """
    if label_col not in df.columns:
        raise KeyError(f"Label column '{label_col}' not found.")

    # Labels → binary {0,1}
    y = df[label_col]
    if y.dtype == bool:
        y = y.astype(int)
    if not set(pd.unique(y)).issubset({0, 1}):
        y = pd.Series(pd.Categorical(y).codes, index=y.index)
        # Missing labels get code -1 and more than two categories give codes
        # above 1; both are rejected here.
        if not set(pd.unique(y)).issubset({0, 1}):
            raise ValueError("Label column must be binary (0/1 or bool).")

    # Columns to retain in the output
    id_cols = [c for c in [case_id_col, label_col] if c in df.columns]

    # Numeric feature pool (exclude ID columns). Missing values are filled
    # BEFORE filtering so that the duplicate/constant checks see exactly the
    # values that are scored; NaN never compares equal, so a column such as
    # [NaN, 0, 0] would otherwise pass the constant filter and then be scored
    # as a constant column.
    feats_df = (
        df.drop(columns=id_cols, errors="ignore")
        .select_dtypes(include=["number"])
        .fillna(fillna_value)
    )

    # Remove duplicate and constant columns
    if drop_duplicate_columns and feats_df.shape[1] > 1:
        feats_df = feats_df.T.drop_duplicates().T
    if drop_constant and not feats_df.empty:
        non_constant = (feats_df != feats_df.iloc[0]).any()
        feats_df = feats_df.loc[:, non_constant]

    X = feats_df.to_numpy(dtype=float)
    y_arr = y.to_numpy(dtype=int)

    if X.shape[1] == 0:
        out_df = df[id_cols].copy() if not return_features_only else df.iloc[:, 0:0].copy()
        return out_df, [], {
            "method": "fisher_topk",
            "reason": "no_features_after_filtering",
            "selected_count": 0,
            "postfilter_features": 0,
            "k": k,
        }

    rank_idx, scores = _rank_by_fisher(X, y_arr)

    # Clamp to [0, n_features]: a negative k would otherwise slice from the
    # end of the ranking and keep all but the last |k| features.
    k_eff = max(0, min(k, X.shape[1]))
    keep_idx = rank_idx[:k_eff]
    selected_cols = feats_df.columns[keep_idx].tolist()

    # Assemble output
    if return_features_only:
        selected_df = feats_df[selected_cols].copy()
    else:
        selected_df = pd.concat([df[id_cols].copy(), feats_df[selected_cols]], axis=1)

    # The ranking helper may return indices only; compute scores for reporting.
    if scores is None:
        scores = _fisher_scores_binary(X, y_arr)

    info = {
        "method": "fisher_topk",
        "k": k,
        "selected_count": len(selected_cols),
        "postfilter_features": feats_df.shape[1],
        "ranked_features": feats_df.columns[rank_idx].tolist(),
        "ranked_scores": scores[rank_idx].tolist(),
    }
    return selected_df, selected_cols, info


# Greedy coverage selector (single DataFrame)
def fisher_coverage_select(
    df: pd.DataFrame,
    label_col: str = "Label",
    case_id_col: str = "Case_ID",
    coverage_threshold: int = 20,
    positive_predicate: Callable[[pd.Series], pd.Series] = None,
    drop_constant: bool = True,
    drop_duplicate_columns: bool = True,
    fillna_value: float = 0.0,
    return_features_only: bool = False,
) -> Tuple[pd.DataFrame, List[str], Dict]:
    """
    Fisher ranking combined with greedy coverage selection.

    Features are visited in Fisher-rank order. A feature is selected when it
    is "positive" for at least one row that has not yet collected
    ``coverage_threshold`` positive selected features; selection stops once
    no row needs more coverage.

    Args:
        df: input table containing ``label_col``.
        label_col: label column; bool, 0/1, or any two-valued column
            (two-valued columns are mapped to 0/1 by category code).
        case_id_col: identifier column, kept in the output if present.
        coverage_threshold: positive features wanted per row.
        positive_predicate: maps a feature column (Series) to a boolean
            Series marking the rows it covers; default is ``value > 0``.
        drop_constant: drop columns equal to their first row everywhere.
        drop_duplicate_columns: drop columns identical to an earlier one.
        fillna_value: replacement for missing feature values.
        return_features_only: if True the output holds only the selected
            feature columns, otherwise ``case_id_col``/``label_col`` too.

    Returns:
        ``(selected_df, selected_cols, info)``. In ``info``:
        ``coverage_per_row`` is the coverage each row reached, capped at
        ``coverage_threshold``; ``attained_min`` is its minimum;
        ``unattainable_rows`` lists the positions of rows that cannot reach
        the threshold even with every post-filter feature.

    Raises:
        KeyError: ``label_col`` is missing.
        ValueError: the label column is not binary.
    """
    if label_col not in df.columns:
        raise KeyError(f"Label column '{label_col}' not found.")

    # Labels → binary {0,1}
    y = df[label_col]
    if y.dtype == bool:
        y = y.astype(int)
    if not set(pd.unique(y)).issubset({0, 1}):
        y = pd.Series(pd.Categorical(y).codes, index=y.index)
        # Missing labels get code -1 and more than two categories give codes
        # above 1; both are rejected here.
        if not set(pd.unique(y)).issubset({0, 1}):
            raise ValueError("Label column must be binary (0/1 or bool).")

    # Columns to retain in the output
    id_cols = [c for c in [case_id_col, label_col] if c in df.columns]

    # Numeric feature pool (exclude ID columns). Missing values are filled
    # BEFORE filtering so that the duplicate/constant checks see exactly the
    # values that are scored; NaN never compares equal, so a column such as
    # [NaN, 0, 0] would otherwise pass the constant filter.
    feats_df = (
        df.drop(columns=id_cols, errors="ignore")
        .select_dtypes(include=["number"])
        .fillna(fillna_value)
    )

    # Remove duplicate and constant columns
    if drop_duplicate_columns and feats_df.shape[1] > 1:
        feats_df = feats_df.T.drop_duplicates().T
    if drop_constant and not feats_df.empty:
        non_constant = (feats_df != feats_df.iloc[0]).any()
        feats_df = feats_df.loc[:, non_constant]

    X = feats_df.to_numpy(dtype=float)
    y_arr = y.to_numpy(dtype=int)

    if X.shape[1] == 0:
        out_df = df[id_cols].copy() if not return_features_only else df.iloc[:, 0:0].copy()
        return out_df, [], {
            "method": "coverage",
            "reason": "no_features_after_filtering",
            "coverage_threshold": coverage_threshold,
            "coverage_per_row": np.zeros(len(df), dtype=int).tolist(),
            "unattainable_rows": list(range(len(df))),
            "attained_min": 0,
            "selected_count": 0,
            "postfilter_features": 0,
        }

    rank_idx, scores = _rank_by_fisher(X, y_arr)

    # Row-by-feature boolean matrix: does feature j cover row i? The same
    # matrix drives both the attainability report and the greedy loop, so a
    # custom predicate is honoured consistently in both places.
    if positive_predicate is None:
        positives_matrix = X > 0
    else:
        positives_matrix = np.column_stack([
            np.asarray(positive_predicate(feats_df.iloc[:, j]), dtype=bool)
            for j in range(feats_df.shape[1])
        ])

    max_attainable_per_row = positives_matrix.sum(axis=1)
    unattainable_rows = np.where(max_attainable_per_row < coverage_threshold)[0].tolist()

    # needed[i] = positive features row i still lacks; it never drops below 0
    # because a row is only decremented while it still needs coverage.
    needed = np.full(X.shape[0], coverage_threshold, dtype=int)
    selected_indices: List[int] = []

    for j in rank_idx:
        still_needed = needed > 0
        if not still_needed.any():
            break  # every row is satisfied

        helps_mask = positives_matrix[:, j] & still_needed
        if not helps_mask.any():
            continue  # feature adds nothing to rows that still need coverage

        selected_indices.append(int(j))
        needed[helps_mask] -= 1

    selected_cols = feats_df.columns[selected_indices].tolist()

    # Assemble output
    if return_features_only:
        selected_df = feats_df[selected_cols].copy()
    else:
        selected_df = pd.concat([df[id_cols].copy(), feats_df[selected_cols]], axis=1)

    # The ranking helper may return indices only; compute scores for reporting.
    if scores is None:
        scores = _fisher_scores_binary(X, y_arr)

    # Coverage reached = wanted - still missing (0 for a non-positive threshold).
    coverage_per_row = max(int(coverage_threshold), 0) - needed.clip(min=0)

    info = {
        "method": "coverage",
        "coverage_threshold": coverage_threshold,
        "coverage_per_row": coverage_per_row.tolist(),
        "unattainable_rows": unattainable_rows,
        "attained_min": int(coverage_per_row.min()) if len(df) else 0,
        "selected_count": len(selected_cols),
        "postfilter_features": feats_df.shape[1],
        "ranked_features": feats_df.columns[rank_idx].tolist(),
        "ranked_scores": scores[rank_idx].tolist(),
    }
    return selected_df, selected_cols, info

### 1.1 Process all logs

In [None]:
# Batch feature selection runner

# Selection method: "coverage" (greedy per-row coverage, fisher_coverage_select)
# or "fisher_topk" (top-k ranking, fisher_topk_select). Any other value makes
# each encoding fail with "Unknown SELECTION_METHOD" in the batch loop below.
SELECTION_METHOD = "coverage"

# I/O roots (relative paths, resolved against the current working directory):
# IN_ROOT is scanned as <dataset>/<labeling>/<encoding>/, OUT_ROOT receives
# the selected CSVs in the same layout plus the summary CSV.
IN_ROOT  = "3_extracted_features"
OUT_ROOT = "3.1_selected_features"

# Resolve a labeling folder name from an experiment name
def _resolve_labeling_folder(experiment_name):
    """
    Map an experiment name to its labeling folder name.

    ``None``, blank text, ``"all"`` (any casing) and ``"*"`` mean "no
    filter" and yield ``None``. Any other value is stringified, stripped
    and suffixed with ``_features`` unless it already ends with it.
    """
    suffix = "_features"
    cleaned = "" if experiment_name is None else str(experiment_name).strip()
    if cleaned.lower() in ("", "all", "*"):
        return None
    if cleaned.endswith(suffix):
        return cleaned
    return cleaned + suffix

# Optional filters supplied via config (if defined earlier).
# A missing EVENT_LOG / EXP_NAME (config cell not run) means "no filter".
try:
    _event_log_raw = EVENT_LOG
except NameError:
    _event_log_raw = None

# None, blank, "all" and "*" all select every dataset.
_event_log_name = "" if _event_log_raw is None else str(_event_log_raw).strip()
DATASET_FILTER = _event_log_name if _event_log_name.lower() not in {"", "all", "*"} else None

LABELING_FILTER = None
try:
    LABELING_FILTER = _resolve_labeling_folder(EXP_NAME)
except NameError:
    # keep the "no filter" default assigned above
    pass

# Method parameters (defaults + optional overrides).
# Override keys are either an encoding name or a
# (dataset, labeling, encoding) tuple; the tuple form takes precedence.
THRESHOLD_DEFAULT = 10
THRESHOLD_OVERRIDE = {
    # "declare": 11,
    # ("traffic", "traffic_decl3_features", "declare"): 11,
}
K_DEFAULT = 100
K_OVERRIDE = {
    # "declare": 150,
    # ("sepsis", "sepsis_mr_tr_features", "declare"): 200,
}

def _get_threshold(dataset: str, labeling: str, encoding: str) -> int:
    """
    Coverage threshold for one (dataset, labeling, encoding) combination.

    Lookup order in THRESHOLD_OVERRIDE: the full triple first, then the
    bare encoding name; THRESHOLD_DEFAULT if neither key is present.
    """
    for candidate in ((dataset, labeling, encoding), encoding):
        if candidate in THRESHOLD_OVERRIDE:
            return THRESHOLD_OVERRIDE[candidate]
    return THRESHOLD_DEFAULT

def _get_k(dataset: str, labeling: str, encoding: str) -> int:
    """
    Top-k size for one (dataset, labeling, encoding) combination.

    Lookup order in K_OVERRIDE: the full triple first, then the bare
    encoding name; K_DEFAULT if neither key is present.
    """
    specific_key = (dataset, labeling, encoding)
    if specific_key in K_OVERRIDE:
        return K_OVERRIDE[specific_key]
    return K_OVERRIDE[encoding] if encoding in K_OVERRIDE else K_DEFAULT

def _find_csv_path(encoding_dir: str, encoding_name: str) -> str:
    """
    Locate the feature CSV inside an encoding folder.

    Preference order:
      1. ``<encoding_dir>/<encoding_name>.csv`` if it is a file;
      2. otherwise the only ``*.csv`` file in the folder (extension matched
         case-insensitively).

    Raises:
        FileNotFoundError: the folder holds no CSV file, or several
            candidates and none carries the expected name (the message
            lists the candidates).
    """
    expected = os.path.join(encoding_dir, f"{encoding_name}.csv")
    if os.path.isfile(expected):
        return expected

    # Regular files only (a directory named "*.csv" is not a candidate);
    # sorted so the error message is reproducible.
    csvs = sorted(
        f for f in os.listdir(encoding_dir)
        if f.lower().endswith(".csv") and os.path.isfile(os.path.join(encoding_dir, f))
    )
    if len(csvs) == 1:
        return os.path.join(encoding_dir, csvs[0])
    if not csvs:
        raise FileNotFoundError(f"No CSV found for encoding '{encoding_name}' in {encoding_dir}")
    raise FileNotFoundError(
        f"Ambiguous CSV for encoding '{encoding_name}' in {encoding_dir}: "
        f"expected '{encoding_name}.csv' or exactly one CSV file, found {csvs}"
    )

def _ensure_out_dir(path: str) -> None:
    """Create directory ``path`` (with missing parents); an existing directory is not an error."""
    os.makedirs(path, exist_ok=True)

def _count_original_features(df: pd.DataFrame, label_col: str = "Label", case_id_col: str = "Case_ID") -> int:
    """Count the numeric columns of ``df``, ignoring the label and case-id columns."""
    excluded = [label_col, case_id_col]
    numeric_part = df.drop(columns=excluded, errors="ignore").select_dtypes(include=["number"])
    return len(numeric_part.columns)

def _count_postfilter_features(df: pd.DataFrame, label_col: str = "Label", case_id_col: str = "Case_ID") -> int:
    """
    Count the numeric feature columns left after filtering.

    Label and case-id columns are excluded. With more than one candidate,
    columns identical to an earlier column are dropped; then columns whose
    values all equal their first row are dropped.
    """
    candidates = (
        df.drop(columns=[label_col, case_id_col], errors="ignore")
        .select_dtypes(include=["number"])
        .copy()
    )
    if candidates.shape[1] > 1:
        # drop_duplicates works on rows, hence the double transpose
        candidates = candidates.T.drop_duplicates().T
    if candidates.empty:
        return candidates.shape[1]
    varies = candidates.ne(candidates.iloc[0]).any()
    return candidates.loc[:, varies].shape[1]

# -------------------- Batch over folders --------------------
# Resolve the datasets to process and reset the run counters.
if not os.path.isdir(IN_ROOT):
    raise FileNotFoundError(f"Input root not found: {IN_ROOT}")

# Every sub-directory of IN_ROOT is a dataset.
datasets = [
    entry for entry in sorted(os.listdir(IN_ROOT))
    if os.path.isdir(os.path.join(IN_ROOT, entry))
]
if DATASET_FILTER and DATASET_FILTER not in datasets:
    raise FileNotFoundError(f"Dataset '{DATASET_FILTER}' not found under {IN_ROOT}. Available: {datasets}")
if DATASET_FILTER:
    datasets = [DATASET_FILTER]

# Counters and collectors filled by the batch loop
total_sets = ok_sets = skipped_sets = 0
errors, summary_rows = [], []

def _summary_row(
    dataset, labeling, encoding, *,
    method, threshold, k, status,
    rows_total=None, original_features=None, postfilter_features=None,
    selected_features=None, unattainable_rows=None, min_attained=None,
    output_csv=None,
):
    """Build one summary record; the key order fixes the summary's column order."""
    return {
        "Dataset": dataset,
        "Labeling": labeling,
        "Encoding": encoding,
        "method": method,
        "threshold": threshold,
        "k": k,
        "rows_total": rows_total,
        "original_features": original_features,
        "postfilter_features": postfilter_features,
        "selected_features": selected_features,
        "unattainable_rows": unattainable_rows,
        "min_attained": min_attained,
        "status": status,
        "output_csv": output_csv,
    }


# The selection method is fixed for the whole run.
_is_coverage = SELECTION_METHOD == "coverage"
_is_topk = SELECTION_METHOD == "fisher_topk"

# Walk IN_ROOT/<dataset>/<labeling>/<encoding>/ and process one CSV per encoding.
for dataset in datasets:
    dataset_path = os.path.join(IN_ROOT, dataset)
    labelings = sorted(
        name for name in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, name))
    )
    if LABELING_FILTER:
        if LABELING_FILTER not in labelings:
            raise FileNotFoundError(f"Labeling '{LABELING_FILTER}' not found under {dataset_path}. Available: {labelings}")
        labelings = [LABELING_FILTER]

    for labeling in labelings:
        labeling_path = os.path.join(dataset_path, labeling)
        encodings = sorted(
            name for name in os.listdir(labeling_path)
            if os.path.isdir(os.path.join(labeling_path, name))
        )

        for encoding in encodings:
            encoding_path = os.path.join(labeling_path, encoding)
            total_sets += 1
            tag = f"{dataset}/{labeling}/{encoding}"  # shared log prefix
            try:
                csv_in = _find_csv_path(encoding_path, encoding)
                df = pd.read_csv(csv_in)
                rows_total = len(df)

                # Without a label there is nothing to rank against: record and move on.
                if "Label" not in df.columns:
                    skipped_sets += 1
                    print(f"[SKIP] {tag}: no 'Label' column found.")
                    summary_rows.append(_summary_row(
                        dataset, labeling, encoding,
                        method=SELECTION_METHOD,
                        threshold=_get_threshold(dataset, labeling, encoding) if _is_coverage else None,
                        k=_get_k(dataset, labeling, encoding) if _is_topk else None,
                        status="SKIP_no_label",
                        rows_total=rows_total,
                        original_features=_count_original_features(df),
                        postfilter_features=_count_postfilter_features(df),
                        selected_features=0,
                    ))
                    continue

                if _is_coverage:
                    thr = _get_threshold(dataset, labeling, encoding)
                    selected_df, selected_cols, info = fisher_coverage_select(
                        df,
                        label_col="Label",
                        case_id_col="Case_ID",
                        coverage_threshold=thr,
                        positive_predicate=None,  # coverage defined as value > 0
                        drop_duplicate_columns=True,
                        drop_constant=True,
                        return_features_only=False,
                    )
                elif _is_topk:
                    k_use = _get_k(dataset, labeling, encoding)
                    selected_df, selected_cols, info = fisher_topk_select(
                        df,
                        label_col="Label",
                        case_id_col="Case_ID",
                        k=k_use,
                        drop_duplicate_columns=True,
                        drop_constant=True,
                        return_features_only=False,
                    )
                else:
                    raise ValueError(f"Unknown SELECTION_METHOD: {SELECTION_METHOD}")

                # Mirror the input layout under OUT_ROOT and keep the input file name.
                out_dir = os.path.join(OUT_ROOT, dataset, labeling, encoding)
                os.makedirs(out_dir, exist_ok=True)
                csv_out = os.path.join(out_dir, os.path.basename(csv_in))
                selected_df.to_csv(csv_out, index=False)

                ok_sets += 1
                if _is_coverage:
                    print(
                        f"[OK] {tag}: "
                        f"{SELECTION_METHOD}, selected {len(selected_cols)} features; "
                        f"min_attained={info.get('attained_min','NA')}/{info.get('coverage_threshold','NA')}; "
                        f"unattainable_rows={len(info.get('unattainable_rows',[]))} -> {csv_out}"
                    )
                else:
                    print(
                        f"[OK] {tag}: "
                        f"{SELECTION_METHOD}, selected {len(selected_cols)}/{info.get('k')} -> {csv_out}"
                    )

                summary_rows.append(_summary_row(
                    dataset, labeling, encoding,
                    method=info.get("method"),
                    threshold=info.get("coverage_threshold") if _is_coverage else None,
                    k=info.get("k") if _is_topk else None,
                    status="OK",
                    rows_total=rows_total,
                    original_features=_count_original_features(df),
                    postfilter_features=info.get("postfilter_features", _count_postfilter_features(df)),
                    selected_features=len(selected_cols),
                    unattainable_rows=len(info.get("unattainable_rows", [])) if _is_coverage else None,
                    min_attained=info.get("attained_min") if _is_coverage else None,
                    output_csv=csv_out,
                ))

            except Exception as e:
                # One failing encoding must not stop the batch: log it, keep an ERR row.
                errors.append((dataset, labeling, encoding, str(e)))
                print(f"[ERR] {tag}: {e}")
                summary_rows.append(_summary_row(
                    dataset, labeling, encoding,
                    method=SELECTION_METHOD,
                    threshold=_get_threshold(dataset, labeling, encoding) if _is_coverage else None,
                    k=_get_k(dataset, labeling, encoding) if _is_topk else None,
                    status=f"ERR: {e}",
                ))

# Build summary: one row per encoding folder visited by the batch loop
summary_df = pd.DataFrame(summary_rows)
if not summary_df.empty:
    summary_df = summary_df.sort_values(["Dataset", "Labeling", "Encoding"]).reset_index(drop=True)

print(f"\n=== Summary ({SELECTION_METHOD}) ===")
print(f"Total sets found:   {total_sets}")
print(f"Processed (OK):     {ok_sets}")
print(f"Skipped (no Label): {skipped_sets}")
print(f"Errors:             {len(errors)}")

display_cols = [
    "Dataset", "Labeling", "Encoding",
    "method", "threshold", "k",
    "rows_total", "original_features", "postfilter_features", "selected_features",
    "unattainable_rows", "min_attained",
]
if not summary_df.empty:
    print("\nFeature selection summary (key fields):")
    print(summary_df[display_cols].to_string(index=False))

# OUT_ROOT is otherwise only created while a selected CSV is written, so it
# does not exist yet when every set was skipped or failed.
os.makedirs(OUT_ROOT, exist_ok=True)
summary_out = os.path.join(OUT_ROOT, f"feature_selection_summary_{SELECTION_METHOD}.csv")
summary_df.to_csv(summary_out, index=False)
print(f"\nSaved full summary to: {summary_out}")

In [None]:
# Export selected columns to LaTeX (clean names and formatting)

from pathlib import Path
import os
import re
import pandas as pd

OUT_ROOT = "3.1_selected_features"
summary_out = os.path.join(OUT_ROOT, f"feature_selection_summary_{SELECTION_METHOD}.csv")

# Reuse in-memory summary_df if available; otherwise read from disk.
# A membership test is used instead of a throw-away assignment to `_`,
# because IPython keeps its last-result reference in `_`.
if "summary_df" not in globals():
    summary_df = pd.read_csv(summary_out)

# Columns to include
keep_cols = [
    "Dataset", "Labeling", "Encoding",
    "original_features", "postfilter_features",
    "selected_features", "unattainable_rows",
]
# reindex() rather than .loc so that an empty summary (no sets found, hence
# no columns at all) produces an empty table instead of a KeyError.
df = summary_df.reindex(columns=keep_cols)

# Strip trailing "_features" from labeling folder names
df["Labeling"] = df["Labeling"].astype(str).str.replace(r"_features$", "", regex=True)

# Convert tokens to readable text: underscores/hyphens → space; Title Case if all-lowercase
def detokenize_text(x: str) -> str:
    """
    Turn an identifier-like token into readable text.

    Runs of underscores, hyphens and whitespace collapse into single
    spaces and the result is trimmed. Text whose cased characters are all
    lowercase is title-cased; anything else keeps its casing. Missing
    values are returned unchanged.
    """
    if pd.isna(x):
        return x
    text = re.sub(r"[_\-\s]+", " ", str(x)).strip()
    # keep acronyms / mixed-case names as-is
    return text.title() if text.islower() else text

df["Labeling"] = df["Labeling"].apply(detokenize_text)
df["Encoding"] = df["Encoding"].apply(detokenize_text)

# Rename columns for presentation
rename_map = {
    "original_features": "Original features",
    "postfilter_features": "Postfilter features",
    "selected_features": "Selected features",
    "unattainable_rows": "Unattainable rows",
}
df = df.rename(columns=rename_map)

# Cast counts to nullable integers (preserve NA as <NA>)
int_cols = ["Original features", "Postfilter features", "Selected features", "Unattainable rows"]
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

# Stable ordering
df = df.sort_values(["Dataset", "Labeling", "Encoding"]).reset_index(drop=True)

# Output path (share stem with the CSV)
latex_out = Path(summary_out).with_suffix(".tex")

# Caption and label follow the active selection method. The caption is
# emitted as raw LaTeX, so underscores (e.g. "fisher_topk") must be escaped;
# labels may contain them unescaped.
method_for_caption = str(SELECTION_METHOD).replace("_", r"\_")

# Write LaTeX longtable (7 columns: 3 text, 4 right-aligned counts)
df.to_latex(
    latex_out,
    index=False,
    escape=True,
    longtable=True,
    column_format="lllrrrr",
    caption=f"Feature selection summary ({method_for_caption}).",
    label=f"tab:feature_selection_summary_{SELECTION_METHOD}",
    na_rep="--",
)

print(f"Wrote LaTeX table to: {latex_out.resolve()}")


### 2.2 Process specific log + encoding (LEGACY)

In [None]:
# # --- Batch Fisher+Coverage selection + summary (config-driven: EVENT_LOG + EXP_NAME) ---

# # Roots
# IN_ROOT  = "3_extracted_features"
# OUT_ROOT = "3.1_selected_features"

# # Coverage threshold: default + optional overrides
# THRESHOLD_DEFAULT = 10
# THRESHOLD_OVERRIDE = {
#     # Examples:
#     # "declare": 11,
#     # ("sepsis", "sepsis_mr_tr_features", "declare"): 11,
# }

# def _get_threshold(dataset: str, labeling: str, encoding: str) -> int:
#     if (dataset, labeling, encoding) in THRESHOLD_OVERRIDE:
#         return THRESHOLD_OVERRIDE[(dataset, labeling, encoding)]
#     if encoding in THRESHOLD_OVERRIDE:
#         return THRESHOLD_OVERRIDE[encoding]
#     return THRESHOLD_DEFAULT

# def _find_csv_path(encoding_dir: str, encoding_name: str) -> str:
#     expected = os.path.join(encoding_dir, f"{encoding_name}.csv")
#     if os.path.isfile(expected):
#         return expected
#     csvs = [f for f in os.listdir(encoding_dir) if f.lower().endswith(".csv")]
#     if len(csvs) == 1:
#         return os.path.join(encoding_dir, csvs[0])
#     raise FileNotFoundError(f"No CSV found for encoding '{encoding_name}' in {encoding_dir}")

# def _ensure_out_dir(path: str):
#     os.makedirs(path, exist_ok=True)

# def _count_original_features(df: pd.DataFrame, label_col: str = "Label") -> int:
#     feats = df.drop(columns=[label_col], errors="ignore").select_dtypes(include=["number"])
#     return feats.shape[1]

# def _count_postfilter_features(df: pd.DataFrame, label_col: str = "Label") -> int:
#     feats = df.drop(columns=[label_col], errors="ignore").select_dtypes(include=["number"]).copy()
#     if feats.shape[1] > 1:
#         feats = feats.T.drop_duplicates().T
#     if not feats.empty:
#         non_constant = (feats != feats.iloc[0]).any()
#         feats = feats.loc[:, non_constant]
#     return feats.shape[1]

# # --- Resolve config filters ---
# def _resolve_labeling_folder(experiment_name: str) -> str:
#     if experiment_name is None or str(experiment_name).strip().lower() in {"", "all", "*"}:
#         return None  # means: process all labelings under the dataset
#     name = str(experiment_name).strip()
#     return name if name.endswith("_features") else f"{name}_features"

# DATASET_FILTER = None if (EVENT_LOG is None or str(EVENT_LOG).strip().lower() in {"", "all", "*"}) else str(EVENT_LOG).strip()
# LABELING_FILTER = _resolve_labeling_folder(EXP_NAME)

# if not os.path.isdir(IN_ROOT):
#     raise FileNotFoundError(f"Input root not found: {IN_ROOT}")

# # Build dataset list respecting config
# datasets = sorted(os.listdir(IN_ROOT))
# datasets = [d for d in datasets if os.path.isdir(os.path.join(IN_ROOT, d))]
# if DATASET_FILTER:
#     if DATASET_FILTER not in datasets:
#         raise FileNotFoundError(f"Dataset '{DATASET_FILTER}' not found under {IN_ROOT}. Available: {datasets}")
#     datasets = [DATASET_FILTER]

# total_sets = 0
# ok_sets = 0
# skipped_sets = 0
# errors = []
# summary_rows = []

# for dataset in datasets:
#     ds_dir = os.path.join(IN_ROOT, dataset)
#     labelings = sorted([l for l in os.listdir(ds_dir) if os.path.isdir(os.path.join(ds_dir, l))])

#     if LABELING_FILTER:
#         if LABELING_FILTER not in labelings:
#             raise FileNotFoundError(
#                 f"Labeling '{LABELING_FILTER}' not found under {ds_dir}. "
#                 f"Available: {labelings}"
#             )
#         labelings = [LABELING_FILTER]

#     for labeling in labelings:
#         lab_dir = os.path.join(ds_dir, labeling)
#         encodings = sorted([e for e in os.listdir(lab_dir) if os.path.isdir(os.path.join(lab_dir, e))])

#         for encoding in encodings:
#             enc_dir = os.path.join(lab_dir, encoding)
#             total_sets += 1
#             try:
#                 csv_in = _find_csv_path(enc_dir, encoding)
#                 df = pd.read_csv(csv_in)
#                 rows_total = len(df)

#                 if "Label" not in df.columns:
#                     skipped_sets += 1
#                     print(f"[SKIP] {dataset}/{labeling}/{encoding}: no 'Label' column found.")
#                     summary_rows.append({
#                         "Dataset": dataset,
#                         "Labeling": labeling,
#                         "Encoding": encoding,
#                         "threshold": _get_threshold(dataset, labeling, encoding),
#                         "rows_total": rows_total,
#                         "original_features": _count_original_features(df),
#                         "postfilter_features": _count_postfilter_features(df),
#                         "selected_features": 0,
#                         "unattainable_rows": rows_total,
#                         "min_attained": 0,
#                         "status": "SKIP_no_label",
#                         "output_csv": None,
#                     })
#                     continue

#                 thr = _get_threshold(dataset, labeling, encoding)

#                 # Apply Fisher + coverage (value > 0 = coverage; mirrors paper's code path)
#                 selected_df, selected_cols, info = fisher_coverage_select(
#                     df,
#                     label_col="Label",
#                     coverage_threshold=thr,
#                     positive_predicate=None,           # coverage = value > 0
#                     drop_duplicate_columns=True,
#                     drop_constant=True,
#                     return_features_only=False
#                 )

#                 out_dir = os.path.join(OUT_ROOT, dataset, labeling, encoding)
#                 _ensure_out_dir(out_dir)
#                 csv_out = os.path.join(out_dir, os.path.basename(csv_in))
#                 selected_df.to_csv(csv_out, index=False)

#                 ok_sets += 1
#                 print(
#                     f"[OK] {dataset}/{labeling}/{encoding}: "
#                     f"selected {len(selected_cols)} features; "
#                     f"min_attained={info.get('attained_min', 'NA')}/{info.get('coverage_threshold', thr)}; "
#                     f"unattainable_rows={len(info.get('unattainable_rows', []))} -> {csv_out}"
#                 )

#                 summary_rows.append({
#                     "Dataset": dataset,
#                     "Labeling": labeling,
#                     "Encoding": encoding,
#                     "threshold": info.get("coverage_threshold", thr),
#                     "rows_total": rows_total,
#                     "original_features": _count_original_features(df),
#                     "postfilter_features": _count_postfilter_features(df),
#                     "selected_features": len(selected_cols),
#                     "unattainable_rows": len(info.get("unattainable_rows", [])),
#                     "min_attained": info.get("attained_min", None),
#                     "status": "OK",
#                     "output_csv": csv_out,
#                 })

#             except Exception as e:
#                 errors.append((dataset, labeling, encoding, str(e)))
#                 print(f"[ERR] {dataset}/{labeling}/{encoding}: {e}")
#                 summary_rows.append({
#                     "Dataset": dataset,
#                     "Labeling": labeling,
#                     "Encoding": encoding,
#                     "threshold": _get_threshold(dataset, labeling, encoding),
#                     "rows_total": None,
#                     "original_features": None,
#                     "postfilter_features": None,
#                     "selected_features": None,
#                     "unattainable_rows": None,
#                     "min_attained": None,
#                     "status": f"ERR: {e}",
#                     "output_csv": None,
#                 })

# # Build and show summary DataFrame
# summary_df = pd.DataFrame(summary_rows)
# if not summary_df.empty:
#     summary_df = summary_df.sort_values(["Dataset", "Labeling", "Encoding"]).reset_index(drop=True)

# print("\n=== Summary ===")
# print(f"Total sets found:   {total_sets}")
# print(f"Processed (OK):     {ok_sets}")
# print(f"Skipped (no Label): {skipped_sets}")
# print(f"Errors:             {len(errors)}")

# display_cols = [
#     "Dataset", "Labeling", "Encoding",
#     "threshold", "rows_total",
#     "original_features", "postfilter_features", "selected_features",
#     "unattainable_rows", "min_attained"
# ]
# if not summary_df.empty:
#     print("\nFeature selection summary (key fields):")
#     print(summary_df[display_cols].to_string(index=False))

# summary_out = os.path.join(OUT_ROOT, "feature_selection_summary.csv")
# summary_df.to_csv(summary_out, index=False)
# print(f"\nSaved full summary to: {summary_out}")
