In [1]:
from tqdm import TqdmWarning
import warnings
warnings.filterwarnings("ignore", category=TqdmWarning)
from tqdm.auto import tqdm  # will not emit the IProgress warning now

In [2]:
from utils import base_configs, deps, tr_va_te_split
from utils.helpers import rw_csv_helpers

In [3]:
import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Load the project's base configuration object from utils.base_configs.
CONFIGS = base_configs.get_base_configs()
# NOTE(review): this comment used to read "path to your global SHAP file",
# but no path is assigned in this cell — looks stale; confirm and remove.

# Switch the following value as needed. Presumably RUN_TS is the timestamp
# (YYYYMMDD_HHMMSS) identifying which run to use — TODO confirm.
CONFIGS["RUN_TS"] = "20251202_200541"

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

# =========================================================
# Configuration
# =========================================================
# CSV holding one row per model with its test metrics (adjust if needed).
INPUT_CSV = "op/all_test_report.csv"

# Metrics that feed the composite strength, mapped to their orientation:
#   True  -> a larger value is better, used as-is
#   False -> a smaller value is better, negated before standardization
# Insertion order determines the order of the derived x_/z_ columns.
METRICS_HIGHER_BETTER = dict(
    auc_roc=True,
    auc_pr=True,
    brier=False,  # the only lower-is-better metric
    acc=True,
    macro_avg_f1=True,
    mcc=True,
)


# =========================================================
# Helpers
# =========================================================
def add_suffix(path: str | Path, suffix: str) -> Path:
    """
    Append a suffix before the file extension.

    Example:
        all_test_report.csv + "_step1" -> all_test_report_step1.csv
    """
    p = Path(path)
    return p.with_name(f"{p.stem}{suffix}{p.suffix}")


def compute_x(df: pd.DataFrame,
              metrics_higher_better: dict) -> pd.DataFrame:
    """
    Step 1: build the oriented metrics x_{j,k} so that larger always means better.

    Each metric listed in ``metrics_higher_better`` is read from ``df`` and
    cast to float. Metrics flagged truthy (higher is better) are kept as-is;
    the others (lower is better) are negated.

    Returns a DataFrame indexed like ``df`` with one "x_<metric>" column per
    metric, in the mapping's iteration order.
    """
    def oriented(metric, higher_is_better):
        # Flip the sign of lower-is-better metrics; leave the rest untouched.
        values = df[metric].astype(float)
        return values if higher_is_better else -values

    columns = {
        f"x_{metric}": oriented(metric, flag)
        for metric, flag in metrics_higher_better.items()
    }
    return pd.DataFrame(columns, index=df.index)


def compute_z_from_x(
    x_df: pd.DataFrame,
    rel_tol: float = 1e-12,
) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
    """
    Step 2: compute z_{j,k} from x_{j,k}.

    For each x-column:
        mean mu, std sigma (population, ddof=0)
        z = (x - mu) / sigma

    A column is treated as constant, and its z-scores are set to 0, when
    sigma <= rel_tol * max(|x|). An exact ``sigma == 0`` test is not enough:
    for a constant column whose value is not exactly representable
    (e.g. 0.1, 0.1, 0.1) floating-point round-off in the mean can leave a
    sigma around 1e-17, and dividing by it turns pure noise into z = +/-1.

    Parameters:
        x_df:    oriented metrics, one column per metric (typically "x_<metric>").
        rel_tol: relative threshold below which sigma counts as zero.

    Returns:
        z_df:  DataFrame indexed like ``x_df`` with columns "z_<metric>"
               (metric name after the "x_" prefix).
        means: Series of per-column means, indexed by the x-column names.
        stds:  Series of per-column population stds as computed (not
               clamped), indexed by the x-column names.
    """
    z_cols = {}
    means = {}
    stds = {}
    for col in x_df.columns:
        x = x_df[col].astype(float)
        mu = x.mean()
        sigma = x.std(ddof=0)
        means[col] = mu
        stds[col] = sigma

        # Scale-aware zero test. If sigma is NaN (empty or all-NaN column)
        # the comparison is False and the division below propagates NaN.
        if sigma <= rel_tol * x.abs().max():
            z = pd.Series(0.0, index=x.index)
        else:
            z = (x - mu) / sigma

        metric_name = col.removeprefix("x_")
        z_cols[f"z_{metric_name}"] = z

    z_df = pd.DataFrame(z_cols, index=x_df.index)
    return z_df, pd.Series(means), pd.Series(stds)


def compute_strength(z_df: pd.DataFrame) -> tuple[pd.Series, pd.Series]:
    """
    Step 3: composite strength S_j, the row-wise mean of z_{j,k} over metrics.
    Step 4: rescaled strength S*_j in [0,1] via min-max scaling across models.

    When the maximum is not strictly greater than the minimum (e.g. every
    model has the same composite strength), every S*_j is set to 1.0.

    Returns the pair (S, S_star), both indexed like ``z_df``.
    """
    composite = z_df.mean(axis=1)
    lowest, highest = composite.min(), composite.max()

    # Degenerate range: min-max scaling would divide by zero.
    if not highest > lowest:
        return composite, pd.Series(1.0, index=composite.index)

    rescaled = (composite - lowest) / (highest - lowest)
    return composite, rescaled


# =========================================================
# Main pipeline
# =========================================================
# Runs the four-step strength computation on INPUT_CSV. After each step the
# cumulative table (original columns plus everything derived so far) is
# printed and written next to the input file as <stem>_stepN<ext>.
if __name__ == "__main__":
    base_path = Path(INPUT_CSV)

    # ---------------------------------------------
    # Read and clean original file
    # ---------------------------------------------
    df0 = pd.read_csv(base_path)

    # Remove columns "threshold" and "logloss"
    # (errors="ignore": no error is raised if either column is absent)
    df0 = df0.drop(columns=["threshold", "logloss"], errors="ignore")

    print("=== Original data after removing 'threshold' and 'logloss' ===")
    print(df0)

    # ---------------------------------------------
    # STEP 1: x_{j,k} (oriented metrics)
    # ---------------------------------------------
    x_df = compute_x(df0, METRICS_HIGHER_BETTER)
    # x_df is built on df0's index, so the column-wise concat lines up row by row.
    step1_df = pd.concat([df0, x_df], axis=1)

    step1_path = add_suffix(base_path, "_step1")
    # index=False: the row index is not written to the CSV.
    step1_df.to_csv(step1_path, index=False)

    print("\n=== STEP 1: oriented metrics x_{j,k} (saved to", step1_path, ") ===")
    print(step1_df)

    # ---------------------------------------------
    # STEP 2: z_{j,k} (standardized metrics)
    # ---------------------------------------------
    # means / stds are the per-column statistics used for standardization;
    # they are not used further within this block.
    z_df, means, stds = compute_z_from_x(x_df)
    step2_df = pd.concat([step1_df, z_df], axis=1)

    step2_path = add_suffix(base_path, "_step2")
    step2_df.to_csv(step2_path, index=False)

    print("\n=== STEP 2: standardized metrics z_{j,k} (saved to", step2_path, ") ===")
    print(step2_df)

    # ---------------------------------------------
    # STEP 3: composite strength S_j
    # ---------------------------------------------
    # compute_strength returns both S (step 3) and S_star (step 4) in one call;
    # only S is added to the step-3 table.
    S, S_star = compute_strength(z_df)
    # Copy so that adding the new column leaves step2_df untouched.
    step3_df = step2_df.copy()
    step3_df["S_j"] = S  # composite strength

    step3_path = add_suffix(base_path, "_step3")
    step3_df.to_csv(step3_path, index=False)

    print("\n=== STEP 3: composite strength S_j (saved to", step3_path, ") ===")
    print(step3_df)

    # ---------------------------------------------
    # STEP 4: rescaled strength S*_j in [0,1]
    # ---------------------------------------------
    # Copy so that adding the new column leaves step3_df untouched.
    step4_df = step3_df.copy()
    step4_df["S_j_rescaled_01"] = S_star

    step4_path = add_suffix(base_path, "_step4")
    step4_df.to_csv(step4_path, index=False)

    print("\n=== STEP 4: rescaled strength S*_j in [0,1] (saved to", step4_path, ") ===")
    print(step4_df)


=== Original data after removing 'threshold' and 'logloss' ===
    model   auc_roc    auc_pr     brier       acc  macro_avg_f1       mcc
0      lr  0.798861  0.280234  0.073360  0.785200      0.623193  0.329669
1      rf  0.767818  0.232818  0.076559  0.661871      0.537879  0.254194
2     xgb  0.765824  0.251859  0.075583  0.791367      0.605361  0.266011
3   xlstm  0.736416  0.214320  0.079404  0.793422      0.598450  0.243894
4  tabnet  0.796766  0.287958  0.073639  0.815005      0.628947  0.302363
5     mlp  0.760872  0.235884  0.076605  0.785200      0.610147  0.288732

=== STEP 1: oriented metrics x_{j,k} (saved to op/all_test_report_step1.csv ) ===
    model   auc_roc    auc_pr     brier       acc  macro_avg_f1       mcc  \
0      lr  0.798861  0.280234  0.073360  0.785200      0.623193  0.329669   
1      rf  0.767818  0.232818  0.076559  0.661871      0.537879  0.254194   
2     xgb  0.765824  0.251859  0.075583  0.791367      0.605361  0.266011   
3   xlstm  0.736416  0.21432