In [None]:
# ==============================================================
# DataCo: DL/XGB experts + Sequential PPO with drift-aware episodes
# Goal: PPO adds value by maximizing late-class recall/F1 beyond static models
# ==============================================================

import os, gc, time, warnings, random
warnings.filterwarnings("ignore")

# --- mac stability
os.environ["OMP_NUM_THREADS"]="4"; os.environ["OPENBLAS_NUM_THREADS"]="4"
os.environ["MKL_NUM_THREADS"]="4"; os.environ["VECLIB_MAXIMUM_THREADS"]="4"
os.environ["NUMEXPR_NUM_THREADS"]="4"

import numpy as np; np.random.seed(42)
import pandas as pd; random.seed(42)

# ============== TensorFlow (DL heads) ==============
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow.keras import mixed_precision
# Mixed precision and GPU memory growth are best-effort optimisations: a failure
# must not abort the run, but it is reported instead of being silently swallowed.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate.
try:
    mixed_precision.set_global_policy("mixed_float16")
except Exception as exc:
    print(f"[WARN] mixed_float16 policy not enabled: {exc}")
try:
    for g in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(g, True)
except Exception as exc:
    print(f"[WARN] GPU memory growth not enabled: {exc}")

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Bidirectional, Flatten, Input
from tensorflow.keras.optimizers import Adam

# ============== PyTorch / SB3 (PPO) ==============
import torch
# Use the "mps" device for PPO when PyTorch reports it available, otherwise CPU.
rl_device = "mps" if torch.backends.mps.is_available() else "cpu"
print("[INFO] RL device:", rl_device)

# Sklearn & metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)

from xgboost import XGBClassifier

# Gym/SB3
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, VecMonitor

# ================= A) Load / cache parquet =================
# >>>> CHANGE THIS PATH <<<<
# Portable loader for DataCoSupplyChainDataset.csv
from pathlib import Path
import os
import pandas as pd

FILENAME = "DataCoSupplyChainDataset.csv"

def resolve_data_path(filename=FILENAME):
    """
    Locate the dataset CSV and return its path.

    Search order:
      1. ``$SCAI_DATA/<filename>`` when the SCAI_DATA environment variable is
         set and the file exists there.
      2. ``<cwd>``, ``<cwd>/data``, ``<cwd>/Data``.
      3. The same three locations under the parent directory.
      4. ``data`` / ``Data`` under the grandparent directory, when one exists.

    Args:
        filename: Name of the CSV file to look for.

    Returns:
        pathlib.Path of the first existing candidate.

    Raises:
        FileNotFoundError: if no candidate exists; the message lists every
            path that was tried.
    """
    env_dir = os.getenv("SCAI_DATA")
    if env_dir:
        p = Path(env_dir).expanduser() / filename
        if p.exists():
            return p

    cwd = Path.cwd()
    # (directory, also look for the file directly inside it?)
    roots = [(cwd, True), (cwd.parent, True)]
    # A shallow working directory ("/" or "/x") has no grandparent; indexing
    # cwd.parents[1] unconditionally would raise IndexError before the
    # intended FileNotFoundError could be reported.
    if len(cwd.parents) > 1:
        roots.append((cwd.parents[1], False))

    candidates = []
    for root, include_root in roots:
        if include_root:
            candidates.append(root / filename)
        candidates.append(root / "data" / filename)
        candidates.append(root / "Data" / filename)

    for p in candidates:
        if p.exists():
            return p

    tried = "\n".join(str(p) for p in candidates)
    raise FileNotFoundError(
        f"Could not find {filename}. Tried:\n{tried}\nCWD={cwd}\n"
        "Tip: place the file under your repo 'data/' or set SCAI_DATA to its folder."
    )

FILE_PATH = resolve_data_path()
print(f"[info] Using data at: {FILE_PATH}")

# The code below used `file_path`, which was never defined in this cell
# (NameError). Derive it from the resolved path instead.
file_path = str(FILE_PATH)
# Parquet cache lives next to the CSV.
parquet_path = os.path.splitext(file_path)[0] + ".parquet"

t0_all = time.time()
if not os.path.exists(parquet_path):
    print("[INFO] Converting CSV -> Parquet...")
    # Chunked read with latin-1 decoding; malformed rows are skipped.
    chunks = []
    for ch in pd.read_csv(file_path, engine="python", encoding="latin-1",
                          on_bad_lines="skip", chunksize=20000):
        chunks.append(ch)
    pd.concat(chunks, ignore_index=True).to_parquet(parquet_path, index=False)
    del chunks; gc.collect()

# Single load from the cache. The previous extra `pd.read_csv(FILE_PATH)` parsed
# the whole CSV a second time with default decoding and its result was
# immediately overwritten by this frame.
df = pd.read_parquet(parquet_path)
print(df.shape, "rows x cols")
print("[INFO] Loaded:", df.shape)

# ================= B) Column mapping =================
def first_col(d, names):
    """Return the first entry of `names` that is a column of `d`, or None if none match."""
    present = (candidate for candidate in names if candidate in d.columns)
    return next(present, None)

# Resolve each logical field to whichever of its known column spellings exists.
col_real  = first_col(df, ["Days for shipping (real)", "Days for shipping (real)_", "TPT"])
col_sched = first_col(df, ["Days for shipment (scheduled)", "Days for shipment (scheduled)_"])
col_y     = first_col(df, ["Late_delivery_risk", "Is_Late"])
col_qty   = first_col(df, ["Order Item Quantity", "Unit quantity", "Quantity"])
col_org   = first_col(df, ["Order City", "Order Region", "Order Country"])
col_dst   = first_col(df, ["Customer City", "Customer Region", "Customer Country"])
col_car   = first_col(df, ["Shipping Mode", "Carrier", "Shipment Mode", "Ship Mode"])
col_date  = first_col(df, ["Order Date (DateOrders)", "Order Date", "DateOrders", "Date"])
col_val   = first_col(df, ["Order Item Total", "Sales"])

# target if missing: late means real shipping days exceed scheduled days
if col_y is None:
    if (col_real is None) or (col_sched is None):
        raise ValueError("Need Late_delivery_risk or both real/scheduled days.")
    df["__y__"] = (pd.to_numeric(df[col_real], errors="coerce") >
                   pd.to_numeric(df[col_sched], errors="coerce")).astype(int)
    col_y = "__y__"

# Quantity and both shipping-day columns are used unconditionally below; without
# this check a missing one surfaces as an opaque `KeyError: None`.
_required = {"quantity": col_qty, "real shipping days": col_real,
             "scheduled shipping days": col_sched}
_missing = [label for label, c in _required.items() if c is None]
if _missing:
    raise ValueError("Missing required column(s): " + ", ".join(_missing))

# Keep only the resolved columns (optional ones may be None and are dropped).
keep = [c for c in [col_qty, col_real, col_sched, col_y, col_org, col_dst, col_car, col_date, col_val] if c]
df = df[keep].copy()

# numerics: unparsable values become NaN and those rows are removed
for c in [col_qty, col_real, col_sched]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=[col_y, col_real, col_sched, col_qty])

y_all = df[col_y].astype(int).to_numpy()

# ================= C) Base features (compact) =================
# Numeric features.
# NOTE(review): tpt_real and lead_dev come from the realised shipping time, and
# the fallback target above is defined as real > scheduled -- this looks like
# target leakage into the experts; confirm it is intended.
num_df = pd.DataFrame(index=df.index)
num_df["qty"]         = df[col_qty]
num_df["tpt_real"]    = df[col_real]
num_df["tpt_sched"]   = df[col_sched]
num_df["lead_dev"]    = num_df["tpt_real"] - num_df["tpt_sched"]
# Floor quantity at 1 for both derived features. The previous
# `replace(0, nan)` followed by clip left NaN for zero quantities (clip keeps
# NaN), and that NaN then flowed into the scaled inputs of the DL models.
qty_floor = num_df["qty"].clip(lower=1)
num_df["tpt_per_unit"]= num_df["tpt_real"] / qty_floor
num_df["log_qty"]     = np.log1p(qty_floor)

# Categorical keys (origin, destination, carrier); constant "NA" when unmapped.
cat_df = pd.DataFrame(index=df.index)
cat_df["org"] = df[col_org].astype(str) if col_org else "NA"
cat_df["dst"] = df[col_dst].astype(str) if col_dst else "NA"
cat_df["car"] = df[col_car].astype(str) if col_car else "NA"

# ================= D) Split then TME (no leakage) =================
X_num_all = num_df.to_numpy(dtype="float32")
X_cat_all = cat_df.copy()

# Stratified 80/20 split; X_cat_* keep the original df index, X_num_*/y_* are
# positional arrays in the same row order.
X_cat_tr, X_cat_te, X_num_tr, X_num_te, y_tr, y_te = train_test_split(
    X_cat_all, X_num_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

def kfold_tme(X_cat: pd.DataFrame, y: np.ndarray, n_splits=5, alpha=10.0, seed=42):
    """
    Smoothed target-mean encoding with out-of-fold values for the fitted rows.

    For every column, each row is encoded from category statistics computed on
    the other folds; categories unseen in those folds get the global mean.
    A second table fitted on all rows is returned for encoding new data.

    Returns:
        (encoded, maps): `encoded` is a float32 DataFrame aligned to
        `X_cat.index`; `maps[col]` is `[category -> smoothed mean dict, global mean]`.
    """
    prior = float(y.mean())

    def _smoothed_means(keys, targets):
        # (count * mean + alpha * prior) / (count + alpha) per category
        agg = pd.DataFrame({"k": keys, "y": targets}).groupby("k")["y"].agg(["count", "mean"])
        return (agg["count"] * agg["mean"] + alpha * prior) / (agg["count"] + alpha)

    folder = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    encoded = pd.DataFrame(index=X_cat.index, dtype="float32")
    fitted = {name: [] for name in X_cat.columns}
    for name in X_cat.columns:
        pos = X_cat.columns.get_loc(name)
        oof = pd.Series(index=X_cat.index, dtype="float32")
        for fit_rows, held_rows in folder.split(np.zeros(len(y)), y):
            table = _smoothed_means(X_cat.iloc[fit_rows, pos].values, y[fit_rows])
            held_keys = X_cat.iloc[held_rows, pos]
            oof.iloc[held_rows] = held_keys.map(table).fillna(prior).astype("float32")
        encoded[name] = oof.values
        full_table = _smoothed_means(X_cat[name].values, y)
        fitted[name] = [full_table.to_dict(), float(prior)]
    return encoded.astype("float32"), fitted

def apply_tme(X_cat: pd.DataFrame, maps):
    """
    Encode `X_cat` with fitted target-mean maps.

    `maps[col]` is a `(lookup, fallback)` pair: categories found in `lookup`
    take its value, all others take `fallback`. Returns a float32 DataFrame
    with the same index and column order as `X_cat`.
    """
    encoded = {}
    for name in X_cat.columns:
        lookup, fallback = maps[name]
        mapped = X_cat[name].map(lookup)
        encoded[name] = mapped.fillna(fallback).astype("float32")
    return pd.DataFrame(encoded, index=X_cat.index)

# Fit the target-mean encoders on the training split only, then apply the
# full-train maps to the test split.
t0 = time.time()
Xtr_cat_tme, encoders = kfold_tme(X_cat_tr, y_tr, n_splits=5, alpha=10.0)
Xte_cat_tme = apply_tme(X_cat_te, encoders)
print(f"[INFO] TME: {time.time()-t0:.2f}s")

# Tabular matrix = numeric features followed by the encoded categoricals.
Xtr_tab = np.hstack([X_num_tr, Xtr_cat_tme.to_numpy()])
Xte_tab = np.hstack([X_num_te, Xte_cat_tme.to_numpy()])

# Standardise with statistics from the training split.
scaler_tab = StandardScaler()
Xtr_tab_s = scaler_tab.fit_transform(Xtr_tab).astype("float32")
Xte_tab_s = scaler_tab.transform(Xte_tab).astype("float32")

# Add a trailing channel axis: (rows, features) -> (rows, features, 1) for Conv1D/LSTM.
Xtr_3d = Xtr_tab_s.reshape(-1, Xtr_tab_s.shape[1], 1)
Xte_3d = Xte_tab_s.reshape(-1, Xte_tab_s.shape[1], 1)

def ds(X, y, batch=1024, shuffle=True):
    """Build a batched, prefetching tf.data pipeline from (X, y); labels are cast to int32."""
    pipeline = tf.data.Dataset.from_tensor_slices((X, y.astype(np.int32)))
    if shuffle:
        # Fixed seed, reshuffled at the start of every epoch.
        pipeline = pipeline.shuffle(65536, seed=42, reshuffle_each_iteration=True)
    pipeline = pipeline.batch(batch)
    return pipeline.prefetch(tf.data.AUTOTUNE)

# The test pipeline is not shuffled, so predictions stay aligned with y_te.
ds_tr = ds(Xtr_3d, y_tr)
ds_te = ds(Xte_3d, y_te, shuffle=False)

# ================= E) Metrics helpers =================
def metric_dict(y_true, y_hat_or_proba):
    """
    Accuracy and weighted precision/recall/F1 as plain floats.

    Input that contains only 0/1 values is used as labels directly; anything
    else is treated as scores and thresholded at 0.5.
    """
    flat = np.asarray(y_hat_or_proba).reshape(-1)
    if set(np.unique(flat)) <= {0, 1}:
        labels = flat.astype(int)
    else:
        labels = (flat >= 0.5).astype(int)
    weighted = dict(average="weighted", zero_division=1)
    return {
        "Accuracy": float(accuracy_score(y_true, labels)),
        "Precision": float(precision_score(y_true, labels, **weighted)),
        "Recall": float(recall_score(y_true, labels, **weighted)),
        "F1": float(f1_score(y_true, labels, **weighted)),
    }

# ================= F) Experts =================
# Class weight = negatives / positives in the training split (guarded against a
# zero positive rate).
pos_ratio = float(np.mean(y_tr))
scale_pos_weight = float((1 - pos_ratio) / max(pos_ratio, 1e-6))

# Gradient-boosted expert, trained on the unscaled tabular matrix.
xgb = XGBClassifier(
    n_estimators=600, learning_rate=0.06, max_depth=6,
    subsample=0.9, colsample_bytree=0.9,
    tree_method="hist", eval_metric="logloss",
    random_state=42, n_jobs=4,
    scale_pos_weight=scale_pos_weight
)
# proba_xgb holds the positive-class probability for every test row.
xgb.fit(Xtr_tab, y_tr); proba_xgb = xgb.predict_proba(Xte_tab)[:,1]

def build_dl(kind, d):
    """
    Build and compile one of the deep-learning experts.

    Args:
        kind: One of "CNN", "LSTM", "Stacked LSTM", "Bi-LSTM".
        d: Number of input features; the model takes input of shape (d, 1).

    Returns:
        A compiled Keras Sequential model with a single sigmoid output
        (kept in float32) and binary cross-entropy loss.

    Raises:
        ValueError: if `kind` is not a known architecture. Previously an
            unknown kind silently produced a model consisting of the output
            layer only.
    """
    # Factories, so that layers are only instantiated for the requested kind.
    hidden = {
        "CNN": lambda: [Conv1D(64, 3, activation="relu"), Flatten()],
        "LSTM": lambda: [LSTM(64)],
        "Stacked LSTM": lambda: [LSTM(64, return_sequences=True), LSTM(32)],
        "Bi-LSTM": lambda: [Bidirectional(LSTM(64))],
    }
    if kind not in hidden:
        raise ValueError(f"Unknown model kind {kind!r}; expected one of {sorted(hidden)}")

    m = Sequential([Input(shape=(d,1))])
    for layer in hidden[kind]():
        m.add(layer)
    m.add(Dense(1, activation="sigmoid", dtype="float32"))
    m.compile(optimizer=Adam(1e-3), loss="binary_crossentropy")
    return m

# Train each DL expert for a fixed number of epochs and keep only its
# test-set probabilities (flattened to 1-D); the model itself is discarded.
DL_EPOCHS=6
dl_outs={}
for kind in ["CNN","LSTM","Bi-LSTM","Stacked LSTM"]:
    mdl=build_dl(kind, Xtr_3d.shape[1])
    mdl.fit(ds_tr, epochs=DL_EPOCHS, verbose=0)
    dl_outs[kind]=mdl.predict(ds_te, verbose=0).reshape(-1)
    del mdl; gc.collect()

# ============== Build stacked logits for PPO ==============
def to_logit(p, eps=1e-6):
    """Map probabilities to log-odds, clipping to [eps, 1 - eps] first so 0 and 1 stay finite."""
    clipped = np.clip(p, eps, 1-eps)
    log_p = np.log(clipped)
    log_not_p = np.log(1-clipped)
    return log_p - log_not_p

# One row per test item, one column per expert, in the order
# XGB, CNN, LSTM, Bi-LSTM, Stacked LSTM.
stack_logits = np.vstack([
    to_logit(proba_xgb),
    to_logit(dl_outs["CNN"]),
    to_logit(dl_outs["LSTM"]),
    to_logit(dl_outs["Bi-LSTM"]),
    to_logit(dl_outs["Stacked LSTM"])
]).T.astype("float32")

# NOTE: the scaler is fitted on these test-set logits themselves.
scaler_stack = StandardScaler()
stack5 = scaler_stack.fit_transform(stack_logits).astype("float32")

# ============== Build episodes with temporal drift ==============
# Timestamps for the test rows; unparsable dates become NaT.
idx_te = X_cat_te.index
if col_date and col_date in df.columns:
    tseries = pd.to_datetime(df.loc[idx_te, col_date], errors="coerce")
else:
    # Synthetic hourly timeline when no date column exists. An explicit
    # Timedelta is used because the "H" alias is deprecated since pandas 2.2.
    tseries = pd.Series(
        pd.date_range("2000-01-01", periods=len(idx_te), freq=pd.Timedelta(hours=1)),
        index=idx_te,
    )

def safe_series(colname):
    """Return df[colname] for the test rows as strings, or a constant "NA" series if unmapped."""
    if colname and colname in df.columns:
        return df.loc[idx_te, colname].astype(str)
    return pd.Series("NA", index=idx_te)

# Episode keys: origin|destination|carrier and calendar month of the order.
route = safe_series(col_org) + "|" + safe_series(col_dst) + "|" + safe_series(col_car)
month = tseries.dt.to_period("M").astype(str)

# sort by time, then group by (route, month) to surface drift
# Positional permutation that orders the test rows by timestamp; every
# per-row array below is re-indexed with it.
ord_idx = np.argsort(tseries.values)
y_seq  = y_te[ord_idx]
stack5 = stack5[ord_idx]   # NOTE: stack5 is now time-sorted; stack_logits is not
route  = route.iloc[ord_idx].values
month  = month.iloc[ord_idx].values

# engineered sequential signals for PPO
num_te_sorted = num_df.loc[idx_te].iloc[ord_idx]
# slack = scheduled - real days (negative when shipping took longer than scheduled).
# NOTE(review): when the target was derived as real > scheduled, slack < 0 is
# exactly the label -- confirm exposing it to the agent is intended.
slack = (num_te_sorted["tpt_sched"].values - num_te_sorted["tpt_real"].values).astype("float32")

# per-route rolling late rate (window 10 within sorted order)
# Per-route late rate over the previous (up to) 10 outcomes on that route, in
# time-sorted order. The rate is computed BEFORE the current outcome is
# recorded: the earlier version appended first, so the feature shown to the
# agent at step i already contained the label it was asked to predict (and the
# "no history -> 0.0" branch could never run).
late_rolling = np.zeros_like(y_seq, dtype="float32")
last_by_route = {}
for i,(r,yv) in enumerate(zip(route, y_seq)):
    arr = last_by_route.setdefault(r, [])
    wnd = arr[-10:]
    late_rolling[i] = np.mean(wnd) if wnd else 0.0   # first visit to a route -> 0.0
    arr.append(int(yv))

# EWMA of XGB logit as risk proxy
# Exponentially weighted mean (alpha=0.2) of the scaled XGB score, taken along
# the time-sorted sequence.
xgb_sig = stack5[:,0]
ewma = pd.Series(xgb_sig).ewm(alpha=0.2, adjust=False).mean().values.astype("float32")

# cyclical time features: day of week and hour of day, in time-sorted order
dow  = pd.to_datetime(tseries.iloc[ord_idx]).dt.dayofweek.values
hour = pd.to_datetime(tseries.iloc[ord_idx]).dt.hour.values
def cyc(a, K):
    """Encode values `a` with period `K` as float32 (sin, cos) pairs, one row per value."""
    angle = 2*np.pi*a/K
    pair = (np.sin(angle), np.cos(angle))
    return np.stack(pair, axis=1).astype("float32")
dow2 = cyc(dow, 7); hour2 = cyc(hour, 24)

# value for FN scaling: order value of each test row (missing -> 0), time-sorted;
# all ones when no value column was mapped
if col_val and col_val in df.columns:
    value = df.loc[idx_te, col_val].fillna(0).values.astype("float32")[ord_idx]
else:
    value = np.ones_like(y_seq, dtype="float32")
# Normalise by the 90th percentile and clamp to [0.5, 3.0].
v90 = np.percentile(value, 90) if value.size else 1.0
value = (value / (v90 + 1e-6)).clip(0.5, 3.0).astype("float32")

# episodes: contiguous blocks of same (route, month), min length
MIN_LEN = 6

def same_pair(i, j):
    """True when rows i and j share both route and month."""
    return (route[i] == route[j]) and (month[i] == month[j])

# NOTE(review): rows are ordered by timestamp only, so a run lasts only while
# consecutive orders share a route -- verify that the whole-sequence fallback
# below is not what usually ends up being used.
n_rows = len(y_seq)
# End position of every maximal run of identical (route, month).
run_ends = [k for k in range(1, n_rows) if not same_pair(k - 1, k)]
if n_rows:
    run_ends.append(n_rows)

episodes = []
start = 0
for stop in run_ends:
    if stop - start >= MIN_LEN:      # runs shorter than MIN_LEN are dropped
        episodes.append(slice(start, stop))
    start = stop
if not episodes:
    # No run was long enough: treat the whole sequence as one episode.
    episodes = [slice(0, n_rows)]

# ============== PPO Env ==============
class SeqEnv(gym.Env):
    """
    Sequential late/on-time decision environment over pre-computed signals.

    Each episode walks through one slice of `episodes`; slices are served
    round-robin in list order. At every step the agent sees the expert scores
    of the current row plus context features and answers 0 (on time) or
    1 (late).

    Args:
        X: 2-D array of per-row expert scores.
        y: Per-row binary labels (1 = late).
        episodes: List of `slice` objects into the row arrays.
        slack, late_roll, ewma: Per-row scalar signals.
        dow2, hour2: Per-row (sin, cos) pairs.
        value: Per-row multiplier for the false-negative penalty.
        K: Number of previous actions kept in the observation.
        init_pen, final_pen, anneal_steps: Per-step penalty, linearly annealed
            from `init_pen` to `final_pen` over `anneal_steps` env steps.
    """
    metadata={"render_modes":[]}
    def __init__(self, X, y, episodes, slack, late_roll, ewma, dow2, hour2, value,
                 K=5, init_pen=0.002, final_pen=0.02, anneal_steps=80_000):
        super().__init__()
        self.X, self.y = X, y.astype(int)
        self.episodes = episodes
        self.slack = slack; self.late_roll=late_roll; self.ewma=ewma
        self.dow2=dow2; self.hour2=hour2; self.value=value
        self.K=K; self.init_pen=init_pen; self.final_pen=final_pen; self.anneal_steps=anneal_steps
        # obs: scores + 2(pos) + K(hist) + 2(FP/FN) + 1(slack) + 1(roll) + 1(ewma) + 2(dow) + 2(hour)
        # The score width comes from X instead of a hard-coded 5, so the declared
        # space always matches what _obs() concatenates.
        n_scores = int(np.shape(X)[1])
        self.obs_dim = n_scores + 2 + K + 2 + 1 + 1 + 1 + 2 + 2
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)
        # _ep is advanced before use, so the first reset serves episodes[0].
        self._ep=-1; self.global_steps=0

    def _tfeat(self, t, T):
        """(sin, cos) of the relative position of step t in an episode of length T."""
        pos = t/max(T-1,1)
        return np.array([np.sin(2*np.pi*pos), np.cos(2*np.pi*pos)], dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Advance to the next episode (wrapping around) and clear per-episode state."""
        super().reset(seed=seed)
        self._ep = (self._ep+1) % len(self.episodes)
        sl = self.episodes[self._ep]
        self.idx = np.arange(sl.start, sl.stop, dtype=int)
        self.t=0; self.last=np.zeros(self.K,dtype=np.float32)
        self.fp=0.0; self.fn=0.0; self.prev_fn=0.0
        self.tp=0.0; self.ppo_preds=[]
        return self._obs(), {}

    def _obs(self):
        """Observation for the current step: scores, position, action history, FP/FN counts, signals."""
        T=len(self.idx); i=self.idx[self.t]
        return np.concatenate([
            self.X[i], self._tfeat(self.t,T), self.last,
            np.array([self.fp,self.fn],dtype=np.float32),
            np.array([self.slack[i], self.late_roll[i], self.ewma[i]], dtype=np.float32),
            self.dow2[i], self.hour2[i]
        ]).astype(np.float32)

    def step(self, a):
        """Score action `a` for the current row and move to the next one.

        Returns the gymnasium 5-tuple (obs, reward, terminated, truncated, info);
        the observation is all zeros once the episode has ended.
        """
        i=self.idx[self.t]; yv=self.y[i]
        # TP/TN/FP/FN rewards with heavy FN penalty scaled by value & duration
        dur = 1.0 + np.clip(float(self.slack[i] < 0) * (-self.slack[i]), 0, 10.0)  # if real>sched, duration magnitude
        if   (a==1 and yv==1): r = 3.0; self.tp += 1.0
        elif (a==0 and yv==0): r = 1.0
        elif (a==1 and yv==0): r = -2.0
        else:                  r = -10.0 * self.value[i] * (1.0 + 0.1*dur)  # **very strong**
        # curriculum step penalty (max() guards anneal_steps=0)
        pen = self.init_pen + (self.final_pen - self.init_pen) * min(1.0, self.global_steps/max(self.anneal_steps, 1))
        r -= pen
        # shaping: reward FN-rate improvement
        self.fn += float(yv==1 and a==0); self.fp += float(yv==0 and a==1)
        steps=float(self.t+1); fn_rate=self.fn/max(steps,1.0)
        if fn_rate < self.prev_fn: r += 0.2
        self.prev_fn = fn_rate
        # discourage over-streak of "late" (checked against the actions before this one)
        if self.t>=3 and np.allclose(self.last[-3:], 1.0): r -= 0.05

        # push the current action into the K-long history
        self.last = np.roll(self.last, -1); self.last[-1]=float(a)
        self.ppo_preds.append(a)
        self.t += 1
        done = self.t >= len(self.idx)

        # terminal bonuses: emphasize late-class Recall/F1
        if done:
            y_ep = self.y[self.idx]; a_ep = np.array(self.ppo_preds, dtype=int)
            # per-episode confusion counts for the late class
            tp = np.sum((a_ep==1) & (y_ep==1)); fn = np.sum((a_ep==0) & (y_ep==1))
            fp = np.sum((a_ep==1) & (y_ep==0))
            rec = tp / max(tp+fn, 1)
            prec= tp / max(tp+fp, 1)
            f1 = 2*prec*rec / max(prec+rec, 1e-9)
            # bonuses
            r += 2.0 * rec              # strong recall bonus
            r += 1.0 * f1               # F1 shaping
            if rec >= 0.90: r += 1.0
            if f1  >= 0.90: r += 0.5

        obs = np.zeros(self.observation_space.shape[0], dtype=np.float32) if done else self._obs()
        self.global_steps += 1
        return obs, float(r), done, False, {}

# ============== Make vectorized envs ==============
def make_env():
    """Factory for one SeqEnv over the time-sorted test-set arrays (shared by training and evaluation)."""
    return SeqEnv(stack5, y_seq, episodes, slack, late_rolling, ewma, dow2, hour2, value, K=5)

# Four env copies with running observation normalisation (clipped at +/-10);
# rewards are left unnormalised.
n_envs=4
venv = make_vec_env(make_env, n_envs=n_envs)
venv = VecMonitor(venv)
venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0)

# ============== PPO Agent (exploratory but stable) ==============
# n_steps * n_envs = 2048 * 4 = 8192 = batch_size, i.e. one minibatch per epoch.
ppo = PPO(
    "MlpPolicy", venv, seed=42, verbose=0, device=rl_device,
    batch_size=8192, n_steps=2048, learning_rate=3e-4,
    gamma=0.995, gae_lambda=0.95, clip_range=0.2,
    ent_coef=0.03, vf_coef=0.5, n_epochs=12,
    target_kl=0.02, policy_kwargs=dict(net_arch=[128,128], ortho_init=True)
)

PPO_STEPS = 300_000
t0=time.time(); venv.training=True
ppo.learn(total_timesteps=PPO_STEPS)
print(f"[TIME] PPO learn: {time.time()-t0:.1f}s")
# Persist the normalisation statistics so evaluation can reuse them.
venv.save("ppo_vecnorm.pkl")

# ============== Deterministic rollout (same norm stats) ==============
# Fresh single env that reuses the saved normalisation statistics (frozen).
# NOTE: make_env serves the same arrays the agent was trained on, so these are
# in-sample numbers for PPO.
eval_env = make_vec_env(make_env, n_envs=1)
eval_env = VecMonitor(eval_env)
eval_env = VecNormalize.load("ppo_vecnorm.pkl", eval_env)
eval_env.training=False

# Row positions (into the time-sorted arrays) in the order the env serves them:
# episode 0 first, then episode 1, ...
eval_pos = np.concatenate([np.arange(ep.start, ep.stop) for ep in episodes])
total_steps = len(eval_pos)

obs = eval_env.reset(); preds=[]
for _step in range(total_steps):
    a,_ = ppo.predict(obs, deterministic=True)
    # `a` is a length-1 array for the single env; take its element explicitly.
    preds.append(int(np.asarray(a).reshape(-1)[0]))
    # The VecEnv resets a finished env itself and already returns the first
    # observation of the next episode. Calling reset() again here (as before)
    # advanced the episode pointer a second time and skipped every other episode.
    obs,_,dones,_ = eval_env.step(a)
preds = np.array(preds, dtype=int)
# Labels for exactly the rows that were visited. Episodes may not cover every
# row (short runs are dropped), so comparing against y_seq by position would
# misalign predictions and labels.
y_eval = y_seq[eval_pos]

# ============== Static ensemble comparator ==============
# Strong static comparator: mean of expert logits -> sigmoid -> 0/1 @ 0.5
def sigmoid(z): return 1/(1+np.exp(-z))
ens_logit = stack_logits[ord_idx][eval_pos][:,:5].mean(axis=1)  # same rows, same order as preds
ens_pred  = (sigmoid(ens_logit) >= 0.5).astype(int)

# ============== Metrics: overall + per-class + delta ==============
def report(name, ytrue, yhat):
    """Print weighted metrics, the per-class report and the confusion matrix.

    Returns a dict with Accuracy, Precision, Recall, F1 (weighted) and
    Recall_late (recall of class 1; NaN when class 1 is absent from `ytrue`).
    """
    print(f"\n--- {name} ---")
    stats = metric_dict(ytrue, yhat)   # computed once, used for printing and the return value
    print("Acc/Prec/Rec/F1 (weighted):",
          {k:round(v,6) for k,v in stats.items()})
    print(classification_report(ytrue, yhat, digits=4))
    cm = confusion_matrix(ytrue, yhat)
    print("Confusion Matrix:\n", cm)
    if 1 in np.unique(ytrue):
        recall_late = recall_score(ytrue, yhat, labels=[1], average=None, zero_division=1)[0]
    else:
        recall_late = np.nan
    return {**stats, "Recall_late": recall_late}

# also collect expert single-model stats (prob->0/1), on the full test split
res = {}
res["XGBoost"] = report("XGBoost", y_te, (proba_xgb>=0.5).astype(int))
for k in ["CNN","LSTM","Bi-LSTM","Stacked LSTM"]:
    res[k] = report(k, y_te, (dl_outs[k]>=0.5).astype(int))

# PPO and the static ensemble are compared on the same episode-covered rows.
print("\n[NOTE] PPO/Ensemble evaluated on time-sorted test sequence (same items).")
print(f"[NOTE] Episode coverage: {len(eval_pos)}/{len(y_seq)} rows")
res["Static Ensemble"] = report("Static Ensemble (avg logits)", y_eval, ens_pred)
res["PPO Sequential"]  = report("PPO Sequential", y_eval, preds)

# concise final table
def row(d): return [d["Accuracy"], d["Precision"], d["Recall"], d["F1"], d["Recall_late"]]
import pprint
table = pd.DataFrame(
    [[name, *row(stats)] for name, stats in res.items()],
    columns=["Model", "Accuracy", "Precision", "Recall", "F1", "Recall_late"],
)
print("\n=== Summary (weighted) + Recall_late ===")
print(table.to_string(index=False))
table.to_csv("metrics_ppov3_with_recall_late.csv", index=False)

print(f"\n[OK] Saved: metrics_ppov3_with_recall_late.csv")
print(f"[TOTAL] {time.time()-t0_all:.1f}s")


[INFO] RL device: cpu
[INFO] Loaded: (180519, 53)
[INFO] TME: 0.18s


2025-08-17 13:47:46.950740: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.
2025-08-17 13:48:51.971308: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[TIME] PPO learn: 34.3s

--- XGBoost ---
Acc/Prec/Rec/F1 (weighted): {'Accuracy': 0.974435, 'Precision': 0.975569, 'Recall': 0.974435, 'F1': 0.974354}
              precision    recall  f1-score   support

           0     0.9999    0.9435    0.9709     16308
           1     0.9555    0.9999    0.9772     19796

    accuracy                         0.9744     36104
   macro avg     0.9777    0.9717    0.9740     36104
weighted avg     0.9756    0.9744    0.9744     36104

Confusion Matrix:
 [[15386   922]
 [    1 19795]]

--- CNN ---
Acc/Prec/Rec/F1 (weighted): {'Accuracy': 0.974463, 'Precision': 0.975599, 'Recall': 0.974463, 'F1': 0.974382}
              precision    recall  f1-score   support

           0     1.0000    0.9435    0.9709     16308
           1     0.9555    1.0000    0.9772     19796

    accuracy                         0.9745     36104
   macro avg     0.9777    0.9717    0.9741     36104
weighted avg     0.9756    0.9745    0.9744     36104

Confusion Matrix:
 [[1

In [11]:
# ==============================================================
# DataCo: Experts (XGB + 4 DL) → Sequential PPO that MUST add value
# Upgrades:
# - Drift-aware episodes: (route, month)
# - Inputs: expert logits + slack + EWMA risk + rolling late-rate + DOW/HOUR
# - Rewards: value/duration-scaled FN, TP bonus, terminal Recall/F1 bonus
# - Anti-imitation: penalize matching static ensemble near uncertainty/drift
# - VecNormalize, multi-env, more exploration, longer training
# ==============================================================

import os, gc, time, warnings, random
warnings.filterwarnings("ignore")

# mac stability
os.environ["OMP_NUM_THREADS"]="4"; os.environ["OPENBLAS_NUM_THREADS"]="4"
os.environ["MKL_NUM_THREADS"]="4"; os.environ["VECLIB_MAXIMUM_THREADS"]="4"
os.environ["NUMEXPR_NUM_THREADS"]="4"

import numpy as np; np.random.seed(42)
import pandas as pd; random.seed(42)

# --------- TensorFlow for DL heads ----------
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow.keras import mixed_precision
# Mixed precision and GPU memory growth are best-effort optimisations: a failure
# must not abort the run, but it is reported instead of being silently swallowed.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate.
try:
    mixed_precision.set_global_policy("mixed_float16")
except Exception as exc:
    print(f"[WARN] mixed_float16 policy not enabled: {exc}")
try:
    for g in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(g, True)
except Exception as exc:
    print(f"[WARN] GPU memory growth not enabled: {exc}")

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Bidirectional, Flatten, Input
from tensorflow.keras.optimizers import Adam

# --------- PyTorch / SB3 for PPO ----------
import torch
# Use the "mps" device for PPO when PyTorch reports it available, otherwise CPU.
rl_device = "mps" if torch.backends.mps.is_available() else "cpu"
print("[INFO] RL device:", rl_device)

# Sklearn & metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)

from xgboost import XGBClassifier

# Gym/SB3
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, VecMonitor

# ================= A) Load / cache parquet =================
# >>>> CHANGE THIS PATH <<<<
# Default location; it is overridden when the SCAI_DATA environment variable
# names a folder that actually contains the CSV, so the cell also runs on
# machines other than the author's without editing the path.
file_path = "/Users/dhadel/Downloads/DataCoSupplyChainDataset.csv"
_env_dir = os.getenv("SCAI_DATA")
if _env_dir:
    _env_csv = os.path.join(os.path.expanduser(_env_dir), os.path.basename(file_path))
    if os.path.exists(_env_csv):
        file_path = _env_csv
# Parquet cache lives next to the CSV.
parquet_path = os.path.splitext(file_path)[0] + ".parquet"

t0_all = time.time()
if not os.path.exists(parquet_path):
    print("[INFO] Converting CSV -> Parquet...")
    # Chunked read with latin-1 decoding; malformed rows are skipped.
    chunks = []
    for ch in pd.read_csv(file_path, engine="python", encoding="latin-1",
                          on_bad_lines="skip", chunksize=20000):
        chunks.append(ch)
    pd.concat(chunks, ignore_index=True).to_parquet(parquet_path, index=False)
    del chunks; gc.collect()

df = pd.read_parquet(parquet_path)
print("[INFO] Loaded:", df.shape)

# ================= B) Column mapping =================
def first_col(d, names):
    """Return the first entry of `names` that is a column of `d`, or None if none match."""
    present = (candidate for candidate in names if candidate in d.columns)
    return next(present, None)

# Resolve each logical field to whichever of its known column spellings exists.
col_real  = first_col(df, ["Days for shipping (real)", "Days for shipping (real)_", "TPT"])
col_sched = first_col(df, ["Days for shipment (scheduled)", "Days for shipment (scheduled)_"])
col_y     = first_col(df, ["Late_delivery_risk", "Is_Late"])
col_qty   = first_col(df, ["Order Item Quantity", "Unit quantity", "Quantity"])
col_org   = first_col(df, ["Order City", "Order Region", "Order Country"])
col_dst   = first_col(df, ["Customer City", "Customer Region", "Customer Country"])
col_car   = first_col(df, ["Shipping Mode", "Carrier", "Shipment Mode", "Ship Mode"])
col_date  = first_col(df, ["Order Date (DateOrders)", "Order Date", "DateOrders", "Date"])
col_val   = first_col(df, ["Order Item Total", "Sales"])

# target if missing: late means real shipping days exceed scheduled days
if col_y is None:
    if (col_real is None) or (col_sched is None):
        raise ValueError("Need Late_delivery_risk or both real/scheduled days.")
    df["__y__"] = (pd.to_numeric(df[col_real], errors="coerce") >
                   pd.to_numeric(df[col_sched], errors="coerce")).astype(int)
    col_y = "__y__"

# Quantity and both shipping-day columns are used unconditionally below; without
# this check a missing one surfaces as an opaque `KeyError: None`.
_required = {"quantity": col_qty, "real shipping days": col_real,
             "scheduled shipping days": col_sched}
_missing = [label for label, c in _required.items() if c is None]
if _missing:
    raise ValueError("Missing required column(s): " + ", ".join(_missing))

# Keep only the resolved columns (optional ones may be None and are dropped).
keep = [c for c in [col_qty, col_real, col_sched, col_y, col_org, col_dst, col_car, col_date, col_val] if c]
df = df[keep].copy()

# numerics: unparsable values become NaN and those rows are removed
for c in [col_qty, col_real, col_sched]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df = df.dropna(subset=[col_y, col_real, col_sched, col_qty])

y_all = df[col_y].astype(int).to_numpy()

# ================= C) Base features =================
# Numeric features.
# NOTE(review): tpt_real and lead_dev come from the realised shipping time, and
# the fallback target above is defined as real > scheduled -- this looks like
# target leakage into the experts; confirm it is intended.
num_df = pd.DataFrame(index=df.index)
num_df["qty"]         = df[col_qty]
num_df["tpt_real"]    = df[col_real]
num_df["tpt_sched"]   = df[col_sched]
num_df["lead_dev"]    = num_df["tpt_real"] - num_df["tpt_sched"]
# Floor quantity at 1 for both derived features. The previous
# `replace(0, nan)` followed by clip left NaN for zero quantities (clip keeps
# NaN), and that NaN then flowed into the scaled inputs of the DL models.
qty_floor = num_df["qty"].clip(lower=1)
num_df["tpt_per_unit"]= num_df["tpt_real"] / qty_floor
num_df["log_qty"]     = np.log1p(qty_floor)

# Categorical keys (origin, destination, carrier); constant "NA" when unmapped.
cat_df = pd.DataFrame(index=df.index)
cat_df["org"] = df[col_org].astype(str) if col_org else "NA"
cat_df["dst"] = df[col_dst].astype(str) if col_dst else "NA"
cat_df["car"] = df[col_car].astype(str) if col_car else "NA"

# ================= D) Split then Target Mean Encoding =================
X_num_all = num_df.to_numpy(dtype="float32")
X_cat_all = cat_df.copy()

# Stratified 80/20 split; X_cat_* keep the original df index, X_num_*/y_* are
# positional arrays in the same row order.
X_cat_tr, X_cat_te, X_num_tr, X_num_te, y_tr, y_te = train_test_split(
    X_cat_all, X_num_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

def kfold_tme(X_cat: pd.DataFrame, y: np.ndarray, n_splits=5, alpha=10.0, seed=42):
    """
    Smoothed target-mean encoding with out-of-fold values for the fitted rows.

    For every column, each row is encoded from category statistics computed on
    the other folds; categories unseen in those folds get the global mean.
    A second table fitted on all rows is returned for encoding new data.

    Returns:
        (encoded, maps): `encoded` is a float32 DataFrame aligned to
        `X_cat.index`; `maps[col]` is `[category -> smoothed mean dict, global mean]`.
    """
    prior = float(y.mean())

    def _smoothed_means(keys, targets):
        # (count * mean + alpha * prior) / (count + alpha) per category
        agg = pd.DataFrame({"k": keys, "y": targets}).groupby("k")["y"].agg(["count", "mean"])
        return (agg["count"] * agg["mean"] + alpha * prior) / (agg["count"] + alpha)

    folder = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    encoded = pd.DataFrame(index=X_cat.index, dtype="float32")
    fitted = {name: [] for name in X_cat.columns}
    for name in X_cat.columns:
        pos = X_cat.columns.get_loc(name)
        oof = pd.Series(index=X_cat.index, dtype="float32")
        for fit_rows, held_rows in folder.split(np.zeros(len(y)), y):
            table = _smoothed_means(X_cat.iloc[fit_rows, pos].values, y[fit_rows])
            held_keys = X_cat.iloc[held_rows, pos]
            oof.iloc[held_rows] = held_keys.map(table).fillna(prior).astype("float32")
        encoded[name] = oof.values
        full_table = _smoothed_means(X_cat[name].values, y)
        fitted[name] = [full_table.to_dict(), float(prior)]
    return encoded.astype("float32"), fitted

def apply_tme(X_cat: pd.DataFrame, maps):
    """
    Encode `X_cat` with fitted target-mean maps.

    `maps[col]` is a `(lookup, fallback)` pair: categories found in `lookup`
    take its value, all others take `fallback`. Returns a float32 DataFrame
    with the same index and column order as `X_cat`.
    """
    encoded = {}
    for name in X_cat.columns:
        lookup, fallback = maps[name]
        mapped = X_cat[name].map(lookup)
        encoded[name] = mapped.fillna(fallback).astype("float32")
    return pd.DataFrame(encoded, index=X_cat.index)

# Fit the target-mean encoders on the training split only, then apply the
# full-train maps to the test split.
t0 = time.time()
Xtr_cat_tme, encoders = kfold_tme(X_cat_tr, y_tr, n_splits=5, alpha=10.0)
Xte_cat_tme = apply_tme(X_cat_te, encoders)
print(f"[INFO] TME: {time.time()-t0:.2f}s")

# Tabular matrix = numeric features followed by the encoded categoricals.
Xtr_tab = np.hstack([X_num_tr, Xtr_cat_tme.to_numpy()])
Xte_tab = np.hstack([X_num_te, Xte_cat_tme.to_numpy()])

# Standardise with statistics from the training split.
scaler_tab = StandardScaler()
Xtr_tab_s = scaler_tab.fit_transform(Xtr_tab).astype("float32")
Xte_tab_s = scaler_tab.transform(Xte_tab).astype("float32")

# Add a trailing channel axis: (rows, features) -> (rows, features, 1) for Conv1D/LSTM.
Xtr_3d = Xtr_tab_s.reshape(-1, Xtr_tab_s.shape[1], 1)
Xte_3d = Xte_tab_s.reshape(-1, Xte_tab_s.shape[1], 1)

def ds(X, y, batch=1024, shuffle=True):
    """Build a batched, prefetching tf.data pipeline from (X, y); labels are cast to int32."""
    pipeline = tf.data.Dataset.from_tensor_slices((X, y.astype(np.int32)))
    if shuffle:
        # Fixed seed, reshuffled at the start of every epoch.
        pipeline = pipeline.shuffle(65536, seed=42, reshuffle_each_iteration=True)
    pipeline = pipeline.batch(batch)
    return pipeline.prefetch(tf.data.AUTOTUNE)

# The test pipeline is not shuffled, so predictions stay aligned with y_te.
ds_tr = ds(Xtr_3d, y_tr)
ds_te = ds(Xte_3d, y_te, shuffle=False)

# ================= E) Metrics helpers =================
def metric_dict(y_true, y_hat_or_proba):
    """
    Accuracy and weighted precision/recall/F1 as plain floats.

    Input that contains only 0/1 values is used as labels directly; anything
    else is treated as scores and thresholded at 0.5.
    """
    flat = np.asarray(y_hat_or_proba).reshape(-1)
    if set(np.unique(flat)) <= {0, 1}:
        labels = flat.astype(int)
    else:
        labels = (flat >= 0.5).astype(int)
    weighted = dict(average="weighted", zero_division=1)
    return {
        "Accuracy": float(accuracy_score(y_true, labels)),
        "Precision": float(precision_score(y_true, labels, **weighted)),
        "Recall": float(recall_score(y_true, labels, **weighted)),
        "F1": float(f1_score(y_true, labels, **weighted)),
    }

# ================= F) Train experts =================
# Class weight = negatives / positives in the training split (guarded against a
# zero positive rate).
pos_ratio = float(np.mean(y_tr))
scale_pos_weight = float((1 - pos_ratio) / max(pos_ratio, 1e-6))

# Gradient-boosted expert, trained on the unscaled tabular matrix.
xgb = XGBClassifier(
    n_estimators=650, learning_rate=0.06, max_depth=6,
    subsample=0.9, colsample_bytree=0.9,
    tree_method="hist", eval_metric="logloss",
    random_state=42, n_jobs=4, scale_pos_weight=scale_pos_weight
)
# proba_xgb holds the positive-class probability for every test row.
xgb.fit(Xtr_tab, y_tr); proba_xgb = xgb.predict_proba(Xte_tab)[:,1]

def build_dl(kind, d):
    """Build and compile one deep-learning expert over a (d, 1) input.

    Args:
        kind: architecture name; one of "CNN", "LSTM", "Stacked LSTM",
            "Bi-LSTM".
        d: length of the input sequence (each step has one channel).

    Returns:
        A compiled Keras Sequential model ending in a single sigmoid unit,
        trained with binary cross-entropy and Adam(1e-3).

    Raises:
        ValueError: if `kind` is not a supported architecture. Without this
            check a mistyped name would silently yield a model consisting of
            only the Dense head applied to the 3-D input.
    """
    m = Sequential([Input(shape=(d, 1))])
    if kind == "CNN":
        m.add(Conv1D(64, 3, activation="relu"))
        m.add(Flatten())
    elif kind == "LSTM":
        m.add(LSTM(64))
    elif kind == "Stacked LSTM":
        # First LSTM must emit the full sequence so the second can consume it.
        m.add(LSTM(64, return_sequences=True))
        m.add(LSTM(32))
    elif kind == "Bi-LSTM":
        m.add(Bidirectional(LSTM(64)))
    else:
        raise ValueError(
            f"Unknown DL expert kind {kind!r}; expected one of "
            "'CNN', 'LSTM', 'Stacked LSTM', 'Bi-LSTM'"
        )
    # The output layer is pinned to float32 because the file tries to enable
    # the global "mixed_float16" policy at import time.
    m.add(Dense(1, activation="sigmoid", dtype="float32"))
    m.compile(optimizer=Adam(1e-3), loss="binary_crossentropy")
    return m

DL_EPOCHS = 6
_seq_len = Xtr_3d.shape[1]

# Train each architecture in turn and keep only its flattened test-set
# predictions; the model itself is dropped before the next one is built.
dl_outs = {}
for kind in ("CNN", "LSTM", "Bi-LSTM", "Stacked LSTM"):
    mdl = build_dl(kind, _seq_len)
    mdl.fit(ds_tr, epochs=DL_EPOCHS, verbose=0)
    _test_pred = mdl.predict(ds_te, verbose=0)
    dl_outs[kind] = _test_pred.reshape(-1)
    del mdl
    gc.collect()

# ================= G) Build stacked logits for PPO =================
def to_logit(p, eps=1e-6):
    """Convert probabilities to log-odds.

    Inputs are clipped to [eps, 1 - eps] first so that exact 0 or 1 map to
    finite values.
    """
    bounded = np.clip(p, eps, 1 - eps)
    log_pos = np.log(bounded)
    log_neg = np.log(1 - bounded)
    return log_pos - log_neg

# One column of logits per expert, in the fixed order
# XGBoost, CNN, LSTM, Bi-LSTM, Stacked LSTM.
_expert_probas = [proba_xgb]
_expert_probas.extend(dl_outs[k] for k in ("CNN", "LSTM", "Bi-LSTM", "Stacked LSTM"))
stack_logits = np.vstack([to_logit(p) for p in _expert_probas]).T.astype("float32")

# Standardise each expert's logit column (statistics come from these same rows).
scaler_stack = StandardScaler()
scaler_stack.fit(stack_logits)
stack5 = scaler_stack.transform(stack_logits).astype("float32")

# ================= H) Drift-aware episodes: (route, month) =================
idx_te = X_cat_te.index
if not (col_date and col_date in df.columns):
    # No usable date column: fabricate hourly timestamps in test-row order.
    _synthetic = pd.date_range("2000-01-01", periods=len(idx_te), freq="H")
    tseries = pd.Series(_synthetic, index=idx_te)
else:
    # Unparseable dates become NaT rather than raising.
    tseries = pd.to_datetime(df.loc[idx_te, col_date], errors="coerce")

def s(col):
    """Return column `col` of df for the test rows, cast to str.

    Falls back to a constant "NA" Series on the test index when `col` is
    falsy or is not a column of df.
    """
    if not col or col not in df.columns:
        return pd.Series("NA", index=idx_te)
    return df.loc[idx_te, col].astype(str)

# Route key = origin|destination|carrier strings; month key = calendar month
# of the timestamp, both still in test-row order at this point.
_origin, _dest, _carrier = s(col_org), s(col_dst), s(col_car)
_route_by_row = _origin + "|" + _dest + "|" + _carrier
_month_by_row = tseries.dt.to_period("M").astype(str)

# Put every per-item array into chronological order.
ord_idx = np.argsort(tseries.values)
y_seq = y_te[ord_idx]
stack5 = stack5[ord_idx]
route = _route_by_row.iloc[ord_idx].values
month = _month_by_row.iloc[ord_idx].values

# Sequential signal: slack = tpt_sched - tpt_real, per time-sorted item.
num_te_sorted = num_df.loc[idx_te].iloc[ord_idx]
_sched = num_te_sorted["tpt_sched"].values
_real = num_te_sorted["tpt_real"].values
slack = (_sched - _real).astype("float32")

# per-route rolling late rate (window=10), strictly causal:
# late_roll[i] is the mean label of up to 10 *earlier* items on the same
# route. The current label is appended only after the feature is computed,
# so the observation for item i never contains y_seq[i] itself. The first
# item seen on a route has no history and gets 0.0.
late_roll = np.zeros_like(y_seq, dtype="float32")
memo = {}
for i, (r, yv) in enumerate(zip(route, y_seq)):
    hist = memo.setdefault(r, [])
    wnd = hist[-10:]
    late_roll[i] = np.mean(wnd) if wnd else 0.0
    hist.append(int(yv))

# Exponentially weighted mean (alpha=0.2, recursive form) of the standardised
# XGB logit, which is column 0 of stack5, taken along the time-sorted sequence.
xgb_sig = stack5[:, 0]
_smoothed = pd.Series(xgb_sig).ewm(alpha=0.2, adjust=False).mean()
ewma = _smoothed.values.astype("float32")
# Squash the smoothed signal into (0, 1) with a logistic so it reads as a risk.
ewma_sig = 1 / (1 + np.exp(-ewma))

# cyclical DOW/HOUR
# Day-of-week and hour-of-day of each item, in time-sorted order.
_ts_sorted = pd.to_datetime(tseries.iloc[ord_idx])
dow = _ts_sorted.dt.dayofweek.values
hour = _ts_sorted.dt.hour.values
def cyc(a, K):
    """Encode values `a` of period `K` as float32 (sin, cos) column pairs."""
    angle = 2*np.pi*a/K
    sin_col = np.sin(angle)
    cos_col = np.cos(angle)
    return np.stack([sin_col, cos_col], axis=1).astype("float32")
dow2 = cyc(dow, 7)
hour2 = cyc(hour, 24)

# Per-item value used to scale the false-negative penalty. Missing values
# count as 0; without a value column every item weighs the same.
if not (col_val and col_val in df.columns):
    value = np.ones_like(y_seq, dtype="float32")
else:
    _raw_value = df.loc[idx_te, col_val].fillna(0).values.astype("float32")
    value = _raw_value[ord_idx]

# Normalise by the 90th percentile, then bound the multiplier to [0.5, 3.0].
if value.size:
    v90 = np.percentile(value, 90)
else:
    v90 = 1.0
value = np.clip(value / (v90 + 1e-6), 0.5, 3.0).astype("float32")

# Episodes are maximal runs of consecutive time-sorted items sharing the same
# (route, month) pair; runs shorter than MIN_LEN are discarded.
MIN_LEN = 6


def same_pair(i, j):
    """True when items i and j have the same route and the same month."""
    return not (route[i] != route[j] or month[i] != month[j])


episodes = []
_n_items = len(y_seq)
start = 0
for _end in range(1, _n_items + 1):
    # A run continues while the next item exists and matches the previous one.
    if _end < _n_items and same_pair(_end - 1, _end):
        continue
    if _end - start >= MIN_LEN:
        episodes.append(slice(start, _end))
    start = _end

# Fallback: no run was long enough, so treat the whole sequence as one episode.
if len(episodes) == 0:
    episodes = [slice(0, _n_items)]

# ================= I) Static ensemble (for anti-imitation) =================
# Static ensemble on the time-sorted items: mean of the (unscaled) expert
# logits -> sigmoid probability -> 0/1 action at the 0.5 threshold.
_sorted_logits = stack_logits[ord_idx]
ens_logit_sorted = _sorted_logits[:, :5].mean(axis=1)
ens_prob_sorted = 1 / (1 + np.exp(-ens_logit_sorted))
ens_act_sorted = (ens_prob_sorted >= 0.5).astype(int)

# ================= J) PPO Env with anti-imitation =================
class SeqEnv(gym.Env):
    """Episode-based binary decision environment over pre-sorted items.

    Every reset() advances round-robin to the next slice in `episodes`; at
    step t the agent observes item idx[t] and picks action 0 or 1. The reward
    compares the action with label y (y == 1 is the positive class for the
    TP/FN accounting) and adds an annealed per-step penalty, an
    anti-imitation term relative to a static ensemble, two small shaping
    terms, and a terminal recall/F1 bonus.

    Args:
        X: per-item score matrix indexed as X[i]; its second dimension is the
            number of score features placed at the front of the observation.
        y: per-item labels; cast to int.
        episodes: non-empty sequence of slices with explicit start/stop.
        slack, late_roll, ewma_sig: per-item scalars appended to the
            observation; slack also scales the false-negative penalty and
            late_roll/ewma_sig feed the anti-imitation weight.
        dow2, hour2: per-item vectors appended to the observation
            (obs_dim assumes two entries each).
        value: per-item multiplier on the false-negative penalty.
        ens_prob, ens_act: static-ensemble probability and 0/1 action per
            item, used only by the anti-imitation term.
        K: length of the action-history buffer in the observation.
        init_pen, final_pen: per-step penalty at global step 0 and once
            `anneal_steps` steps have elapsed (linear in between).
        anneal_steps: number of steps over which the penalty anneals; a
            non-positive value applies `final_pen` from the first step.
    """

    metadata={"render_modes":[]}

    def __init__(self, X, y, episodes, slack, late_roll, ewma_sig, dow2, hour2, value,
                 ens_prob, ens_act, K=5,
                 init_pen=0.002, final_pen=0.02, anneal_steps=100_000):
        super().__init__()
        self.X, self.y = X, y.astype(int)
        self.episodes = episodes
        self.slack = slack; self.late_roll=late_roll; self.ewma_sig=ewma_sig
        self.dow2=dow2; self.hour2=hour2; self.value=value
        self.ens_prob=ens_prob; self.ens_act=ens_act
        self.K=K; self.init_pen=init_pen; self.final_pen=final_pen; self.anneal_steps=anneal_steps
        # Width of the score block comes from X itself instead of a fixed 5,
        # so the declared observation shape always matches what _obs() emits.
        n_scores = int(np.shape(X)[1])
        # obs: scores + 2(pos) + K(hist) + 2(FP/FN) + 1(slack) + 1(roll) + 1(ewma) + 2(dow) + 2(hour)
        self.obs_dim = n_scores + 2 + K + 2 + 1 + 1 + 1 + 2 + 2
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32)
        self.action_space = spaces.Discrete(2)
        # _ep starts at -1 so the first reset() selects episode 0;
        # global_steps counts steps across episodes for the penalty anneal.
        self._ep=-1; self.global_steps=0

    def _tfeat(self, t, T):
        """Return (sin, cos) of the relative position of step t in an episode of length T."""
        pos = t/max(T-1,1)
        return np.array([np.sin(2*np.pi*pos), np.cos(2*np.pi*pos)], dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Move to the next episode (cyclically), clear per-episode state, return (obs, info)."""
        super().reset(seed=seed)
        self._ep = (self._ep+1) % len(self.episodes)
        sl = self.episodes[self._ep]
        self.idx = np.arange(sl.start, sl.stop, dtype=int)
        # last: previous K actions (oldest first); fp/fn: running error counts;
        # prev_fn: FN-rate after the previous step; ppo_preds: actions so far.
        self.t=0; self.last=np.zeros(self.K,dtype=np.float32)
        self.fp=0.0; self.fn=0.0; self.prev_fn=0.0; self.ppo_preds=[]
        return self._obs(), {}

    def _obs(self):
        """Assemble the float32 observation for the current step."""
        T=len(self.idx); i=self.idx[self.t]
        return np.concatenate([
            self.X[i], self._tfeat(self.t,T), self.last,
            np.array([self.fp,self.fn],dtype=np.float32),
            np.array([self.slack[i], self.late_roll[i], self.ewma_sig[i]], dtype=np.float32),
            self.dow2[i], self.hour2[i]
        ]).astype(np.float32)

    def step(self, a):
        """Apply action `a` to the current item.

        Returns:
            (obs, reward, terminated, truncated, info); terminated is True
            after the last item of the episode, in which case obs is all
            zeros. truncated is always False and info is empty.
        """
        # Accept a Python int, a NumPy scalar or a size-1 array; a non-scalar
        # action would otherwise broadcast in the terminal confusion counts.
        a = int(np.asarray(a).item())
        i=self.idx[self.t]; yv=self.y[i]
        # Base rewards: heavy FN penalty scaled by value and by how negative
        # the slack is (overrun clipped to [0, 10]).
        dur = 1.0 + np.clip(float(self.slack[i] < 0) * (-self.slack[i]), 0, 10.0)
        if   (a==1 and yv==1): r = 3.0   # TP bonus
        elif (a==0 and yv==0): r = 1.0   # TN
        elif (a==1 and yv==0): r = -2.0  # FP
        else:                  r = -12.0 * self.value[i] * (1.0 + 0.1*dur)  # **very strong FN**
        # curriculum step penalty, linearly annealed from init_pen to final_pen
        progress = min(1.0, self.global_steps/self.anneal_steps) if self.anneal_steps > 0 else 1.0
        pen = self.init_pen + (self.final_pen - self.init_pen) * progress
        r -= pen

        # Anti-imitation: discourage copying ensemble near uncertainty/drift
        ens_p = self.ens_prob[i]; ens_a = self.ens_act[i]
        uncertainty = 1.0 - min(1.0, 2.0*abs(ens_p - 0.5))     # 1 @ 0.5, 0 @ 0 or 1
        drift = 0.5*self.late_roll[i] + 0.5*self.ewma_sig[i]   # ~[0,1], high → drift/risk
        weight = 0.02 * (0.5*uncertainty + 0.5*drift)          # up to ~0.02
        if a == ens_a: r -= weight
        else:          r += weight

        # shaping: reward improvement in online FN-rate
        self.fn += float(yv==1 and a==0); self.fp += float(yv==0 and a==1)
        steps=float(self.t+1); fn_rate=self.fn/max(steps,1.0)
        if fn_rate < self.prev_fn: r += 0.2
        self.prev_fn = fn_rate

        # discourage "late" streaks: penalise when the three actions *before*
        # this one were all 1 (checked before the history is updated)
        if self.t>=3 and np.allclose(self.last[-3:], 1.0): r -= 0.05

        # shift the history left and record the current action at the end
        self.last = np.roll(self.last, -1); self.last[-1]=float(a)
        self.ppo_preds.append(a)
        self.t += 1
        done = self.t >= len(self.idx)

        # terminal bonuses: emphasize late recall & F1 over the whole episode
        if done:
            idxs = self.idx
            y_ep = self.y[idxs]; a_ep = np.array(self.ppo_preds, dtype=int)
            tp = np.sum((a_ep==1) & (y_ep==1)); fn = np.sum((a_ep==0) & (y_ep==1))
            fp = np.sum((a_ep==1) & (y_ep==0))
            rec = tp / max(tp+fn, 1)
            prec= tp / max(tp+fp, 1)
            f1 = 2*prec*rec / max(prec+rec, 1e-9)
            r += 2.5 * rec + 1.0 * f1
            if rec >= 0.92: r += 1.0
            if f1  >= 0.92: r += 0.5

        obs = np.zeros(self.observation_space.shape[0], dtype=np.float32) if done else self._obs()
        self.global_steps += 1
        return obs, float(r), done, False, {}

# ================= K) Vectorized envs + PPO =================
def make_env():
    """Factory for make_vec_env: build a fresh SeqEnv over the time-sorted arrays."""
    return SeqEnv(
        X=stack5, y=y_seq, episodes=episodes,
        slack=slack, late_roll=late_roll, ewma_sig=ewma_sig,
        dow2=dow2, hour2=hour2, value=value,
        ens_prob=ens_prob_sorted, ens_act=ens_act_sorted,
        K=5,
    )

n_envs = 4
# Stack: SeqEnv copies -> episode monitor -> running observation normalisation
# (rewards are left unnormalised, observations clipped to +/-10).
venv = VecNormalize(
    VecMonitor(make_vec_env(make_env, n_envs=n_envs)),
    norm_obs=True, norm_reward=False, clip_obs=10.0,
)

_ppo_hparams = dict(
    seed=42, verbose=0, device=rl_device,
    batch_size=8192, n_steps=2048, learning_rate=3e-4,
    gamma=0.997, gae_lambda=0.95, clip_range=0.2,
    ent_coef=0.05, vf_coef=0.5, n_epochs=12,
    target_kl=0.02,
    policy_kwargs=dict(net_arch=[128, 128], ortho_init=True),
)
ppo = PPO("MlpPolicy", venv, **_ppo_hparams)

PPO_STEPS = 500_000  # raise experience + exploration
t0 = time.time()
venv.training = True
ppo.learn(total_timesteps=PPO_STEPS)
print(f"[TIME] PPO learn: {time.time()-t0:.1f}s")
# Persist the normalisation statistics so evaluation sees the same scaling.
venv.save("ppo_vecnorm_addvalue.pkl")

# ================= L) Deterministic rollout =================
# Single evaluation env that reuses the observation-normalisation statistics
# saved after training; training=False freezes those statistics.
eval_env = make_vec_env(make_env, n_envs=1)
eval_env = VecMonitor(eval_env)
eval_env = VecNormalize.load("ppo_vecnorm_addvalue.pkl", eval_env)
eval_env.training=False

# Time-sorted positions in the order the env serves them: a fresh SeqEnv
# starts at episodes[0] on its first reset and advances by one per reset.
visit_idx = np.concatenate([np.arange(ep.start, ep.stop) for ep in episodes])
total_steps = len(visit_idx)

obs = eval_env.reset()
rollout_act = np.empty(total_steps, dtype=int)
for k in range(total_steps):
    a,_ = ppo.predict(obs, deterministic=True)
    rollout_act[k] = int(np.asarray(a).reshape(-1)[0])
    # SB3 vectorised envs reset themselves when an episode ends and already
    # return the first observation of the next episode. Calling reset() again
    # here would advance SeqEnv a second time and skip every other episode.
    obs,_,_,_ = eval_env.step(a)

# Scatter the actions back to their time-sorted positions so preds[j] lines up
# with y_seq[j]. Items that belong to no episode (runs shorter than the
# minimum length) were never shown to the policy; they keep the static
# ensemble's action so that preds always has len(y_seq) entries.
preds = np.array(ens_act_sorted, dtype=int)
preds[visit_idx] = rollout_act

# ================= M) Baseline metrics + Static Ensemble =================
def report(name, ytrue, yhat):
    """Print an evaluation summary for one model and return its metrics.

    Prints the rounded weighted metrics, the sklearn classification report
    and the confusion matrix.

    Args:
        name: label used in the printed header.
        ytrue: true 0/1 labels (array).
        yhat: predicted 0/1 labels (array, same length).

    Returns:
        dict with weighted "Accuracy", "Precision", "Recall", "F1", plus
        "Recall_late" (recall of class 1, NaN when class 1 is absent from
        ytrue) and "FP_count" (number of predicted-1 / true-0 items).
    """
    print(f"\n--- {name} ---")
    rounded = {k: round(v, 6) for k, v in metric_dict(ytrue, yhat).items()}
    print("Acc/Prec/Rec/F1 (weighted):", rounded)
    print(classification_report(ytrue, yhat, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(ytrue, yhat))

    # Recall restricted to class 1; undefined when that class never occurs.
    if 1 not in np.unique(ytrue):
        recall_late = np.nan
    else:
        per_class = recall_score(ytrue, yhat, labels=[1], average=None, zero_division=1)
        recall_late = per_class[0]

    false_pos_mask = (yhat == 1) & (ytrue == 0)
    weighted = dict(average="weighted", zero_division=1)
    summary = {
        "Accuracy": accuracy_score(ytrue, yhat),
        "Precision": precision_score(ytrue, yhat, **weighted),
        "Recall": recall_score(ytrue, yhat, **weighted),
        "F1": f1_score(ytrue, yhat, **weighted),
        "Recall_late": recall_late,
        "FP_count": int(false_pos_mask.sum()),
    }
    return summary

# Expert single-model summaries (prob→0/1 at 0.5)
# Single experts are scored in the original test-row order (against y_te).
_expert_scores = {"XGBoost": proba_xgb}
_expert_scores.update((k, dl_outs[k]) for k in ("CNN", "LSTM", "Bi-LSTM", "Stacked LSTM"))

res = {}
for _model_name, _proba in _expert_scores.items():
    res[_model_name] = report(_model_name, y_te, (_proba >= 0.5).astype(int))

# Ensemble and PPO are scored in time-sorted order (against y_seq).
print("\n[NOTE] Ensemble/PPO evaluated on the same time-sorted test items.")
ens_pred = (ens_prob_sorted >= 0.5).astype(int)
res["Static Ensemble"] = report("Static Ensemble (avg logits)", y_seq, ens_pred)
res["PPO Sequential"]  = report("PPO Sequential (anti-imitation)", y_seq, preds)

# Deltas vs Static Ensemble (the thing PPO must beat)
_ppo_res = res["PPO Sequential"]
_ens_res = res["Static Ensemble"]
delta_recall_late = _ppo_res["Recall_late"] - _ens_res["Recall_late"]
delta_fp = _ppo_res["FP_count"] - _ens_res["FP_count"]

print("\n=== Δ vs Static Ensemble (PPO - Ensemble) ===")
print(f"Δ Recall_late: {delta_recall_late:+.6f}")
print(f"Δ FP_count   : {delta_fp:+d}")

# compact table: one row per model, in insertion order of `res`
_metric_cols = ["Accuracy", "Precision", "Recall", "F1", "Recall_late", "FP_count"]
_rows = [
    {"Model": m, **{col: res[m][col] for col in _metric_cols}}
    for m in res
]
table = pd.DataFrame(_rows, columns=["Model", *_metric_cols])
print("\n=== Summary (weighted) + Recall_late + FP_count ===")
print(table.to_string(index=False))
table.to_csv("metrics_ppov4_addvalue.csv", index=False)

print("\n[OK] Saved: metrics_ppov4_addvalue.csv")
print(f"[TOTAL] {time.time()-t0_all:.1f}s")


[INFO] RL device: cpu
[INFO] Loaded: (180519, 53)
[INFO] TME: 0.19s


2025-08-17 13:58:51.598064: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.
2025-08-17 13:59:57.020993: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[TIME] PPO learn: 58.8s

--- XGBoost ---
Acc/Prec/Rec/F1 (weighted): {'Accuracy': 0.974435, 'Precision': 0.975569, 'Recall': 0.974435, 'F1': 0.974354}
              precision    recall  f1-score   support

           0     0.9999    0.9435    0.9709     16308
           1     0.9555    0.9999    0.9772     19796

    accuracy                         0.9744     36104
   macro avg     0.9777    0.9717    0.9740     36104
weighted avg     0.9756    0.9744    0.9744     36104

Confusion Matrix:
 [[15386   922]
 [    1 19795]]

--- CNN ---
Acc/Prec/Rec/F1 (weighted): {'Accuracy': 0.974463, 'Precision': 0.975599, 'Recall': 0.974463, 'F1': 0.974382}
              precision    recall  f1-score   support

           0     1.0000    0.9435    0.9709     16308
           1     0.9555    1.0000    0.9772     19796

    accuracy                         0.9745     36104
   macro avg     0.9777    0.9717    0.9741     36104
weighted avg     0.9756    0.9745    0.9744     36104

Confusion Matrix:
 [[1