In [None]:
import os, time, random, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
random.seed(42); np.random.seed(42)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Bidirectional, Flatten, Input
from tensorflow.keras.optimizers import Adam

import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import configure

# PPO hyperparameters; all of them are consumed by train_with_csv_logger.
TOTAL_STEPS_PPO = 100_000  # default `total_timesteps` handed to model.learn()
BATCH_SIZE = 64            # passed to PPO as `batch_size`
N_STEPS = 2048             # passed to PPO as `n_steps`
LEARNING_RATE = 3e-4       # passed to PPO as `learning_rate`
ENT_COEF = 0.0             # passed to PPO as `ent_coef`
N_EPOCHS = 10              # passed to PPO as `n_epochs`

from pathlib import Path
# --- Locate and load the feature workbook -----------------------------------
# The workbook is searched relative to the current working directory so the
# script works whether it is launched from the project root or a sub-folder.
candidates = [
    Path("Data/Hybrid_Augmented_TSAFE_Features.xlsx"),
    Path("../Data/Hybrid_Augmented_TSAFE_Features.xlsx"),
    Path("../data/Hybrid_Augmented_TSAFE_Features.xlsx"),
]
for p in candidates:
    if p.exists():
        file_path = str(p)
        break
else:
    # for/else: reached only when no candidate path exists.
    raise FileNotFoundError(
        f"Could not find Excel. Tried: {candidates}. CWD={Path.cwd()}"
    )

df = pd.read_excel(file_path)

# Derive the combined plant/destination key when the workbook does not
# already carry it.
if 'Plant_Destination' not in df.columns:
    if {'Plant Code', 'Destination Port'}.issubset(df.columns):
        df['Plant_Destination'] = (
            df['Plant Code'].astype(str) + ' | ' + df['Destination Port'].astype(str)
        )
    else:
        raise ValueError("Cannot create 'Plant_Destination': missing 'Plant Code' or 'Destination Port'.")

# --- Feature selection --------------------------------------------------------
cat_features = ['Origin Port', 'Carrier', 'Plant Code', 'Destination Port', 'Plant_Destination']
num_features = [
    'Unit quantity', 'Weight', 'TPT',
    'TPT_per_Unit', 'LeadTime_Deviation', 'Weight_per_Unit', 'log_UnitQty',
    'carrier_origin_risk', 'route_cum_late_rate', 'route_bb_mean', 'carrier_bb_mean',
    'route_orders_last7d', 'route_roll10_Weight_q90',
    'congestion_trend', 'Weight_vsCarrierMean', 'seq_pos_norm'
]

# Only columns that actually exist are used; absent ones are reported, not fatal.
requested = [c for c in (cat_features + num_features) if c in df.columns]
missing = sorted(set(cat_features + num_features) - set(requested))
if missing:
    print(f"[WARN] Skipping missing columns: {missing}")

X_raw = pd.get_dummies(df[requested], drop_first=False)
# Binary target: 1 when the late-day count is strictly positive, else 0.
y = (df['Ship Late Day count'] > 0).astype(int)

# Replace +/-inf with NaN *before* taking medians. Computing the median on the
# frame that still contains inf can yield an inf median (when inf values
# dominate a column), which fillna would then write straight back in.
X_raw = X_raw.replace([np.inf, -np.inf], np.nan)
# NOTE(review): medians are taken over all rows, i.e. before the train/test
# split, so test rows influence the imputation values — confirm this is
# acceptable for the reported metrics.
X_raw = X_raw.fillna(X_raw.median(numeric_only=True))

X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.20, random_state=42, stratify=y
)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# The scaler is fitted on the resampled training data and applied to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled  = scaler.transform(X_test)

def create_model(model_type, input_dim):
    """Build and compile a small Keras binary classifier.

    The network takes inputs of shape ``(input_dim, 1)``, applies one of four
    feature extractors and ends in a single sigmoid unit.

    Args:
        model_type: Architecture name; one of ``'CNN'``, ``'LSTM'``,
            ``'Bi-LSTM'`` or ``'Stacked LSTM'``.
        input_dim: Length of the first input axis (the model input shape is
            ``(input_dim, 1)``).

    Returns:
        A compiled ``Sequential`` model (Adam, learning rate 1e-3,
        ``clipnorm=1.0``, binary cross-entropy loss).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported = ('CNN', 'LSTM', 'Bi-LSTM', 'Stacked LSTM')
    # Fail fast: without this check an unknown name would silently produce an
    # Input -> Dense(1) network whose output keeps the input_dim axis.
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}."
        )

    model = Sequential()
    model.add(Input(shape=(input_dim, 1)))
    if model_type == 'CNN':
        model.add(Conv1D(64, 2, activation='relu'))
        # Collapse the convolution output to one vector per sample.
        model.add(Flatten())
    elif model_type == 'LSTM':
        model.add(LSTM(64, activation='tanh'))
    elif model_type == 'Bi-LSTM':
        model.add(Bidirectional(LSTM(64, activation='tanh')))
    else:  # 'Stacked LSTM'
        # The first layer must return the full sequence to feed the second LSTM.
        model.add(LSTM(64, activation='tanh', return_sequences=True))
        model.add(LSTM(32, activation='tanh'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=Adam(learning_rate=1e-3, clipnorm=1.0),
        loss='binary_crossentropy',
    )
    return model

# --- Base learner 1: XGBoost, tuned with a small 3-fold grid search ----------
xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric="logloss", verbosity=0),
    param_grid={
        'n_estimators': [100],
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.05],
        'subsample': [0.8],
    },
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
# XGBoost is fitted on the original (non-resampled, unscaled) training split.
xgb.fit(X_train, y_train)
xgb_proba = xgb.best_estimator_.predict_proba(X_test)[:, 1]

# --- Base learners 2-5: Keras models on (samples, features, 1) tensors --------
X_train_dl = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
X_test_dl  = X_test_scaled.reshape(-1,  X_test_scaled.shape[1],  1)

dl_models = ['CNN', 'LSTM', 'Bi-LSTM', 'Stacked LSTM']
dl_outputs = {}
for m in dl_models:
    mdl = create_model(m, X_train_dl.shape[1])
    # The deep models train on the SMOTE-resampled, standardised data.
    mdl.fit(X_train_dl, y_train_res, epochs=10, batch_size=256, verbose=0)
    # Flatten the (n, 1) prediction matrix to a 1-D score vector.
    dl_outputs[m] = mdl.predict(X_test_dl, verbose=0).ravel()

# --- Static PPO inputs: one row per test sample, one column per base learner --
# Column order: xgb, CNN, LSTM, Bi-LSTM, Stacked LSTM (i.e. xgb + dl_models order).
ppo_input_static = np.vstack(
    [xgb_proba] + [dl_outputs[name] for name in dl_models]
).T.astype(np.float32)
# Rescale every score column to [0, 1], matching the env's Box(0, 1) space.
ppo_input_static = MinMaxScaler().fit_transform(ppo_input_static).astype(np.float32)
ppo_labels_static = y_test.values.astype(int)

class PPOHybridEnv(gym.Env):
    """Single-pass classification environment for PPO.

    Each step presents one row of ``inputs`` as the observation; the agent's
    binary action is compared with the matching entry of ``labels``. An
    episode is one pass over all rows, in order.

    Args:
        inputs: 2-D array; row ``i`` is the observation for sample ``i``.
        labels: Array of 0/1 targets (cast to ``int``), one per row.
        pos_reward: Reward when the action equals the label.
        neg_reward: Reward when the action differs from the label.
    """
    metadata = {"render_modes": []}

    def __init__(self, inputs, labels, pos_reward=1.0, neg_reward=-5.0):
        super().__init__()
        self.inputs = inputs
        self.labels = labels.astype(int)
        self.n = len(labels)
        self.pos_reward = pos_reward
        self.neg_reward = neg_reward
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(inputs.shape[1],), dtype=np.float32
        )
        self.action_space = spaces.Discrete(2)
        # Cursor into inputs/labels for the current step.
        self.idx = 0

    def reset(self, seed=None, options=None):
        """Rewind to the first row and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.idx = 0
        return self.inputs[self.idx], {}

    def step(self, action):
        """Score ``action`` against the current label and advance one row.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False``; once the last row has been consumed the
            observation is an all-zero vector and ``terminated`` is ``True``.
        """
        target = self.labels[self.idx]
        if action == target:
            reward = self.pos_reward
        else:
            reward = self.neg_reward

        self.idx += 1
        terminated = self.idx >= self.n
        if terminated:
            # No next row exists; emit a zero placeholder observation.
            obs = np.zeros(self.inputs.shape[1], dtype=np.float32)
        else:
            obs = self.inputs[self.idx]
        return obs, float(reward), terminated, False, {}

# --- Assemble the per-sample frame used by the sequential environment --------
test_idx = X_test.index
test_frame = df.loc[test_idx, ['Order Date', 'Origin Port', 'Destination Port', 'Carrier']].copy()
test_frame['Order Date'] = pd.to_datetime(test_frame['Order Date'])
test_frame['y'] = y_test.values.astype(int)
test_frame['xgb'] = xgb_proba
for m in dl_models:
    test_frame[m] = dl_outputs[m]
test_frame['route_key'] = (test_frame['Origin Port'].astype(str) + ' | ' +
                           test_frame['Destination Port'].astype(str) + ' | ' +
                           test_frame['Carrier'].astype(str))

# Sort by route first and by date within each route. Episodes are cut wherever
# the route changes between adjacent rows, so sorting by date alone interleaves
# routes and fragments every route into many 1-2 step episodes, leaving the
# environment's history features (last-K actions, cumulative FP/FN) empty.
test_frame = test_frame.sort_values(['route_key', 'Order Date']).reset_index(drop=True)

base_inputs = test_frame[['xgb', 'CNN', 'LSTM', 'Bi-LSTM', 'Stacked LSTM']].values.astype(np.float32)
base_inputs = MinMaxScaler().fit_transform(base_inputs).astype(np.float32)
labels_sorted = test_frame['y'].values.astype(int)
route_sorted  = test_frame['route_key'].values

# One episode per maximal run of identical route keys, stored as a slice into
# the sorted arrays above. After the sort this is exactly one episode per route.
episodes = []
n_rows = len(route_sorted)
if n_rows:
    route_arr = np.asarray(route_sorted, dtype=object)
    # Positions where a row's route differs from the previous row's route.
    cuts = np.flatnonzero(route_arr[1:] != route_arr[:-1]) + 1
    bounds = np.concatenate(([0], cuts, [n_rows]))
    episodes = [slice(int(lo), int(hi)) for lo, hi in zip(bounds[:-1], bounds[1:])]

class SequentialPPOEnv(gym.Env):
    """Episode-based classification environment with history features.

    Every episode walks through one slice of ``base_inputs``/``labels``.
    Successive ``reset`` calls cycle through ``episodes`` in order.

    Observation layout (length ``obs_dim``):
        * the base input row (``base_inputs.shape[1]`` values),
        * sin/cos encoding of the relative position inside the episode (2),
        * the last ``K`` actions, oldest first, zero-initialised (K),
        * cumulative false-positive and false-negative counts this episode (2).

    Reward per step:
        * correct action: +2.0 when the label is 1, +1.0 when it is 0;
        * action 0 on label 1 (false negative): -5.0;
        * any other mismatch (false positive): -2.0;
        * a constant -0.01 step cost;
        * +0.2 bonus when the running false-negative rate dropped compared
          with the previous step.

    Args:
        base_inputs: 2-D array of per-sample features.
        labels: Array of 0/1 targets (cast to ``int``), aligned with
            ``base_inputs``.
        episodes: Non-empty sequence of ``slice`` objects (with explicit
            ``start``/``stop``) into ``base_inputs``/``labels``.
        K: Number of most recent actions kept in the observation.

    Raises:
        ValueError: If ``episodes`` is empty.
    """
    metadata = {"render_modes": []}

    def __init__(self, base_inputs, labels, episodes, K=5):
        super().__init__()
        if len(episodes) == 0:
            # reset() cycles with a modulo over len(episodes); fail early and clearly.
            raise ValueError("SequentialPPOEnv requires at least one episode slice.")
        self.base_inputs = base_inputs
        self.labels = labels.astype(int)
        self.episodes = episodes
        self.K = K
        # Derive the base-feature width from the data rather than hard-coding it,
        # so the environment keeps working if the number of base inputs changes.
        n_base = int(np.shape(base_inputs)[1])
        self.obs_dim = n_base + 2 + K + 2
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32
        )
        self.action_space = spaces.Discrete(2)
        # Per-episode state; populated by reset().
        self._ep_idx = -1
        self._indices = None
        self._t = None
        self._last_actions = None
        self._cum_fp = None
        self._cum_fn = None
        self._prev_fn_rate = 0.0

    def _time_features(self, t, T):
        """Return ``[sin, cos]`` of the relative position ``t / (T - 1)``."""
        # max(..., 1) avoids division by zero for single-step episodes.
        pos = t / max(T - 1, 1)
        return np.array([np.sin(2 * np.pi * pos), np.cos(2 * np.pi * pos)], dtype=np.float32)

    def _obs(self):
        """Assemble the observation vector for the current step."""
        T = len(self._indices)
        cur_idx = self._indices[self._t]
        x = self.base_inputs[cur_idx]
        time_feat = self._time_features(self._t, T)
        lastK = self._last_actions.copy()
        counts = np.array([self._cum_fp, self._cum_fn], dtype=np.float32)
        obs = np.concatenate([x, time_feat, lastK, counts], axis=0)
        return obs.astype(np.float32)

    def reset(self, seed=None, options=None):
        """Advance to the next episode slice and return ``(observation, info)``."""
        super().reset(seed=seed)
        # Round-robin over the episode list.
        self._ep_idx = (self._ep_idx + 1) % len(self.episodes)
        sl = self.episodes[self._ep_idx]
        self._indices = np.arange(sl.start, sl.stop, dtype=int)
        self._t = 0
        self._last_actions = np.zeros(self.K, dtype=np.float32)
        self._cum_fp = 0.0
        self._cum_fn = 0.0
        self._prev_fn_rate = 0.0
        return self._obs(), {}

    def step(self, action):
        """Score ``action`` for the current sample and move to the next one.

        Returns:
            ``(obs, reward, terminated, truncated, info)``. ``truncated`` is
            always ``False``; when the episode ends the observation is an
            all-zero vector.
        """
        cur_i = self._indices[self._t]
        y = self.labels[cur_i]
        if action == y:
            reward = 2.0 if y == 1 else 1.0
        elif y == 1 and action == 0:
            # False negative: the costliest mistake.
            reward = -5.0
            self._cum_fn += 1.0
        else:
            # False positive.
            reward = -2.0
            self._cum_fp += 1.0
        reward -= 0.01  # constant per-step cost

        # Shaping bonus when the running false-negative rate improved.
        steps_so_far = float(self._t + 1)
        fn_rate = self._cum_fn / steps_so_far
        if fn_rate < self._prev_fn_rate:
            reward += 0.2
        self._prev_fn_rate = fn_rate

        # Slide the action window: drop the oldest entry, append the newest.
        self._last_actions = np.roll(self._last_actions, -1)
        self._last_actions[-1] = float(action)

        self._t += 1
        terminated = self._t >= len(self._indices)
        obs = np.zeros(self.obs_dim, dtype=np.float32) if terminated else self._obs()
        return obs, float(reward), terminated, False, {}

def train_with_csv_logger(env_fn, log_root, total_steps=TOTAL_STEPS_PPO):
    """Train a PPO agent while logging training metrics to CSV.

    A single-environment vectorised env is created from ``env_fn`` (with its
    monitor files under ``<log_root>/monitor``), a PPO ``MlpPolicy`` model is
    built from the module-level hyperparameters, and its logger is redirected
    to a CSV writer rooted at ``log_root``.

    Args:
        env_fn: Zero-argument callable that builds the environment.
        log_root: Directory for logger output; created when absent.
        total_steps: Number of timesteps passed to ``model.learn``.

    Returns:
        Tuple ``(model, env, df_prog, csv_path)`` where ``df_prog`` is
        ``<log_root>/progress.csv`` loaded as a DataFrame and ``csv_path`` is
        the path to that file.

    Raises:
        FileNotFoundError: If ``progress.csv`` is missing after training.
    """
    os.makedirs(log_root, exist_ok=True)
    monitor_dir = os.path.join(log_root, "monitor")
    env = make_vec_env(env_fn, n_envs=1, monitor_dir=monitor_dir)

    ppo_settings = dict(
        verbose=0,
        seed=42,
        n_steps=N_STEPS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        ent_coef=ENT_COEF,
        n_epochs=N_EPOCHS,
    )
    model = PPO("MlpPolicy", env, **ppo_settings)

    # Swap in a CSV-only logger so the metrics end up in progress.csv.
    model.set_logger(configure(log_root, ["csv"]))
    model.learn(total_timesteps=total_steps)

    csv_path = os.path.join(log_root, "progress.csv")
    if os.path.exists(csv_path):
        return model, env, pd.read_csv(csv_path), csv_path
    raise FileNotFoundError(f"Expected {csv_path} not found. Check logger configuration.")

def extract_table_from_progress(df_prog, vf_coef=1.0, ent_coef=1.0):
    """Build a compact, human-readable training table from an SB3 progress log.

    Selected logger columns are renamed to report-friendly headers and a
    derived "Total Loss" column is appended.

    Args:
        df_prog: DataFrame loaded from the logger's ``progress.csv``.
        vf_coef: Weight applied to "Value Loss" in "Total Loss".
        ent_coef: Weight applied to "Entropy Loss" in "Total Loss".

    Returns:
        DataFrame sharing ``df_prog``'s index with the columns "Time-steps",
        "Ep. Len", "Ep. Reward", "Policy Grad. Loss", "Value Loss",
        "Entropy Loss", "KL Divergence" and "Total Loss". Source columns that
        are absent from ``df_prog`` come out as all-NaN.

    Note:
        With the default weights "Total Loss" is the plain, unweighted sum of
        the three loss components. That is not the objective PPO optimises,
        which weights the value and entropy terms; pass the trainer's
        ``vf_coef`` / ``ent_coef`` to get a weighted total instead.
    """
    column_map = {
        "time/total_timesteps": "Time-steps",
        "rollout/ep_len_mean":  "Ep. Len",
        "rollout/ep_rew_mean":  "Ep. Reward",
        "train/policy_gradient_loss": "Policy Grad. Loss",
        "train/value_loss":           "Value Loss",
        "train/entropy_loss":         "Entropy Loss",
        "train/approx_kl":            "KL Divergence",
    }
    # Seed the output with the source index so that a missing column is
    # broadcast to a full-length NaN column, even when every column is missing.
    out = pd.DataFrame(index=df_prog.index)
    for source_name, display_name in column_map.items():
        if source_name in df_prog.columns:
            out[display_name] = df_prog[source_name]
        else:
            out[display_name] = np.nan

    # NaN in any component (e.g. the first logged row, before any update has
    # been recorded) propagates to NaN in the total.
    out["Total Loss"] = (
        out["Policy Grad. Loss"].astype(float)
        + vf_coef * out["Value Loss"].astype(float)
        + ent_coef * out["Entropy Loss"].astype(float)
    )

    return out


# --- Experiment 1: static PPO over the stacked base-learner scores -----------
static_log_root = "./logs_static_ppo"


def env_fn_static():
    """Factory for the static (single-pass) environment."""
    return PPOHybridEnv(ppo_input_static, ppo_labels_static)


(model_static,
 env_static,
 df_prog_static,
 path_csv_static) = train_with_csv_logger(env_fn_static, static_log_root)

# Persist the tidy table and show its first rows.
table_static = extract_table_from_progress(df_prog_static)
table_static.to_csv("ppo_training_log_static.csv", index=False)
print("\n=== STATIC PPO: Per-Iteration Training Log (head) ===")
print(table_static.head().to_string(index=False))


# --- Experiment 2: sequential PPO over the episode slices --------------------
seq_log_root = "./logs_sequential_ppo"


def env_fn_seq():
    """Factory for the sequential, episode-based environment."""
    return SequentialPPOEnv(base_inputs, labels_sorted, episodes, K=5)


(model_seq,
 env_seq,
 df_prog_seq,
 path_csv_seq) = train_with_csv_logger(env_fn_seq, seq_log_root)

# Persist the tidy table and show its first rows.
table_seq = extract_table_from_progress(df_prog_seq)
table_seq.to_csv("ppo_training_log_sequential.csv", index=False)
print("\n=== SEQUENTIAL PPO: Per-Iteration Training Log (head) ===")
print(table_seq.head().to_string(index=False))



=== STATIC PPO: Per-Iteration Training Log (head) ===
 Time-steps  Ep. Len  Ep. Reward  Policy Grad. Loss  Value Loss  Entropy Loss  KL Divergence  Total Loss
       2048      NaN         NaN                NaN         NaN           NaN            NaN         NaN
       4096   2281.0     -4055.0          -0.040131  771.707002     -0.683103       0.019554  770.983768
       6144   2281.0     -3497.0          -0.046810  406.110355     -0.620498       0.023908  405.443047
       8192   2281.0     -2829.0          -0.041010  110.765680     -0.476688       0.049052  110.247982
      10240   2281.0     -1973.0          -0.030479   53.573654     -0.282799       0.108313   53.260376

=== SEQUENTIAL PPO: Per-Iteration Training Log (head) ===
 Time-steps  Ep. Len  Ep. Reward  Policy Grad. Loss  Value Loss  Entropy Loss  KL Divergence  Total Loss
       2048     1.74     -1.2534                NaN         NaN           NaN            NaN         NaN
       4096     1.51     -0.7651          -0.1