# Addendum Cells — Mentor Feedback (Append-Only)

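> Paste these cells **after the last cell** of `01_train_decision_tree.ipynb`.
> They (1) reuse existing variables/paths if present and (2) otherwise fall back
> to sensible defaults. No prior cell needs to be edited; note, however, that when a
> date column is found, cell A1 rebinds `df` to a date-sorted frame with that column
> parsed as datetimes.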


In [None]:
# ## A0. Safe Imports and Paths (non-destructive)
import os, json, math, warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.inspection import permutation_importance

# Optional
try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

# Reuse paths/seed defined by earlier cells when present; otherwise set defaults.
# Earlier cells may have stored the paths as plain strings, so both are coerced
# to pathlib.Path: the cells below rely on `.mkdir()` and the `/` operator.
try:
    ARTIFACTS = Path(ARTIFACTS)
except NameError:
    ARTIFACTS = Path("artifacts")

try:
    FIGS = Path(FIGS)
except NameError:
    FIGS = Path("reports") / "figures"

# Create the output folders up front so later cells can write without checks.
ARTIFACTS.mkdir(parents=True, exist_ok=True)
FIGS.mkdir(parents=True, exist_ok=True)

# Seed shared by every stochastic estimator in the addendum.
try:
    RANDOM_STATE
except NameError:
    RANDOM_STATE = 42

print("Using ARTIFACTS ->", ARTIFACTS)
print("Using FIGS      ->", FIGS)


In [None]:
# ## A1. Bootstrap Data/Columns from prior cells if missing
# Reuse df, target, date_col and preprocessor when earlier cells defined them;
# otherwise load the data from a known path and infer the rest.
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

if 'df' not in globals():
    # Fallback: try common data paths from the repo
    CANDS = [
        Path("data") / "mess_waste_GIM_500.csv",
        Path("mess_waste_GIM_500.csv"),
        Path("data") / "mess_waste_GIM_daily_exams.csv",
    ]
    src = next((p for p in CANDS if p.exists()), None)
    if src is None:
        raise FileNotFoundError("Could not find data. Please set df manually or place CSV under data/.")
    df = pd.read_csv(src)

# Detect/confirm target and date/event columns
POSSIBLE_TARGETS = ["food_waste_kg", "waste_kg", "leftover_kg", "wastage"]
DATE_CANDIDATES = ["date", "day", "timestamp", "dt"]
EVENT_CANDIDATES = ["event", "event_type", "special_event"]

if 'target' not in globals() or target not in df.columns:
    target = next((c for c in POSSIBLE_TARGETS if c in df.columns), None)
    if target is None:
        raise KeyError(f"Set target variable. Expected one of {POSSIBLE_TARGETS} in df.")

if 'date_col' not in globals() or date_col not in df.columns:
    date_col = next((c for c in DATE_CANDIDATES if c in df.columns), None)

if date_col is not None:
    parsed_dates = pd.to_datetime(df[date_col], errors="coerce")
    if parsed_dates.notna().any():
        # Work on a copy so the frame object created by earlier cells is not
        # mutated in place; only the name `df` is rebound.
        df = df.copy()
        df[date_col] = parsed_dates
        # Stable sort keeps rows that share a date in their original order.
        df = df.sort_values(date_col, kind="mergesort").reset_index(drop=True)
    else:
        # Nothing parsed (e.g. the column holds weekday names): leave the column
        # untouched and let later cells take their "no date column" path.
        print(f"Column '{date_col}' holds no parseable dates; continuing without a date column.")
        date_col = None

event_col = next((c for c in EVENT_CANDIDATES if c in df.columns), None)

# Build/Reuse a robust preprocessor if none defined
if 'preprocessor' not in globals():
    # Columns are chosen by dtype when the transformer is fitted rather than
    # from a fixed list taken from `df` here, so numeric columns added later
    # (calendar and rolling features) are used instead of silently dropped.
    # Datetime columns match neither selector and are therefore excluded.
    numeric_steps = Pipeline([
        # Rolling features are NaN on the first rows; impute so that estimators
        # without native NaN support can still be fitted.
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler(with_mean=False)),
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_steps, make_column_selector(dtype_include=np.number)),
        ("cat", OneHotEncoder(handle_unknown="ignore"),
         make_column_selector(dtype_include=["object", "category"])),
    ])

print("Columns detected. target:", target, "| date_col:", date_col, "| event_col:", event_col)


In [None]:
# ## A2. Metrics & Utilities
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``; terms
    whose denominator is exactly zero contribute 0 to the mean.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    half_scale = (np.abs(actual) + np.abs(forecast)) / 2.0
    abs_error = np.abs(actual - forecast)
    # Pre-filled with zeros so entries skipped by `where` stay at 0.
    ratios = np.zeros_like(half_scale)
    np.divide(abs_error, half_scale, out=ratios, where=half_scale != 0)
    return 100.0 * np.mean(ratios)

def seasonal_naive(y_series, period=7):
    """Return the seasonal-naive forecast ``y_hat[t] = y[t - period]``.

    Args:
        y_series: Observations in time order.
        period: Seasonal lag in steps; must be at least 1.

    Returns:
        Float array shaped like the input. The first ``period`` entries are
        NaN because no observation exists one season earlier.

    Raises:
        ValueError: If ``period`` is smaller than 1 (a lag of 0 would return
            the observation itself as its own forecast).
    """
    if period < 1:
        raise ValueError(f"period must be >= 1, got {period!r}")
    y = np.asarray(y_series, dtype=float)
    y_hat = np.full(y.shape, np.nan, dtype=float)
    if period < len(y):
        # Vectorised shift: each value is copied `period` steps forward.
        y_hat[period:] = y[:-period]
    return y_hat

def interval_coverage(y_true, lo, hi):
    """Return the fraction of observations inside the closed interval [lo, hi]."""
    observed = np.asarray(y_true)
    at_or_above_lower = observed >= lo
    at_or_below_upper = observed <= hi
    inside = at_or_above_lower & at_or_below_upper
    return float(np.mean(inside))

def waste_shortage(prepared, actual):
    """Split the prepared-vs-actual gap into its two non-negative parts.

    Returns:
        ``(waste, shortage)`` where waste is the element-wise excess of
        ``prepared`` over ``actual`` and shortage is the excess of ``actual``
        over ``prepared``; each is 0 where the other is positive.
    """
    made = np.asarray(prepared)
    needed = np.asarray(actual)
    surplus = np.maximum(0.0, made - needed)
    deficit = np.maximum(0.0, needed - made)
    return surplus, deficit


In [None]:
# ## A3. Rolling-Origin Backtest (RF) vs Seasonal-Naive
# Expanding window, H=1 day. Saves overall and monthly MAE/SMAPE.
from sklearn.base import clone


def _pct_improvement(model_err, baseline_err):
    """Return the % reduction of model_err relative to baseline_err (NaN if the baseline is 0)."""
    if baseline_err == 0:
        return float("nan")
    return 100.0 * (1 - model_err / baseline_err)


# Feature engineering (light) — non-destructive to df
df_fe = df.copy()
if date_col is not None:
    d = df_fe[date_col].dt
    df_fe["dow"] = d.dayofweek
    df_fe["month"] = d.month
    df_fe["is_weekend"] = (d.dayofweek >= 5).astype(int)
    for win in (3, 7):
        # shift(1) makes each row see only earlier targets (no look-ahead).
        df_fe[f"roll_mean_{win}"] = df_fe[target].rolling(win, min_periods=1).mean().shift(1)
        df_fe[f"roll_std_{win}"]  = df_fe[target].rolling(win, min_periods=1).std().shift(1)
# NOTE(review): whether the columns added above reach the model depends on
# `preprocessor`; a ColumnTransformer with fixed column lists and the default
# remainder="drop" ignores columns it was not told about — verify.

y = df_fe[target].astype(float).values
X = df_fe.drop(columns=[target])

rf_bt = RandomForestRegressor(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)
# Pipeline.fit fits its steps in place, so a clone is used to keep the shared
# `preprocessor` object (possibly owned by earlier cells) from being refitted.
rf_pipe = Pipeline([("prep", clone(preprocessor)), ("rf", rf_bt)])

y_naive = seasonal_naive(y, period=7)
preds_rf = np.full_like(y, np.nan, dtype=float)

start = 14  # two-week warmup
for t in range(start, len(df_fe)-1):
    # Train on rows 0..t, predict row t+1 (one-step-ahead, expanding window).
    train_idx = slice(0, t+1)
    test_idx = t+1
    rf_pipe.fit(X.iloc[train_idx], y[train_idx])
    preds_rf[test_idx] = rf_pipe.predict(X.iloc[[test_idx]])[0]

# Score only rows where both the model and the baseline produced a forecast.
mask = ~np.isnan(preds_rf) & ~np.isnan(y_naive)
if not mask.any():
    raise ValueError(
        f"Backtest produced no scored rows: need more than {start + 1} observations, got {len(df_fe)}."
    )
overall = {
    "RF_MAE": float(mean_absolute_error(y[mask], preds_rf[mask])),
    "RF_SMAPE": float(smape(y[mask], preds_rf[mask])),
    "Naive_MAE": float(mean_absolute_error(y[mask], y_naive[mask])),
    "Naive_SMAPE": float(smape(y[mask], y_naive[mask])),
}
# A perfect baseline (error 0) would otherwise raise ZeroDivisionError.
overall["Improvement_MAE_%"] = _pct_improvement(overall["RF_MAE"], overall["Naive_MAE"])
overall["Improvement_SMAPE_%"] = _pct_improvement(overall["RF_SMAPE"], overall["Naive_SMAPE"])

months = (df_fe[date_col].dt.to_period("M").astype(str).values
          if date_col is not None else np.array(["ALL"]*len(df_fe)))

bt_df = pd.DataFrame({"month": months, "y": y, "rf": preds_rf, "naive": y_naive}).dropna()

monthly_rows = [
    {
        "month": m,
        "RF_MAE": float(mean_absolute_error(g["y"], g["rf"])),
        "RF_SMAPE": float(smape(g["y"], g["rf"])),
        "Naive_MAE": float(mean_absolute_error(g["y"], g["naive"])),
        "Naive_SMAPE": float(smape(g["y"], g["naive"])),
    }
    for m, g in bt_df.groupby("month")
]
monthly_df = pd.DataFrame(monthly_rows).sort_values("month")

pd.DataFrame([overall]).to_csv(ARTIFACTS / "backtest_overall.csv", index=False)
monthly_df.to_csv(ARTIFACTS / "backtest_monthly.csv", index=False)
print("Saved:", ARTIFACTS / "backtest_overall.csv")
print("Saved:", ARTIFACTS / "backtest_monthly.csv")


In [None]:
# ## A4. Quantile Regression Decision Layer (0.1/0.5/0.9) + Coverage + Policy
from sklearn.base import clone

QUANTILES = (0.1, 0.5, 0.9)

# One pipeline per quantile. Each gets its own clone of the preprocessor:
# Pipeline.fit fits steps in place, so sharing one instance would refit the
# same object (possibly owned by earlier cells) on every call.
q_models = {
    q: Pipeline([
        ("prep", clone(preprocessor)),
        ("gbr", GradientBoostingRegressor(loss="quantile", alpha=q, random_state=RANDOM_STATE,
                                          n_estimators=400, max_depth=3)),
    ])
    for q in QUANTILES
}

q_lo = np.full_like(y, np.nan, dtype=float)
q_md = np.full_like(y, np.nan, dtype=float)
q_hi = np.full_like(y, np.nan, dtype=float)

# Same expanding-window, one-step-ahead scheme as the RF backtest.
for t in range(start, len(df_fe)-1):
    train_idx = slice(0, t+1)
    test_idx = t+1
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_next = X.iloc[[test_idx]]
    raw = [q_models[q].fit(X_train, y_train).predict(X_next)[0] for q in QUANTILES]
    # The three models are fitted independently, so their predictions can
    # cross; sorting restores lo <= median <= hi.
    q_lo[test_idx], q_md[test_idx], q_hi[test_idx] = np.sort(raw)

mask_q = ~np.isnan(q_lo) & ~np.isnan(q_md) & ~np.isnan(q_hi)
# The key/variable name "coverage90" is kept for compatibility with existing
# readers of this file, but the q0.1–q0.9 band has a nominal coverage of 80%,
# which is why the nominal level is written alongside the empirical one.
coverage90 = interval_coverage(y[mask_q], q_lo[mask_q], q_hi[mask_q])
with open(ARTIFACTS / "interval_coverage.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "coverage90": coverage90,
            "lower_quantile": QUANTILES[0],
            "upper_quantile": QUANTILES[-1],
            "nominal_coverage": round(QUANTILES[-1] - QUANTILES[0], 6),
        },
        f,
        indent=2,
    )
print("Saved:", ARTIFACTS / "interval_coverage.json")

# Policy simulation across lambda in [0,1]:
# prepared = median + lambda * (upper - median), scored on backtested rows only.
lams = np.linspace(0, 1, 6)
y_eval = y[mask_q]
median_eval = q_md[mask_q]
headroom = q_hi[mask_q] - median_eval
mean_demand = np.mean(y_eval)
policy = []
for lam in lams:
    prepared_eval = median_eval + lam * headroom
    waste, shortage = waste_shortage(prepared_eval, y_eval)
    policy.append({
        "lambda": float(lam),
        "avg_waste": float(np.mean(waste)),
        "avg_shortage": float(np.mean(shortage)),
        # 1e-8 guards against a zero mean demand.
        "efficiency_%": float(100.0 * (1 - (np.mean(waste) / (mean_demand + 1e-8))))
    })
pd.DataFrame(policy).to_csv(ARTIFACTS / "policy_simulation.csv", index=False)
print("Saved:", ARTIFACTS / "policy_simulation.csv")


In [None]:
# ## A5. Event Effects + Hierarchical Pooling Fallback
evt = event_col if event_col in df_fe.columns else None
if evt is None:
    df_fe["event_synth"] = "Normal"
    evt = "event_synth"

# Days without an event label are treated as "Normal" (the same label the
# synthetic fallback uses) instead of being dropped from the statistics.
# Casting to object first allows a new label on categorical columns.
event_labels = df_fe[evt].astype(object).where(df_fe[evt].notna(), "Normal")

# Prefer the rolling-origin RF predictions when they exist and line up with y;
# otherwise fall back to the median quantile forecast.
event_preds = globals().get("preds_rf")
if event_preds is None or len(event_preds) != len(y):
    event_preds = q_md

# Built from plain arrays so rows are matched by position, not by index label.
bt_ev = pd.DataFrame({
    "date": df_fe[date_col].to_numpy() if date_col is not None else np.arange(len(df_fe)),
    "y": y,
    "rf": pd.Series(np.asarray(event_preds, dtype=float)).ffill().to_numpy(),
    "event": event_labels.to_numpy(),
}).dropna(subset=["y", "rf"])  # only rows with both an actual and a prediction can be scored

bt_ev["resid"] = bt_ev["y"] - bt_ev["rf"]
evt_stats = (
    bt_ev.groupby("event")["resid"]
    .agg(["mean", "std", "count"])
    .reset_index()
    .rename(columns={"mean": "resid_mean", "std": "resid_std", "count": "n"})
)
evt_stats["global_mean"] = bt_ev["resid"].mean()

# Partial pooling: each event mean is pulled toward the global mean with weight
# k / (n + k), so events with few observations are shrunk the most.
k = max(1.0, evt_stats["n"].median())
evt_stats["shrink_mean"] = (evt_stats["n"]/(evt_stats["n"]+k))*evt_stats["resid_mean"] + (k/(evt_stats["n"]+k))*evt_stats["global_mean"]
evt_stats.to_csv(ARTIFACTS / "event_residuals.csv", index=False)
print("Saved:", ARTIFACTS / "event_residuals.csv")


In [None]:
# ## A6. Explainability (SHAP if available, else Permutation) + Top Errors
from sklearn.base import clone

# A clone keeps the shared `preprocessor` object from being refitted in place.
rf_full = Pipeline([
    ("prep", clone(preprocessor)),
    ("rf", RandomForestRegressor(n_estimators=600, random_state=RANDOM_STATE, n_jobs=-1)),
])
rf_full.fit(X, y)
prep_fitted = rf_full.named_steps["prep"]
n_model_features = rf_full.named_steps["rf"].n_features_in_

# Names of the TRANSFORMED features (what the forest sees). This assumes a
# [numeric, "cat" one-hot] layout; anything else falls back to generic names.
try:
    feat_names = list(prep_fitted.transformers_[0][2])  # numeric
    ohe = prep_fitted.named_transformers_["cat"]
    base = prep_fitted.transformers_[1][2]
    feat_names += ohe.get_feature_names_out(base).tolist()
except Exception:
    feat_names = []
if len(feat_names) != n_model_features:
    feat_names = [f"f{i}" for i in range(n_model_features)]

importance_df = None
if SHAP_AVAILABLE:
    try:
        explainer = shap.TreeExplainer(rf_full.named_steps["rf"])
        # Reuse the fitted preprocessor rather than refitting it.
        X_tr = prep_fitted.transform(X)
        if hasattr(X_tr, "toarray"):
            # The column transformer may return a sparse matrix; densify it
            # before handing it to SHAP.
            X_tr = X_tr.toarray()
        # At most 500 evenly spaced rows to bound SHAP runtime.
        idx = np.linspace(0, X_tr.shape[0]-1, min(500, X_tr.shape[0])).astype(int)
        sv = explainer.shap_values(X_tr[idx])
        shap_abs = np.mean(np.abs(sv), axis=0)
        importance_df = pd.DataFrame({"feature": feat_names, "importance": shap_abs}).sort_values("importance", ascending=False)
    except Exception as e:
        print("SHAP error; falling back to permutation:", e)

if importance_df is None:
    # The pipeline is permuted on the RAW columns of X, so the importances are
    # one per input column and must be labelled with X.columns, not with the
    # transformed feature names (their lengths generally differ).
    perm = permutation_importance(rf_full, X, y, n_repeats=5, random_state=RANDOM_STATE, n_jobs=-1)
    importance_df = pd.DataFrame({"feature": list(X.columns), "importance": perm.importances_mean}).sort_values("importance", ascending=False)

importance_df.to_csv(ARTIFACTS / "feature_importance.csv", index=False)
print("Saved:", ARTIFACTS / "feature_importance.csv")

# Top mispredictions using available predictions (prefer rolling-origin rf if present, else median quantiles)
rolling_rf = globals().get("preds_rf")
if rolling_rf is not None and len(rolling_rf) == len(y):
    best_pred = np.asarray(rolling_rf, dtype=float)
else:
    best_pred = q_md
pred_series = pd.Series(best_pred, name="pred").ffill()
err_df = pd.DataFrame({
    "date": df_fe[date_col].to_numpy() if date_col is not None else np.arange(len(df_fe)),
    "y": y,
    "pred": pred_series.values
}).dropna()
err_df["abs_err"] = np.abs(err_df["y"] - err_df["pred"])
TOPK = 15
# Skip columns err_df already carries so the merge does not emit _x/_y twins.
context_cols = [c for c in df_fe.columns if c != target and c not in err_df.columns]
# err_df is indexed by row position, so the context frame is re-indexed the same way.
context = df_fe[context_cols].reset_index(drop=True)
top_err = err_df.sort_values("abs_err", ascending=False).head(TOPK).merge(context, left_index=True, right_index=True, how="left")
top_err.to_csv(ARTIFACTS / "top_mispredictions.csv", index=False)
print("Saved:", ARTIFACTS / "top_mispredictions.csv")


In [None]:
# ## A7. Plots for Overleaf
# Each section is best-effort: a failure is reported and the next plot still runs.
import matplotlib.pyplot as plt


def _finish_figure(filename):
    """Save the current figure under FIGS, display it, release it, and return its path."""
    out_path = FIGS / filename
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.show()
    plt.close()  # free the figure so repeated runs do not accumulate open figures
    print("Saved:", out_path)
    return out_path


def _plot_monthly_metric(monthly, metric, filename):
    """Plot the RF and Naive monthly series of one metric ("MAE" or "SMAPE")."""
    plt.figure(figsize=(8,4))
    plt.plot(monthly["month"], monthly[f"RF_{metric}"], marker='o', label=f"RF {metric}")
    plt.plot(monthly["month"], monthly[f"Naive_{metric}"], marker='o', label=f"Naive {metric}")
    plt.xticks(rotation=45, ha='right')
    plt.title(f"Monthly {metric} — Rolling-Origin Backtest")
    plt.legend()
    return _finish_figure(filename)


# Monthly MAE/SMAPE
try:
    monthly_df = pd.read_csv(ARTIFACTS / "backtest_monthly.csv")
    if not monthly_df.empty:
        p1 = _plot_monthly_metric(monthly_df, "MAE", "monthly_mae.png")
        p2 = _plot_monthly_metric(monthly_df, "SMAPE", "monthly_smape.png")
except Exception as e:
    print("Monthly plots skipped:", e)

# Quantile interval band (last 60 preds if available)
try:
    if 'q_lo' in globals() and 'q_md' in globals() and 'q_hi' in globals():
        mask_q = ~np.isnan(q_lo) & ~np.isnan(q_md) & ~np.isnan(q_hi)
        idxs = np.where(mask_q)[0]
        if len(idxs) > 0:
            tail = idxs[-min(60, len(idxs)):]  # last 60
            plt.figure(figsize=(9,4))
            # The band spans the 0.1 and 0.9 quantiles, i.e. a nominal 80% interval.
            plt.fill_between(tail, q_lo[tail], q_hi[tail], alpha=0.3, label="80% PI (q0.1–q0.9)")
            plt.plot(tail, y[tail], label="Actual")
            plt.plot(tail, q_md[tail], label="Median")
            plt.title("Quantile Intervals (last window)")
            plt.legend()
            p3 = _finish_figure("quantile_intervals.png")
except Exception as e:
    print("Quantile plot skipped:", e)

# Feature importance bar (top 20)
try:
    # Reversed so the most important feature is drawn at the top of the barh chart.
    imp = pd.read_csv(ARTIFACTS / "feature_importance.csv").head(20).iloc[::-1]
    plt.figure(figsize=(7,6))
    plt.barh(imp["feature"], imp["importance"])
    plt.title("Top Feature Importances")
    p4 = _finish_figure("feature_importance.png")
except Exception as e:
    print("Importance plot skipped:", e)


In [None]:
# ## A8. Outputs Index
# Print the locations of the artifacts and figures the addendum cells write,
# as a JSON object (tables first, then figures).
_artifact_files = [
    ("backtest_overall", "backtest_overall.csv"),
    ("backtest_monthly", "backtest_monthly.csv"),
    ("interval_coverage", "interval_coverage.json"),
    ("policy_simulation", "policy_simulation.csv"),
    ("event_residuals", "event_residuals.csv"),
    ("feature_importance", "feature_importance.csv"),
    ("top_mispredictions", "top_mispredictions.csv"),
]
_figure_files = [
    ("fig_monthly_mae", "monthly_mae.png"),
    ("fig_monthly_smape", "monthly_smape.png"),
    ("fig_quantile_intervals", "quantile_intervals.png"),
    ("fig_feature_importance", "feature_importance.png"),
]
index = {key: str(ARTIFACTS / fname) for key, fname in _artifact_files}
index.update({key: str(FIGS / fname) for key, fname in _figure_files})
print(json.dumps(index, indent=2))
