# Food-Waste Forecasting — MASTER Notebook

This single notebook consolidates:
- **Baseline (Original)**: Simple train/test split with a **Decision Tree Regressor**, baseline metrics and saved artifacts.
- **Upgrades**: Robust preprocessing, feature engineering, Random Forest / (optional) XGBoost.
- **Mentor Feedback Additions**: Rolling-origin backtests vs seasonal-naive, quantile regression decision layer with interval coverage, event-effects modeling with hierarchical pooling fallback, SHAP/permutation importances, error analysis, policy simulation, and Overleaf-ready figures.

> This merges prior `01_train_decision_tree.ipynb`, `02_model_upgrade.ipynb`, and `03_mentor_feedback.ipynb` functionality into **one** end-to-end notebook.
> It does **not** overwrite any existing repo files by default; it only **adds** artifacts/figures.


In [1]:
# ## 0. Setup

import os, json, warnings, math
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.inspection import permutation_importance

# Optional
try:
    from xgboost import XGBRegressor
    XGB_AVAILABLE = True
except Exception:
    XGB_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

# Seed passed as `random_state` to every estimator and random split below.
RANDOM_STATE = 42

# Output folders are resolved against the current working directory.
ROOT = Path.cwd()
ARTIFACTS = ROOT.joinpath("artifacts")
FIGS = ROOT.joinpath("reports", "figures")

# Create both folders up front; existing folders are left untouched.
for _out_dir in (ARTIFACTS, FIGS):
    _out_dir.mkdir(parents=True, exist_ok=True)

for _label, _location in (("Artifacts ->", ARTIFACTS), ("Figures   ->", FIGS)):
    print(_label, _location)


Artifacts -> /content/artifacts
Figures   -> /content/reports/figures


In [2]:
# ## 1. Load Data

# CSV locations probed in order; add or edit entries to point at your own file.
CANDIDATE_PATHS = [
    ROOT.joinpath("data", "mess_waste_GIM_500.csv"),
    ROOT.joinpath("mess_waste_GIM_500.csv"),
    ROOT.joinpath("data", "mess_waste_GIM_daily_exams.csv"),
]

def find_first(paths):
    """Return the first entry of *paths* whose ``exists()`` is true, else ``None``.

    Candidates are probed in the order given, so earlier entries take priority.
    """
    return next((candidate for candidate in paths if candidate.exists()), None)

# Resolve the dataset location; fail loudly when none of the candidates exists.
DATA_PATH = find_first(CANDIDATE_PATHS)
if DATA_PATH is None:
    raise FileNotFoundError("Data not found. Update CANDIDATE_PATHS above to your CSV.")

df = pd.read_csv(DATA_PATH)
print("Using data:", DATA_PATH)
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df.info()


FileNotFoundError: Data not found. Update CANDIDATE_PATHS above to your CSV.

In [None]:
# ## 2. Column Detection (Date / Target / Events)

POSSIBLE_TARGETS = ["food_waste_kg", "waste_kg", "leftover_kg", "wastage"]
DATE_CANDIDATES = ["date", "day", "timestamp", "dt"]
EVENT_CANDIDATES = ["event", "event_type", "special_event"]

# For each role, the first candidate name present in the frame wins (None if absent).
target = next((c for c in POSSIBLE_TARGETS if c in df.columns), None)
date_col = next((c for c in DATE_CANDIDATES if c in df.columns), None)
event_col = next((c for c in EVENT_CANDIDATES if c in df.columns), None)

if target is None:
    raise KeyError(f"Please set your target column name to one of: {POSSIBLE_TARGETS}")

if date_col is not None:
    # Parse into a temporary first: errors="coerce" maps anything unparseable to
    # NaT, so a column that merely has a date-like name (e.g. "day" holding
    # weekday names) would otherwise be silently overwritten with all-NaT.
    parsed_dates = pd.to_datetime(df[date_col], errors="coerce")
    if parsed_dates.notna().any():
        n_unparsed = int(parsed_dates.isna().sum())
        if n_unparsed:
            print(f"Warning: {n_unparsed} value(s) in '{date_col}' could not be parsed as dates (set to NaT).")
        df[date_col] = parsed_dates
        # Chronological order is what the time-aware splits below rely on.
        df = df.sort_values(date_col).reset_index(drop=True)
    else:
        # Nothing parsed: keep the column as an ordinary feature and fall back
        # to the no-date code paths.
        print(f"Warning: column '{date_col}' contains no parseable dates; treating the data as undated.")
        date_col = None

print("Detected -> Target:", target, "| Date:", date_col, "| Event:", event_col)


# ## 3. Baseline (Original-equivalent): Decision Tree on Simple Split

This section preserves the **original spirit** of your first notebook:
- Minimal preprocessing
- **Train/Test split** (random 80/20 if there is no date column; otherwise chronological, with the last 20% of rows held out as the test set)
- **DecisionTreeRegressor** as baseline
- Save baseline metrics to `artifacts/baseline_metrics.json`

> This is a **non-destructive** baseline to keep continuity with the original GitHub notebook.


In [None]:
# Baseline inputs: every column except the target is a candidate feature.
import numpy as np

X = df.drop(columns=[target])
y = df[target].astype(float)

# Partition the features by dtype; the raw date column is kept out of both lists.
num_cols = [c for c in X.select_dtypes(include=[np.number]).columns if c != date_col]
cat_cols = [c for c in X.select_dtypes(include=["object", "category"]).columns if c != date_col]

# Minimal preprocessing: numeric columns untouched, categoricals one-hot encoded.
preprocess_min = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Without a date column use a seeded random 80/20 split; with one, train on the
# first 80% of rows and test on the remainder.
if date_col is None:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
else:
    split_idx = int(len(df) * 0.8)
    X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
    X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

baseline_pipe = Pipeline(
    steps=[
        ("prep", preprocess_min),
        ("dt", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ]
)
baseline_pipe.fit(X_train, y_train)
pred_b = baseline_pipe.predict(X_test)

def eval_metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted target values, passed straight to the
        scikit-learn metric functions.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "R2": ...}`` with plain Python floats, so
        the result can be written with ``json.dump``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
    # TypeError. For a single-output target the two forms are identical.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}

# Score the baseline on the held-out rows and persist the result as JSON.
baseline_metrics = eval_metrics(y_test, pred_b)

baseline_metrics_path = ARTIFACTS / "baseline_metrics.json"
with open(baseline_metrics_path, "w") as f:
    f.write(json.dumps(baseline_metrics, indent=2))

print("Baseline metrics:", baseline_metrics)
print("Saved ->", baseline_metrics_path)


# ## 4. Upgrades: Robust Preprocessing + Stronger Models

We extend with:
- `ColumnTransformer` (scaler + one-hot)
- Time features if a date exists (day of week, month, weekend flag; rolling mean and standard deviation of the target over 3- and 7-row windows, lagged by one row)
- Models: Decision Tree, Random Forest, (optional) XGBoost
- Save metrics to `artifacts/metrics_upgrade.json`


In [None]:
# Calendar and lagged rolling-window features, added only when a date column exists.
df_fe = df.copy()
if date_col is not None:
    d = df_fe[date_col].dt
    weekday = d.dayofweek
    df_fe["dow"] = weekday
    df_fe["month"] = d.month
    df_fe["is_weekend"] = (weekday >= 5).astype(int)
    for win in (3, 7):
        target_window = df_fe[target].rolling(win, min_periods=1)
        # shift(1) moves each statistic down one row, so a row only sees
        # target values from rows before it.
        df_fe[f"roll_mean_{win}"] = target_window.mean().shift(1)
        df_fe[f"roll_std_{win}"] = target_window.std().shift(1)

# Feature frame and float target for the upgraded models.
X2 = df_fe.drop(columns=[target])
y2 = df_fe[target].astype(float)

# Split the engineered features by dtype; the raw date column is excluded from both lists.
num_cols2 = X2.select_dtypes(include=[np.number]).columns.tolist()
cat_cols2 = X2.select_dtypes(include=["object", "category"]).columns.tolist()
if date_col in num_cols2: num_cols2.remove(date_col)
if date_col in cat_cols2: cat_cols2.remove(date_col)

from sklearn.impute import SimpleImputer

# Numeric branch: median-impute, then scale.
# The lagged rolling features are NaN on the first rows (shift(1), and the
# standard deviation of a single observation), and StandardScaler passes NaN
# through unchanged. GradientBoostingRegressor rejects NaN inputs, and
# RandomForestRegressor only accepts them from scikit-learn 1.4 onwards, so the
# gaps are filled here. The imputer is fitted inside each pipeline fit, so its
# medians come from that fit's training rows only.
numeric_branch = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler(with_mean=False)),
])

preprocessor = ColumnTransformer([
    ("num", numeric_branch, num_cols2),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols2),
])

# Seeded random 80/20 split when undated; otherwise hold out the last 20% of rows.
if date_col is None:
    Xtr, Xte, ytr, yte = train_test_split(X2, y2, test_size=0.2, random_state=RANDOM_STATE)
else:
    split_idx = int(len(df_fe) * 0.8)
    Xtr, ytr = X2.iloc[:split_idx], y2.iloc[:split_idx]
    Xte, yte = X2.iloc[split_idx:], y2.iloc[split_idx:]

# Candidate estimators; XGBoost joins only if its import succeeded.
models = {}
models["decision_tree"] = DecisionTreeRegressor(random_state=RANDOM_STATE)
models["random_forest"] = RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
if XGB_AVAILABLE:
    xgb_params = {
        "n_estimators": 600,
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": RANDOM_STATE,
        "objective": "reg:squarederror",
    }
    models["xgb"] = XGBRegressor(**xgb_params)

# Fit each estimator behind the shared preprocessor and score it on the hold-out rows.
fitted = {}
metrics_upgrade = {}
for name, est in models.items():
    fitted[name] = Pipeline([("prep", preprocessor), ("model", est)]).fit(Xtr, ytr)
    metrics_upgrade[name] = eval_metrics(yte, fitted[name].predict(Xte))

metrics_upgrade_path = ARTIFACTS / "metrics_upgrade.json"
with open(metrics_upgrade_path, "w") as f:
    f.write(json.dumps(metrics_upgrade, indent=2))

print("Saved ->", metrics_upgrade_path)
metrics_upgrade


# ## 5. Mentor Feedback: Rolling-Origin vs Seasonal-Naive, Quantiles, Events, Explainability

This block implements the mentor’s requests while keeping prior results intact.
Outputs are saved to `artifacts/` and `reports/figures/` for Overleaf.


In [None]:
# ----- Helpers -----
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``; a term
    whose denominator is zero contributes 0 instead of dividing by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    scale = (np.abs(actual) + np.abs(forecast)) / 2.0
    abs_err = np.abs(actual - forecast)
    # Entries where `scale` is zero keep the 0.0 pre-filled in `out`.
    ratio = np.divide(abs_err, scale, out=np.zeros_like(scale), where=scale != 0)
    return 100.0 * np.mean(ratio)

def seasonal_naive(y_series, period=7):
    """Forecast each position with the value observed ``period`` steps earlier.

    Returns a float array of the same shape as the input; the first ``period``
    positions, which have no earlier observation to copy, are NaN.
    """
    series = np.asarray(y_series)
    n_obs = len(series)
    forecast = np.full(series.shape, np.nan)
    # Walk source and destination positions in lockstep: dst = src + period.
    for src, dst in zip(range(0, n_obs - period), range(period, n_obs)):
        forecast[dst] = series[src]
    return forecast

def interval_coverage(y_true, lo, hi):
    """Fraction of observations inside the closed interval ``[lo, hi]``, as a float."""
    observed = np.asarray(y_true)
    above_lower = np.greater_equal(observed, lo)
    below_upper = np.less_equal(observed, hi)
    inside = above_lower & below_upper
    return float(np.mean(inside))

def waste_shortage(prepared, actual):
    """Split the prepared-vs-actual gap into its two one-sided parts.

    Returns ``(waste, shortage)``: ``waste`` is the amount by which ``prepared``
    exceeds ``actual`` and ``shortage`` the amount by which ``actual`` exceeds
    ``prepared``, each floored at zero element-wise.
    """
    served, consumed = np.asarray(prepared), np.asarray(actual)
    surplus = served - consumed
    deficit = consumed - served
    return np.maximum(0.0, surplus), np.maximum(0.0, deficit)

# ----- Backtest RF vs Seasonal-Naive (expanding window, H=1) -----
def _pct_improvement(model_err, baseline_err):
    """Percent reduction of `model_err` relative to `baseline_err` (NaN when the baseline error is 0)."""
    if baseline_err == 0:
        # A perfect baseline (e.g. a constant series) leaves nothing to improve
        # on; report NaN instead of raising ZeroDivisionError.
        return float("nan")
    return 100.0 * (1 - model_err / baseline_err)

rf_bt = RandomForestRegressor(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)
rf_bt_pipe = Pipeline([("prep", preprocessor), ("rf", rf_bt)])

y_array = y2.values.astype(float)
y_naive = seasonal_naive(y_array, period=7)
preds_rf = np.full_like(y_array, np.nan, dtype=float)

start = 14  # 2-week warmup
# Expanding window: refit on rows 0..t, then predict the single next row t+1.
for t in range(start, len(df_fe)-1):
    train_idx = slice(0, t+1)
    test_idx = t+1
    rf_bt_pipe.fit(X2.iloc[train_idx], y_array[train_idx])
    preds_rf[test_idx] = rf_bt_pipe.predict(X2.iloc[[test_idx]])[0]

# Score only positions where both the RF and the naive forecast exist.
mask = ~np.isnan(preds_rf) & ~np.isnan(y_naive)
overall = {
    "RF_MAE": float(mean_absolute_error(y_array[mask], preds_rf[mask])),
    "RF_SMAPE": float(smape(y_array[mask], preds_rf[mask])),
    "Naive_MAE": float(mean_absolute_error(y_array[mask], y_naive[mask])),
    "Naive_SMAPE": float(smape(y_array[mask], y_naive[mask])),
}
overall["Improvement_MAE_%"] = _pct_improvement(overall["RF_MAE"], overall["Naive_MAE"])
overall["Improvement_SMAPE_%"] = _pct_improvement(overall["RF_SMAPE"], overall["Naive_SMAPE"])

# Monthly breakdown; without a date column everything falls into one "ALL" bucket.
months = df_fe[date_col].dt.to_period("M").astype(str).values if date_col is not None else np.array(["ALL"]*len(df_fe))
bt_df = pd.DataFrame({"month": months, "y": y_array, "rf": preds_rf, "naive": y_naive}).dropna()

monthly_rows = []
for m, g in bt_df.groupby("month"):
    monthly_rows.append({
        "month": m,
        "RF_MAE": float(mean_absolute_error(g["y"], g["rf"])),
        "RF_SMAPE": float(smape(g["y"], g["rf"])),
        "Naive_MAE": float(mean_absolute_error(g["y"], g["naive"])),
        "Naive_SMAPE": float(smape(g["y"], g["naive"])),
    })
monthly_df = pd.DataFrame(monthly_rows).sort_values("month")

pd.DataFrame([overall]).to_csv(ARTIFACTS / "backtest_overall.csv", index=False)
monthly_df.to_csv(ARTIFACTS / "backtest_monthly.csv", index=False)

print("Saved ->", ARTIFACTS / "backtest_overall.csv")
print("Saved ->", ARTIFACTS / "backtest_monthly.csv")

# ----- Quantile Regression Decision Layer (0.1/0.5/0.9) -----
# One gradient-boosting model per quantile level, all behind the shared preprocessor.
q_levels = (0.1, 0.5, 0.9)
q_models = {}
for q in q_levels:
    q_models[q] = Pipeline([
        ("prep", preprocessor),
        ("gbr", GradientBoostingRegressor(loss="quantile", alpha=q, random_state=RANDOM_STATE, n_estimators=400, max_depth=3))
    ])

q_lo = np.full_like(y_array, np.nan, dtype=float)
q_md = np.full_like(y_array, np.nan, dtype=float)
q_hi = np.full_like(y_array, np.nan, dtype=float)

# Same expanding-window scheme as the RF backtest: fit on rows 0..t, predict row t+1.
for t in range(start, len(df_fe)-1):
    train_idx = slice(0, t+1)
    test_idx = t+1
    for q in q_levels:
        q_models[q].fit(X2.iloc[train_idx], y_array[train_idx])
    q_lo[test_idx] = q_models[0.1].predict(X2.iloc[[test_idx]])[0]
    q_md[test_idx] = q_models[0.5].predict(X2.iloc[[test_idx]])[0]
    q_hi[test_idx] = q_models[0.9].predict(X2.iloc[[test_idx]])[0]

mask_q = ~np.isnan(q_lo) & ~np.isnan(q_md) & ~np.isnan(q_hi)
# Empirical share of actuals inside [q0.1, q0.9]. That band is nominally an
# 80% interval (0.9 - 0.1). The "coverage90" name and JSON key are kept so
# existing readers of the file keep working, and the nominal level and the
# quantiles are stored next to it so the number is compared with the right target.
coverage90 = interval_coverage(y_array[mask_q], q_lo[mask_q], q_hi[mask_q])
coverage_report = {
    "coverage90": coverage90,
    "nominal_coverage": 0.8,
    "lower_quantile": 0.1,
    "upper_quantile": 0.9,
}
with open(ARTIFACTS / "interval_coverage.json", "w") as f:
    json.dump(coverage_report, f, indent=2)
print("Saved ->", ARTIFACTS / "interval_coverage.json")

# Policy simulation: prepare median + lambda * (upper quantile - median) for
# six evenly spaced lambda values in [0, 1], scored on the backtested rows.
lams = np.linspace(0, 1, 6)
policy = []
for lam in lams:
    prepared = q_md.copy()
    prepared[mask_q] = q_md[mask_q] + lam * (q_hi[mask_q] - q_md[mask_q])
    waste, shortage = waste_shortage(prepared[mask_q], y_array[mask_q])
    mean_waste = np.mean(waste)
    mean_actual = np.mean(y_array[mask_q])
    row = {}
    row["lambda"] = float(lam)
    row["avg_waste"] = float(mean_waste)
    row["avg_shortage"] = float(np.mean(shortage))
    # The 1e-8 in the denominator keeps the ratio finite when the mean actual is zero.
    row["efficiency_%"] = float(100.0 * (1 - (mean_waste / (mean_actual + 1e-8))))
    policy.append(row)

policy_df = pd.DataFrame(policy).sort_values("lambda")
policy_csv = ARTIFACTS / "policy_simulation.csv"
policy_df.to_csv(policy_csv, index=False)
print("Saved ->", policy_csv)

# ----- Event Effects + Hierarchical Pooling Fallback -----
EVENT_CANDIDATES = ["event", "event_type", "special_event"]
event_use = next((c for c in EVENT_CANDIDATES if c in df_fe.columns), None)
if event_use is None:
    # No event column in the data: put every row in a single synthetic group.
    df_fe["event_synth"] = "Normal"
    event_use = "event_synth"

# Keep every row that has a backtest prediction. A blanket dropna() would also
# discard rows with a missing event label or an unparsed date, removing those
# days from the per-event statistics and from the pooled (global) mean.
bt_ev = pd.DataFrame({
    "date": df_fe[date_col] if date_col is not None else pd.RangeIndex(len(df_fe)),
    "y": y_array,
    "rf": preds_rf,
    "event": df_fe[event_use]
}).dropna(subset=["y", "rf"])

bt_ev["resid"] = bt_ev["y"] - bt_ev["rf"]
# dropna=False reports rows with a missing event label as their own group.
evt_stats = bt_ev.groupby("event", dropna=False)["resid"].agg(["mean","std","count"]).reset_index().rename(columns={"mean":"resid_mean","std":"resid_std","count":"n"})
evt_stats["global_mean"] = bt_ev["resid"].mean()
# Shrink each event's mean residual toward the global mean with weight
# n / (n + k), where k is the median group size floored at 1, so small groups
# lean more on the pooled estimate.
k = max(1.0, evt_stats["n"].median())
evt_stats["shrink_mean"] = (evt_stats["n"]/(evt_stats["n"]+k))*evt_stats["resid_mean"] + (k/(evt_stats["n"]+k))*evt_stats["global_mean"]
evt_stats.to_csv(ARTIFACTS / "event_residuals.csv", index=False)
print("Saved ->", ARTIFACTS / "event_residuals.csv")

# ----- Explainability: SHAP (if available) or permutation -----
rf_full = Pipeline([("prep", preprocessor), ("rf", RandomForestRegressor(n_estimators=600, random_state=RANDOM_STATE, n_jobs=-1))])
rf_full.fit(X2, y2)

# Names of the transformed features the forest actually sees (numeric columns
# followed by the one-hot columns). Used for SHAP values only.
n_model_features = rf_full.named_steps["rf"].n_features_in_
try:
    feat_names = list(preprocessor.transformers_[0][2])  # numeric
    cat_base = list(preprocessor.transformers_[1][2])
    if cat_base:
        # Only query the encoder when it was given columns to encode.
        ohe = preprocessor.named_transformers_["cat"]
        feat_names += ohe.get_feature_names_out(cat_base).tolist()
    if len(feat_names) != n_model_features:
        raise ValueError("feature-name count does not match the model input width")
except Exception:
    # Best effort: fall back to positional names rather than abort the analysis.
    feat_names = [f"f{i}" for i in range(n_model_features)]

importance_df = None
if SHAP_AVAILABLE:
    try:
        explainer = shap.TreeExplainer(rf_full.named_steps["rf"])
        # Reuse the preprocessor exactly as fitted inside rf_full (no refit) and
        # densify, since the one-hot branch can make the output a sparse matrix.
        X_tr = rf_full.named_steps["prep"].transform(X2)
        if hasattr(X_tr, "toarray"):
            X_tr = X_tr.toarray()
        # Evenly spaced sample of at most 500 rows.
        idx = np.linspace(0, X_tr.shape[0]-1, min(500, X_tr.shape[0])).astype(int)
        sv = explainer.shap_values(X_tr[idx])
        shap_abs = np.mean(np.abs(sv), axis=0)
        importance_df = pd.DataFrame({"feature": feat_names, "importance": shap_abs}).sort_values("importance", ascending=False)
    except Exception as e:
        print("SHAP error, using permutation:", e)

if importance_df is None:
    # Permutation importance shuffles the raw input columns of the whole
    # pipeline, so it returns one score per X2 column, not one per transformed
    # feature. Label the scores with X2's columns; pairing them with
    # `feat_names` fails on the length mismatch whenever the preprocessor adds
    # or drops columns.
    perm = permutation_importance(rf_full, X2, y2, n_repeats=5, random_state=RANDOM_STATE, n_jobs=-1)
    importance_df = pd.DataFrame({"feature": list(X2.columns), "importance": perm.importances_mean}).sort_values("importance", ascending=False)

importance_df.to_csv(ARTIFACTS / "feature_importance.csv", index=False)
print("Saved ->", ARTIFACTS / "feature_importance.csv")


In [None]:
# ## 6. Overleaf-Ready Plots

import matplotlib.pyplot as plt

# Monthly MAE / SMAPE
def _plot_monthly_metric(frame, metric, filename):
    """Plot the RF and Naive monthly series for `metric`, save under FIGS, and return the path."""
    plt.figure(figsize=(8,4))
    for model in ("RF", "Naive"):
        plt.plot(frame["month"], frame[f"{model}_{metric}"], marker='o', label=f"{model} {metric}")
    plt.xticks(rotation=45, ha='right')
    plt.title(f"Monthly {metric} — Rolling-Origin Backtest")
    plt.legend()
    plt.tight_layout()
    out_path = FIGS / filename
    plt.savefig(out_path, dpi=150)
    plt.show()
    print("Saved ->", out_path)
    return out_path

# Figures are rebuilt from the saved CSV, so this cell works without rerunning the backtest.
monthly_csv = ARTIFACTS / "backtest_monthly.csv"
if monthly_csv.exists():
    monthly_df = pd.read_csv(monthly_csv)
    if not monthly_df.empty:
        p1 = _plot_monthly_metric(monthly_df, "MAE", "monthly_mae.png")
        p2 = _plot_monthly_metric(monthly_df, "SMAPE", "monthly_smape.png")

# Quantile interval band (last 60 predictions if exist)
import numpy as np, pandas as pd
if (ARTIFACTS / "interval_coverage.json").exists():
    try:
        # The band is drawn from the in-memory arrays left by the quantile
        # backtest; if that cell has not been run in this session, nothing is plotted.
        if 'q_lo' in globals() and 'q_md' in globals() and 'q_hi' in globals() and 'y_array' in globals():
            mask_q = ~np.isnan(q_lo) & ~np.isnan(q_md) & ~np.isnan(q_hi)
            idxs = np.where(mask_q)[0]
            if len(idxs) > 0:
                tail = idxs[-min(60, len(idxs)):]  # last 60
                plt.figure(figsize=(9,4))
                # The bounds are the 0.1 and 0.9 quantile forecasts, i.e. a
                # nominal 80% interval, so the legend says 80% rather than 90%.
                plt.fill_between(tail, q_lo[tail], q_hi[tail], alpha=0.3, label="80% PI (q0.1-q0.9)")
                plt.plot(tail, y_array[tail], label="Actual")
                plt.plot(tail, q_md[tail], label="Median")
                plt.title("Quantile Intervals (last window)")
                plt.legend()
                plt.tight_layout()
                p3 = FIGS / "quantile_intervals.png"
                plt.savefig(p3, dpi=150)
                plt.show()
                print("Saved ->", p3)
    except Exception as e:
        # Plotting is best-effort: report the problem and carry on with the other figures.
        print("Quantile plot skipped:", e)

# Feature importances
import pandas as pd
# Horizontal bar chart of the first 20 rows of the saved importance table.
imp_csv = ARTIFACTS / "feature_importance.csv"
if imp_csv.exists():
    # Reverse the row order so the first CSV row is drawn last.
    imp = pd.read_csv(imp_csv).iloc[:20].iloc[::-1]
    plt.figure(figsize=(7, 6))
    plt.barh(y=imp["feature"], width=imp["importance"])
    plt.title("Top Feature Importances")
    plt.tight_layout()
    p4 = FIGS.joinpath("feature_importance.png")
    plt.savefig(p4, dpi=150)
    plt.show()
    print("Saved ->", p4)


In [None]:
# ## 7. Outputs Index (Artifacts & Figures)

# Expected location of every artifact and figure this notebook refers to.
index = {
    "baseline_metrics": str(ARTIFACTS / "baseline_metrics.json"),
    "upgrade_metrics": str(ARTIFACTS / "metrics_upgrade.json"),
    "backtest_overall": str(ARTIFACTS / "backtest_overall.csv"),
    "backtest_monthly": str(ARTIFACTS / "backtest_monthly.csv"),
    "interval_coverage": str(ARTIFACTS / "interval_coverage.json"),
    "policy_simulation": str(ARTIFACTS / "policy_simulation.csv"),
    "event_residuals": str(ARTIFACTS / "event_residuals.csv"),
    "feature_importance": str(ARTIFACTS / "feature_importance.csv"),
    "top_mispredictions": str(ARTIFACTS / "top_mispredictions.csv"),
    "fig_monthly_mae": str(FIGS / "monthly_mae.png"),
    "fig_monthly_smape": str(FIGS / "monthly_smape.png"),
    "fig_quantile_intervals": str(FIGS / "quantile_intervals.png"),
    "fig_feature_importance": str(FIGS / "feature_importance.png"),
}

import json
print(json.dumps(index, indent=2))

# The index lists expected paths whether or not the files were produced.
# NOTE(review): no cell shown in this notebook writes top_mispredictions.csv.
# Report entries that are absent on disk so the listing is not mistaken for a
# manifest of files that exist.
missing_outputs = [key for key, location in index.items() if not Path(location).exists()]
if missing_outputs:
    print("Not found on disk:", ", ".join(missing_outputs))
