# Step 5 — GRU Cart Encoder + LightGBM Multi-Objective Ranker

**Two components working together:**
1. **GRU Cart Encoder** (PyTorch) — processes cart items in addition order → 64-dim trajectory vector
2. **LightGBM Ranker** — five models, each optimising one business objective

**Final score:** `0.30·Accept + 0.30·AOV − 0.20·Abandon + 0.10·Timing + 0.10·Anchor`

| Model | Predicts | Objective | Label |
|-------|----------|-----------|-------|
| Accept | P(user adds item) | LambdaRank | Was item added? |
| AOV | Value contribution | Regression | price × accepted / 500 |
| Abandon | P(cart abandoned) | Binary | Session ended without order? |
| Timing | P(fits cart stage) | Binary | Accepted at appropriate stage? |
| Anchor | P(position-1 item) | Binary | Accepted when shown at pos 1? |

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Widen pandas' console rendering so wide tables are not truncated when printed.
for _option, _value in (("display.max_columns", 80), ("display.width", 200)):
    pd.set_option(_option, _value)

# Absolute paths of the data and model folders, resolved against the current
# working directory.
DATA_DIR, MODEL_DIR = (os.path.abspath(rel) for rel in ("../data", "../models"))

print("Imports OK")

## 1. Load Data

In [None]:
# Read the three CSV inputs from DATA_DIR.
features = pd.read_csv(f"{DATA_DIR}/training_features.csv")
sessions = pd.read_csv(f"{DATA_DIR}/sessions.csv")
cart_events = pd.read_csv(f"{DATA_DIR}/cart_events.csv")

# Report the size of each table.
_n_rows, _n_cols = features.shape
print(f"Training features : {_n_rows:,} rows × {_n_cols} cols")
print(f"Sessions          : {len(sessions):,}")
print(f"Cart events       : {len(cart_events):,}")

# Report the mean of the two label columns that already exist in the CSV.
_accept_rate = features["label_accept"].mean()
_abandon_rate = features["label_cart_abandoned"].mean()
print("\nLabel distribution:")
print(f"  label_accept       : {_accept_rate:.3f} positive rate")
print(f"  label_cart_abandoned: {_abandon_rate:.3f} positive rate")

## 2. GRU Cart Encoder — Training

The GRU processes cart items in the exact order they were added. A cart of **Biryani → Salan** encodes differently than **Salan alone** — capturing the evolving meal trajectory that mean-pooling cannot.

In [None]:
from gru_encoder import (
    CartGRUEncoder,
    build_cart_sequences,
    extract_hidden_states,
    load_embeddings,
    train_gru,
)
import torch

# Load the item-embedding lookup and read the dimensionality off one entry
# (assumes every embedding has the same length — TODO confirm).
emb_lookup = load_embeddings(f"{DATA_DIR}/item_embeddings.npz")
_sample_embedding = next(iter(emb_lookup.values()))
emb_dim = _sample_embedding.shape[0]
print(f"Loaded {len(emb_lookup):,} item embeddings (dim={emb_dim})")

print("\nBuilding cart sequences for each recommendation impression …")
sequences, candidate_embs, labels = build_cart_sequences(
    features, cart_events, emb_lookup, emb_dim
)
print(f"Built {len(sequences):,} sequences")

# Measure each sequence once and reuse the lengths for all three statistics.
_seq_lengths = [len(seq) for seq in sequences]
print(f"Sequence length stats: min={min(_seq_lengths)}, "
      f"max={max(_seq_lengths)}, "
      f"mean={np.mean(_seq_lengths):.1f}")

In [None]:
# Temporal split for GRU training: each feature row inherits the start time of
# its session, and rows are bucketed by that timestamp.
sessions["start_time"] = pd.to_datetime(sessions["start_time"])
sess_start = sessions.set_index("session_id")["start_time"]
feat_sess_start = features["session_id"].map(sess_start)

# Train = strictly before _train_end; validation = [_train_end, _val_end).
_train_end, _val_end = "2025-12-22", "2025-12-29"
_is_train = feat_sess_start < _train_end
_is_val = (feat_sess_start >= _train_end) & (feat_sess_start < _val_end)
train_mask_gru = _is_train.values
val_mask_gru = _is_val.values

print(f"GRU train: {train_mask_gru.sum():,}  |  GRU val: {val_mask_gru.sum():,}")

print("\nTraining GRU Cart Encoder …")
_gru_hparams = dict(
    input_dim=emb_dim,
    hidden_dim=64,
    batch_size=256,
    lr=1e-3,
    epochs=20,
    patience=3,
)
gru_model, gru_history = train_gru(
    sequences, candidate_embs, labels,
    train_mask_gru, val_mask_gru,
    **_gru_hparams,
)

In [None]:
# GRU training loss curve: train and validation loss per epoch.
_train_losses = gru_history["train_loss"]
_val_losses = gru_history["val_loss"]
epochs_range = range(1, len(_train_losses) + 1)

fig, ax = plt.subplots(figsize=(8, 4))
for _losses, _marker, _legend in (
    (_train_losses, "o-", "Train Loss"),
    (_val_losses, "s-", "Val Loss"),
):
    ax.plot(epochs_range, _losses, _marker, label=_legend)
ax.set_xlabel("Epoch")
ax.set_ylabel("BCE Loss")
ax.set_title("GRU Cart Encoder — Training Curve")
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Epochs are reported 1-based, hence the +1 on the argmin index.
best_epoch = np.argmin(_val_losses) + 1
_best_val_loss = min(_val_losses)
print(f"Best epoch: {best_epoch}  (val_loss={_best_val_loss:.4f})")

## 3. Extract GRU Hidden States

Run the trained GRU on all sequences to produce 64-dim trajectory features for LightGBM.

In [None]:
print("Extracting 64-dim hidden states …")
gru_hidden = extract_hidden_states(gru_model, sequences, batch_size=512)
print(f"Shape: {gru_hidden.shape}")

# Save GRU artifacts: weights go to MODEL_DIR, hidden states to DATA_DIR.
_weights_path = f"{MODEL_DIR}/gru_encoder.pt"
_hidden_path = f"{DATA_DIR}/gru_hidden_states.npy"
os.makedirs(MODEL_DIR, exist_ok=True)
torch.save(gru_model.state_dict(), _weights_path)
np.save(_hidden_path, gru_hidden)
print(f"GRU weights saved  → {_weights_path}")
print(f"Hidden states saved → {_hidden_path}")

## 4. Label Engineering + Feature Preparation

In [None]:
from lgbm_ranker import (
    engineer_labels, prepare_features, temporal_split,
    train_lgbm_models, tune_hyperparameters,
    evaluate_models, evaluate_by_segment, evaluate_by_cart_stage,
    evaluate_cold_start, compute_business_score,
    shap_analysis, save_models, MODEL_CONFIGS,
)

# Engineer all 5 label targets
features = engineer_labels(features)

# One label column per model, named "label_<target>".
label_cols = [
    f"label_{target}"
    for target in ("accept", "aov", "cart_abandoned", "timing", "anchor")
]
print("Label statistics:")
for col in label_cols:
    _col_mean = features[col].mean()
    _col_sum = features[col].sum()
    print(f"  {col:25s}  mean={_col_mean:.4f}  "
          f"sum={_col_sum:,.0f}  "
          f"(positive rate {100*_col_mean:.1f}%)")

In [None]:
# Prepare features (append GRU hidden states, encode categoricals)
features, feature_cols, encoders = prepare_features(features, gru_hidden)
print(f"Total feature columns: {len(feature_cols)}")

# Temporal split into train / validation / test row masks.
train_mask, val_mask, test_mask = temporal_split(features, sessions)
print("\nTemporal split:")
for _split_label, _split_mask in (
    ("Train (weeks 49-51)", train_mask),
    ("Val   (week 52)    ", val_mask),
    ("Test  (weeks 1-2)  ", test_mask),
):
    print(f"  {_split_label}: {_split_mask.sum():,} rows")

## 5. MBA Baseline

Market Basket Analysis (Apriori + association rules) — the industry-standard baseline. Its failures motivate every design decision in our main system.

In [None]:
from mba_baseline import build_transaction_baskets, train_mba, evaluate_mba

# Build baskets from training-period completed sessions only
_in_training_period = sessions["start_time"] < "2025-12-22"
train_sessions = sessions[_in_training_period]
baskets = build_transaction_baskets(cart_events, train_sessions)
print(f"Transaction baskets: {len(baskets):,}")

# Run Apriori
mba_rules = train_mba(baskets, min_support=0.003, min_confidence=0.05)
if len(mba_rules) > 0:
    # Show only the columns that describe a rule and its strength.
    _rule_columns = ["antecedents", "consequents", "support", "confidence", "lift"]
    _top_rules = mba_rules.nlargest(10, "lift")
    print("\nTop 10 rules by lift:")
    display(_top_rules[_rule_columns])

In [None]:
# Evaluate MBA on test set
test_features_df = features[test_mask]
mba_metrics = evaluate_mba(mba_rules, test_features_df, cart_events, k=8)

print("MBA Baseline — Test Set Metrics:")
for metric_name, metric_value in mba_metrics.items():
    # Treat NumPy floating scalars like Python floats so that e.g. an
    # np.float32 metric is also rendered with four decimals; this matches the
    # float check used when the metrics are serialised to JSON.
    if isinstance(metric_value, (float, np.floating)):
        print(f"  {metric_name:20s}: {metric_value:.4f}")
    else:
        print(f"  {metric_name:20s}: {metric_value}")

## 6. LightGBM Training — Five Models

Training all five models with default hyperparameters, then tuning.

In [None]:
# Banner, then train the five models with their default hyperparameters.
print("=" * 60, "TRAINING INITIAL MODELS (default hyperparameters)", "=" * 60, sep="\n")
models_initial = train_lgbm_models(features, feature_cols, train_mask, val_mask)

## 7. Hyperparameter Tuning

Grid search over `num_leaves`, `learning_rate`, `min_child_samples` with early stopping.

In [None]:
_rule = "=" * 60
print(f"{_rule}\nHYPERPARAMETER TUNING\n{_rule}")

# Per-model tuning output: best parameter set and the full results table.
best_params_all = {}
tuning_results = {}

for model_name in MODEL_CONFIGS:
    bp, results_df = tune_hyperparameters(
        features, feature_cols, train_mask, val_mask, model_name=model_name
    )
    best_params_all[model_name], tuning_results[model_name] = bp, results_df

print(f"\n{_rule}")
print("Best hyperparameters per model:")
for name, params in best_params_all.items():
    print(f"  [{name:8s}] {params}")

In [None]:
# Retrain with best hyperparameters
for _banner_line in ("=" * 60, "RETRAINING WITH TUNED HYPERPARAMETERS", "=" * 60):
    print(_banner_line)

models = train_lgbm_models(
    features,
    feature_cols,
    train_mask,
    val_mask,
    params_override=best_params_all,
)

## 8. Evaluation — All Models on Test Set

In [None]:
print("\n".join(["=" * 60, "TEST SET EVALUATION", "=" * 60]))
test_metrics = evaluate_models(models, features, feature_cols, test_mask)

# One line per model: every metric rendered as name=value with four decimals.
print("\nPer-model metrics:")
for name, m in test_metrics.items():
    _rendered = [f"{k}={v:.4f}" for k, v in m.items()]
    parts = "  ".join(_rendered)
    print(f"  [{name:8s}] {parts}")

## 9. Our System vs MBA Baseline — Head-to-Head Comparison

In [None]:
# Side-by-side comparison of the accept model against the MBA baseline.
def _relative_gain(ours, baseline):
    """Return the relative change of `ours` over `baseline` as a signed percentage.

    The sign comes from the value itself, so a regression is rendered as
    "-5.0%" rather than "+-5.0%". When the baseline is not positive the ratio
    is undefined and "N/A (MBA=0)" is returned instead.
    """
    if baseline > 0:
        return f"{100 * (ours - baseline) / baseline:+.1f}%"
    return "N/A (MBA=0)"


# Missing metrics fall back to 0 so the table and chart still render.
our_p8 = test_metrics["accept"].get("precision_at_8", 0)
our_ndcg = test_metrics["accept"].get("ndcg_at_8", 0)
our_auc = test_metrics["accept"].get("auc_roc", 0)
mba_p8 = mba_metrics.get("precision_at_k", 0)
mba_ndcg = mba_metrics.get("ndcg_at_k", 0)

comparison = pd.DataFrame({
    "Metric": ["Precision@8", "NDCG@8", "AUC-ROC"],
    "MBA Baseline": [mba_p8, mba_ndcg, "N/A"],
    "Our System": [our_p8, our_ndcg, our_auc],
    "Improvement": [
        _relative_gain(our_p8, mba_p8),
        _relative_gain(our_ndcg, mba_ndcg),
        "—",
    ],
})
display(comparison)

# Bar chart
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, metric, our_val, mba_val in [
    (axes[0], "Precision@8", our_p8, mba_p8),
    (axes[1], "NDCG@8", our_ndcg, mba_ndcg),
]:
    bars = ax.bar(["MBA Baseline", "Our System"], [mba_val, our_val],
                  color=["#95a5a6", "#e74c3c"], edgecolor="black", linewidth=0.5)
    ax.set_title(metric, fontsize=14, fontweight="bold")
    # Leave headroom above the taller bar for its value label.
    ax.set_ylim(0, max(our_val, mba_val) * 1.3 + 0.01)
    for bar in bars:
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                f"{bar.get_height():.4f}", ha="center", fontsize=11)
plt.suptitle("MBA Baseline vs GRU+LightGBM System", fontsize=15, fontweight="bold")
plt.tight_layout()
plt.show()

## 10. Breakdown — By User Segment and Cart Stage

The system should outperform MBA most at higher cart stages (2-3), where sequential understanding matters most.

In [None]:
# By user segment
print("Accept Model — Precision@8 & NDCG@8 by User Segment")
print("-" * 55)
seg_df = evaluate_by_segment(models, features, feature_cols, test_mask, encoders=encoders)
display(seg_df)

# One panel per metric: (result column, bar colour, display label).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
_segment_panels = (
    ("precision_at_8", "#3498db", "Precision@8"),
    ("ndcg_at_8", "#2ecc71", "NDCG@8"),
)
for _panel_ax, (_metric_col, _bar_color, _metric_label) in zip(axes, _segment_panels):
    seg_df.plot.bar(x="segment", y=_metric_col, ax=_panel_ax, color=_bar_color,
                    legend=False, edgecolor="black", linewidth=0.5)
    _panel_ax.set_title(f"{_metric_label} by Segment", fontweight="bold")
    _panel_ax.set_ylabel(_metric_label)
    _panel_ax.tick_params(axis="x", rotation=30)
plt.tight_layout()
plt.show()

In [None]:
# By cart stage
print("Accept Model — Precision@8 & NDCG@8 by Cart Stage")
print("-" * 55)
stage_df = evaluate_by_cart_stage(models, features, feature_cols, test_mask)
display(stage_df)

# One panel per metric: (result column, bar colour, display label).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
_stage_panels = (
    ("precision_at_8", "#e67e22", "Precision@8"),
    ("ndcg_at_8", "#9b59b6", "NDCG@8"),
)
for _panel_ax, (_metric_col, _bar_color, _metric_label) in zip(axes, _stage_panels):
    stage_df.plot.bar(x="cart_stage", y=_metric_col, ax=_panel_ax, color=_bar_color,
                      legend=False, edgecolor="black", linewidth=0.5)
    _panel_ax.set_title(f"{_metric_label} by Cart Stage", fontweight="bold")
    _panel_ax.set_xlabel("Cart Stage (0=empty, 3=nearly complete)")
plt.tight_layout()
plt.show()

## 11. Cold Start Evaluation

Testing on low-history users (≤ 5 orders). The system uses `bestseller_flag` + `popularity_score` as fallback signals where user history is sparse — something MBA cannot do.

In [None]:
cold_results = evaluate_cold_start(
    models, features, feature_cols, test_mask,
    order_count_col="user_order_count", cold_threshold=5,
)

print("Cold Start vs Warm Users:")
print("-" * 55)
# Transpose so each user type ("cold" / "warm") becomes a row.
cold_df = pd.DataFrame(cold_results).T
cold_df.index.name = "user_type"
display(cold_df)

# Grouped bar chart: two metrics on the x axis, cold and warm bars side by side.
fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(2)
width = 0.35
_plotted_metrics = ("precision_at_8", "ndcg_at_8")
cold_vals = [cold_results["cold"][key] for key in _plotted_metrics]
warm_vals = [cold_results["warm"][key] for key in _plotted_metrics]
_bar_groups = (
    (x - width/2, cold_vals,
     f"Cold (≤5 orders, n={cold_results['cold']['sessions']:.0f})", "#e74c3c"),
    (x + width/2, warm_vals,
     f"Warm (>5 orders, n={cold_results['warm']['sessions']:.0f})", "#2ecc71"),
)
for _positions, _heights, _legend, _color in _bar_groups:
    ax.bar(_positions, _heights, width, label=_legend,
           color=_color, edgecolor="black", linewidth=0.5)
ax.set_xticks(x)
ax.set_xticklabels(["Precision@8", "NDCG@8"])
ax.set_title("Cold Start vs Warm Users — Accept Model", fontweight="bold")
ax.legend()
plt.tight_layout()
plt.show()

## 12. Business Score — Final Ranking with Peak-Hour Switching

In [None]:
from lgbm_ranker import DEFAULT_WEIGHTS, PEAK_WEIGHTS

# Score the test rows with every trained model.
X_test = features.loc[test_mask, feature_cols]
preds = {}
for _model_name, _model in models.items():
    preds[_model_name] = _model.predict(X_test)

# Default weights
default_scores = compute_business_score(preds)

# Peak-hour adaptive weights
peak_modes = features.loc[test_mask, "peak_hour_mode"].values
# Decode back to string labels when an encoder was fitted for this column;
# otherwise pass the values through untouched.
peak_str = (
    encoders["peak_hour_mode"].inverse_transform(peak_modes.astype(int))
    if "peak_hour_mode" in encoders
    else peak_modes
)
adaptive_scores = compute_business_score(preds, peak_mode=peak_str)

print("Business Score Statistics (test set):")
print(f"  Default weights   — mean={default_scores.mean():.4f}  std={default_scores.std():.4f}")
print(f"  Peak-hour adaptive — mean={adaptive_scores.mean():.4f}  std={adaptive_scores.std():.4f}")

# One histogram per weighting scheme: (scores, colour, panel title).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
_score_panels = (
    (default_scores, "#3498db", "Business Score Distribution (Default Weights)"),
    (adaptive_scores, "#e74c3c", "Business Score Distribution (Peak-Hour Adaptive)"),
)
for _panel_ax, (_scores, _color, _title) in zip(axes, _score_panels):
    _panel_ax.hist(_scores, bins=50, color=_color, edgecolor="black", alpha=0.7)
    _panel_ax.set_title(_title, fontweight="bold")
    _panel_ax.set_xlabel("Score")
plt.tight_layout()
plt.show()

print("\nWeight configurations:")
print(f"  Default: {DEFAULT_WEIGHTS}")
for mode, w in PEAK_WEIGHTS.items():
    print(f"  {mode}: {w}")

## 13. SHAP Feature Importance

Which features drive each model's recommendations? Judges can see exactly what matters.

In [None]:
from pathlib import Path
from IPython.display import Image, display as ipy_display

shap_results = shap_analysis(
    models,
    features,
    feature_cols,
    test_mask,
    save_dir=Path(MODEL_DIR),
    max_samples=2000,
)

# Display saved SHAP plots inline; models without a plot file are skipped.
_shap_rule = "=" * 40
for name in MODEL_CONFIGS:
    plot_path = f"{MODEL_DIR}/shap_{name}.png"
    if not os.path.exists(plot_path):
        continue
    print(f"\n{_shap_rule}")
    print(f"SHAP — {name.upper()} Model")
    print(f"{_shap_rule}")
    ipy_display(Image(filename=plot_path, width=700))

## 14. Save All Artifacts

In [None]:
import json
from pathlib import Path


def _to_builtin(value):
    """Recursively replace NumPy scalars and arrays in `value` with Python builtins.

    `json.dump` rejects NumPy types such as np.int64 and np.float32, so every
    metric container is passed through this helper before serialisation.
    Dicts, lists and tuples are walked; tuples come back as lists, which is
    how JSON would store them anyway. Anything else is returned unchanged.
    """
    if isinstance(value, dict):
        return {key: _to_builtin(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_builtin(item) for item in value]
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        # .item() yields the matching Python scalar (int, float, bool, ...).
        return value.item()
    return value


# Save LightGBM models
save_models(models, save_dir=Path(MODEL_DIR))

# Save evaluation summary. All four sections go through the same converter so
# an integer count in the metrics cannot break serialisation.
eval_summary = {
    "test_metrics": _to_builtin(test_metrics),
    "mba_metrics": _to_builtin(mba_metrics),
    "cold_start": _to_builtin(cold_results),
    "best_hyperparameters": _to_builtin(best_params_all),
}

with open(f"{MODEL_DIR}/eval_summary.json", "w", encoding="utf-8") as f:
    json.dump(eval_summary, f, indent=2)

# List everything now present in MODEL_DIR with its size in KB.
print("All artifacts saved:")
for f_name in sorted(os.listdir(MODEL_DIR)):
    f_path = os.path.join(MODEL_DIR, f_name)
    size = os.path.getsize(f_path)
    print(f"  {f_name:35s}  {size/1024:>8.1f} KB")

## Summary

**Step 5 Complete.** Trained and evaluated:

1. **GRU Cart Encoder** — 64-dim trajectory features capturing sequential meal composition
2. **5 LightGBM Models** — Accept (LambdaRank), AOV (regression), Abandon/Timing/Anchor (binary)
3. **MBA Baseline** — Apriori association rules for comparison
4. **Business Score** — weighted combination with peak-hour adaptive switching
5. **SHAP Analysis** — interpretable feature importance per model

**Artifacts saved to `models/`:**
- `gru_encoder.pt` — GRU weights
- `lgbm_accept.txt`, `lgbm_aov.txt`, `lgbm_abandon.txt`, `lgbm_timing.txt`, `lgbm_anchor.txt` — LightGBM models
- `shap_*.png` — feature importance plots
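- `eval_summary.json` — test-set, MBA-baseline and cold-start metrics, plus the tuned hyperparameters

**Also saved to `data/`:**
- `gru_hidden_states.npy` — GRU hidden states for every cart sequence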