In [1]:
# =============================================================================
# DISH DEMAND FORECASTER – FINAL POLISHED NOTEBOOK
# Author:  Your Name
# Date:    2025-10-27
# Goal:    Predict hourly demand for top-K dishes per restaurant
# =============================================================================

# --------------------------------------------------------------
# 1. IMPORTS & CONFIG
# --------------------------------------------------------------
import os
import re
import joblib
import warnings
from collections import Counter
from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

warnings.filterwarnings("ignore")

# ── CONFIG ───────────────────────────────────────────────────
DATA_PATH      = "../data/data.csv"  # Adjust path as needed
TOP_K          = 50
LAGS           = [1, 2, 3, 6, 12, 24]
WINDOWS        = [3, 6, 12, 24]
USE_SCALER     = True
MODEL_TYPE     = "xgboost"
TRAIN_FRAC     = 0.8
RANDOM_STATE   = 42
MODEL_DIR      = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

print("Configuration loaded.\n")

Configuration loaded.



In [2]:
# --------------------------------------------------------------
# 2. PARSING UTILITIES
# --------------------------------------------------------------
def parse_order_items(order_str: str) -> List[Tuple[str, int]]:
    """Parse '2 x Pizza, 1 x Coke' → [('Pizza', 2), ('Coke', 1)]"""
    if pd.isna(order_str):
        return []
    return [(name.strip(), int(qty))
            for qty, name in re.findall(r"(\d+)\s*[xX]\s*([^,]+)", order_str)]

def expand_items(df: pd.DataFrame) -> pd.DataFrame:
    """Expand 'Items in order' into a dictionary of item counts."""
    df["expanded_items"] = df["Items in order"].fillna("").apply(parse_order_items)
    return df

print("Parsing utilities ready.\n")

Parsing utilities ready.



In [3]:
# --------------------------------------------------------------
# 3. LOAD & PREPROCESS
# --------------------------------------------------------------
def load_and_prepare_data(path: str) -> pd.DataFrame:
    """Load and preprocess raw order data."""
    df = pd.read_csv(path)
    print(f"Raw data: {df.shape[0]:,} rows, {df.shape[1]} columns")

    # ── datetime ──
    dt_col = next((c for c in df.columns
                   if "order" in c.lower() and ("placed" in c.lower() or "date" in c.lower())), None)
    if not dt_col:
        raise ValueError("No order datetime column found.")
    df["order_datetime"] = pd.to_datetime(df[dt_col], errors="coerce")
    df = df.dropna(subset=["order_datetime"]).copy()
    df["order_hour"] = df["order_datetime"].dt.floor("h")

    # ── restaurant: clean + one-hot ──
    rest_col = "Restaurant name"
    if rest_col not in df.columns:
        raise ValueError(f"'{rest_col}' column not found.")
    # Standardize restaurant names: strip and keep original case
    df[rest_col] = df[rest_col].str.strip()
    # Verify unique restaurants
    unique_restaurants = df[rest_col].unique()
    print(f"Found {len(unique_restaurants)} restaurants: {list(unique_restaurants)}")
    df = pd.get_dummies(df, columns=[rest_col], prefix="rest", dtype=int)

    # ── expand items ──
    if "Items in order" not in df.columns:
        raise ValueError("'Items in order' column not found.")
    df = expand_items(df)

    print(f"Pre-processed: {len(df):,} orders")
    return df

df = load_and_prepare_data(DATA_PATH)

Raw data: 21,321 rows, 29 columns
Found 6 restaurants: ['Swaad', 'Aura Pizzas', 'Dilli Burger Adda', 'Tandoori Junction', 'The Chicken Junction', 'Masala Junction']
Pre-processed: 21,321 orders


In [4]:
# --------------------------------------------------------------
# 4. BUILD HOURLY AGGREGATED TABLE
# --------------------------------------------------------------
def build_hourly_table(df: pd.DataFrame, top_k: int) -> Tuple[pd.DataFrame, List[str]]:
    """Aggregate orders to hourly level with top-K dishes."""
    # ── top-K dishes (global) ──
    all_items = Counter()
    for items in df["expanded_items"]:
        all_items.update({name: qty for name, qty in items})
    top_dishes = [name for name, _ in all_items.most_common(top_k)]
    print(f"Top-{len(top_dishes)} dishes selected (sample: {top_dishes[:5]})")

    # ── full hour index ──
    hour_idx = pd.date_range(
        start=df["order_hour"].min().floor("D"),
        end=df["order_hour"].max().ceil("D"),
        freq="h"
    )
    agg = pd.DataFrame(index=hour_idx)
    agg.index.name = "order_hour"

    # ── time features ──
    agg["hour_of_day"] = agg.index.hour
    agg["day_of_week"] = agg.index.dayofweek
    agg["is_weekend"] = agg.index.dayofweek.isin([5, 6]).astype(int)

    # ── initialize targets & restaurant cols ──
    for dish in top_dishes:
        agg[f"dish__{dish}"] = 0
    agg["total_orders"] = 0

    rest_cols = [c for c in df.columns if c.startswith("rest_")]
    for col in rest_cols:
        agg[col] = 0

    # ── aggregate per hour ──
    for hour, group in df.groupby("order_hour"):
        if hour not in agg.index:
            continue
        agg.loc[hour, "total_orders"] = len(group)

        # Restaurant presence: 1 if any order
        for col in rest_cols:
            agg.loc[hour, col] = 1 if group[col].sum() > 0 else 0

        # Dish counts
        hour_counts = Counter()
        for items in group["expanded_items"]:
            hour_counts.update({name: qty for name, qty in items})
        for dish in top_dishes:
            agg.loc[hour, f"dish__{dish}"] = hour_counts.get(dish, 0)

    # ── keep only hours with ≥1 order ──
    agg = agg[agg["total_orders"] > 0].copy()
    print(f"Aggregated {len(agg):,} active hours.")
    return agg, top_dishes

agg, top_dishes = build_hourly_table(df, TOP_K)

Top-50 dishes selected (sample: ['Bageecha Pizza', 'Chilli Cheese Garlic Bread', 'Bone in Jamaican Grilled Chicken', 'All About Chicken Pizza', 'Makhani Paneer Pizza'])
Aggregated 2,555 active hours.


In [5]:
# --------------------------------------------------------------
# 5. FEATURE ENGINEERING (lags + rolling)
# --------------------------------------------------------------
def add_temporal_features(agg: pd.DataFrame,
                         lags: List[int],
                         windows: List[int]) -> pd.DataFrame:
    """Add lag and rolling window features."""
    df = agg.copy()

    for lag in lags:
        df[f"total_orders_lag_{lag}"] = df["total_orders"].shift(lag)
        for dish in top_dishes:
            df[f"dish__{dish}_lag_{lag}"] = df[f"dish__{dish}"].shift(lag)

    for w in windows:
        df[f"total_orders_rollmean_{w}"] = df["total_orders"].rolling(w, min_periods=1).mean()
        for dish in top_dishes:
            df[f"dish__{dish}_rollmean_{w}"] = df[f"dish__{dish}"].rolling(w, min_periods=1).mean()

    df = df.dropna().reset_index()  # keep order_hour
    print(f"Feature engineering → {df.shape[0]:,} rows, {df.shape[1]} cols")
    return df

agg_with_hour = add_temporal_features(agg, LAGS, WINDOWS)

Feature engineering → 2,531 rows, 571 cols


In [6]:
# --------------------------------------------------------------
# 6. TRAIN / TEST SPLIT (time-based)
# --------------------------------------------------------------
def ensure_restaurant_coverage(train_agg: pd.DataFrame, test_agg: pd.DataFrame,
                              rest_cols: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Ensure all restaurants appear in training data."""
    train_rest = set([c for c in rest_cols if train_agg[c].sum() > 0])
    test_rest = set([c for c in rest_cols if test_agg[c].sum() > 0])
    missing = test_rest - train_rest
    if missing:
        print(f"Warning: {len(missing)} restaurants in test but not in train: {missing}")
        # Move a small portion of test data to train for each missing restaurant
        for col in missing:
            rest_data = test_agg[test_agg[col] == 1].head(1)
            if not rest_data.empty:
                train_agg = pd.concat([train_agg, rest_data])
                test_agg = test_agg.drop(rest_data.index)
        print("Adjusted train/test split to include all restaurants.")
    return train_agg, test_agg

split_time = agg_with_hour["order_hour"].quantile(TRAIN_FRAC)
train_agg = agg_with_hour[agg_with_hour["order_hour"] < split_time].copy()
test_agg = agg_with_hour[agg_with_hour["order_hour"] >= split_time].copy()
rest_cols = [c for c in agg_with_hour.columns if c.startswith("rest_")]
train_agg, test_agg = ensure_restaurant_coverage(train_agg, test_agg, rest_cols)
full_agg = agg_with_hour.set_index("order_hour")  # for lag lookup

print(f"Train hours: {len(train_agg):,} | Test hours: {len(test_agg):,}")

Adjusted train/test split to include all restaurants.
Train hours: 2,025 | Test hours: 506


In [7]:
# --------------------------------------------------------------
# 7. FEATURE / TARGET LISTS
# --------------------------------------------------------------
feature_cols = [c for c in train_agg.columns
                if not c.startswith("dish__") and c != "order_hour"]
rest_cols = [c for c in feature_cols if c.startswith("rest_")]
target_cols = [f"dish__{d}" for d in top_dishes]

print(f"Features: {len(feature_cols)} (restaurants: {len(rest_cols)})")
print(f"Targets: {len(target_cols)}")

Features: 20 (restaurants: 6)
Targets: 50


In [8]:
# --------------------------------------------------------------
# 8. FORECASTER CLASS
# --------------------------------------------------------------
class DishDemandForecaster:
    def __init__(self):
        self.top_dishes = top_dishes
        self.feature_cols = feature_cols
        self.target_cols = target_cols
        self.scaler = StandardScaler() if USE_SCALER else None
        self.model = MultiOutputRegressor(
            xgb.XGBRegressor(
                n_estimators=400,
                max_depth=7,
                learning_rate=0.08,
                random_state=RANDOM_STATE,
                n_jobs=-1,
                objective="reg:squarederror"
            )
        )
        self._warned = set()

    def fit(self, df: pd.DataFrame):
        """Train the forecaster."""
        X = df[self.feature_cols].select_dtypes(include=[np.number])
        Y = df[self.target_cols]
        if self.scaler:
            X = pd.DataFrame(self.scaler.fit_transform(X), index=X.index, columns=X.columns)
        print(f"Training on {X.shape[0]:,} hours...")
        self.model.fit(X.values, Y.values)
        print("Model trained.\n")

    def predict(self, dt_hour: pd.Timestamp, restaurant: str = None,
                round_int: bool = True) -> pd.DataFrame:
        """Predict dish demand for a specific hour and optional restaurant."""
        dt_hour = pd.to_datetime(dt_hour).floor("h")
        past = full_agg[full_agg.index < dt_hour]
        if past.empty:
            raise ValueError(f"No history before {dt_hour}")
        last = past.iloc[-1]

        row = pd.Series(0.0, index=self.feature_cols)
        row["hour_of_day"] = dt_hour.hour
        row["day_of_week"] = dt_hour.dayofweek
        row["is_weekend"] = int(dt_hour.dayofweek in [5, 6])

        if restaurant:
            col = f"rest_{restaurant.strip()}"
            if col in self.feature_cols:
                row[col] = 1
            elif restaurant not in self._warned:
                print(f"Warning: Restaurant '{restaurant}' not recognized. Using average behavior.")
                self._warned.add(restaurant)

        # Lag/roll features from past
        lag_like = [c for c in self.feature_cols if c.startswith(("total_orders_lag_",
                                                                  "total_orders_rollmean_",
                                                                  "dish__"))]
        for c in lag_like:
            row[c] = last.get(c, 0)

        X_in = pd.DataFrame([row])
        if self.scaler:
            X_in = pd.DataFrame(self.scaler.transform(X_in), columns=X_in.columns)
        X_in = X_in.reindex(columns=self.feature_cols, fill_value=0)

        pred = self.model.predict(X_in)[0]
        if round_int:
            pred = np.maximum(np.round(pred).astype(int), 0)

        return (pd.DataFrame({"dish": self.top_dishes, "predicted_qty": pred})
                .sort_values("predicted_qty", ascending=False)
                .reset_index(drop=True))

In [9]:
# --------------------------------------------------------------
# 9. TRAIN
# --------------------------------------------------------------
forecaster = DishDemandForecaster()
forecaster.fit(train_agg.drop(columns=["order_hour"]))

Training on 2,025 hours...
Model trained.



In [10]:
# --------------------------------------------------------------
# 10. EVALUATE ON TEST SET
# --------------------------------------------------------------
preds = []
print("\nEvaluating test hours...")
for _, row in test_agg.iterrows():
    dt = row["order_hour"]
    active = [c for c in rest_cols if row[c] == 1]
    rest_name = active[0][5:] if active else None  # Remove 'rest_' prefix
    pred = forecaster.predict(dt, restaurant=rest_name, round_int=False)
    preds.append(pred["predicted_qty"].values)

Y_test = test_agg[target_cols].values
rmse = np.sqrt(mean_squared_error(Y_test, np.array(preds)))
print(f"\nOverall RMSE on test set: {rmse:.3f}")


Evaluating test hours...

Overall RMSE on test set: 0.616


In [11]:
# --------------------------------------------------------------
# 11. INFERENCE HELPERS
# --------------------------------------------------------------
def predict_next_hour(restaurant: str = None):
    """Predict demand for the next hour."""
    next_h = full_agg.index.max() + pd.Timedelta(hours=1)
    return forecaster.predict(next_h, restaurant=restaurant)

def forecast_day(date: str, restaurant: str = None):
    """Forecast demand for all hours in a day."""
    hours = pd.date_range(f"{date} 00:00", f"{date} 23:00", freq="h")
    results = []
    for h in hours:
        pred = forecaster.predict(h, restaurant=restaurant)
        pred["hour"] = h.strftime("%H:%M")
        results.append(pred)
    return pd.concat(results).reset_index(drop=True)

In [12]:
# Example usage:
print("\nNext hour (Swaad):")
print(predict_next_hour("Swaad").head(10))

print("\nFull day forecast (2025-10-29, Swaad):")
daily = forecast_day("2025-10-29", "Swaad")
print(daily.head(15))


Next hour (Swaad):
                               dish  predicted_qty
0                    Bageecha Pizza              1
1                      Animal Fries              1
2            Bellpepper Onion Pizza              1
3            Peri Peri Chicken Melt              1
4  Grilled Chicken Smoky BBQ Tender              1
5        Chilli Cheese Garlic Bread              1
6              Just Pepperoni Pizza              1
7    Fried Chicken Peri Peri Tender              1
8            Peri Peri Paneer Pizza              1
9    Bone in Kabuli Grilled Chicken              1

Full day forecast (2025-10-29, Swaad):
                                 dish  predicted_qty   hour
0                      Bageecha Pizza              1  00:00
1                        Animal Fries              1  00:00
2              Bellpepper Onion Pizza              1  00:00
3              Peri Peri Chicken Melt              1  00:00
4    Grilled Chicken Smoky BBQ Tender              1  00:00
5          Chilli C

In [13]:
# --------------------------------------------------------------
# 12. SAVE MODEL + FULL HISTORY
# --------------------------------------------------------------
joblib.dump({
    "model": forecaster,
    "top_dishes": top_dishes,
    "feature_cols": feature_cols,
    "full_agg": full_agg
}, os.path.join(MODEL_DIR, "dish_forecaster_final.pkl"))

print(f"\nModel saved → {MODEL_DIR}/dish_forecaster_final.pkl")


Model saved → models/dish_forecaster_final.pkl
