In [1]:
%pip install pandas scikit-learn joblib --quiet


Note: you may need to restart the kernel to use updated packages.


In [1]:
# GreenPlatter - Demand Prediction Training Script (Pipeline + CV + Chrono Split)
# Author: Your Name

import json
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
import joblib

# ----------------------------
# 1. Load Dataset
# ----------------------------
file_path = "Expanded_GreenPlatter_12000.csv"  # ensure file exists in working dir
df = pd.read_csv(file_path)

print("Dataset shape:", df.shape)
print(df.head())

# Parse date and sort chronologically to avoid leakage
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = df.sort_values("Date").reset_index(drop=True)

# ----------------------------
# 2. Define Features & Target (no manual label encoding)
# ----------------------------
feature_cols = ["DayOfWeek", "Guests", "Event", "Weather", "Dish"]
target_col = "Sold_Qty"

# Handle missing categorical values by replacing NaN with string "Unknown"
for col in ["DayOfWeek", "Event", "Weather", "Dish"]:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown").astype(str)

X = df[feature_cols].copy()
y = df[target_col].astype(float)

# ----------------------------
# 3. Build Pipeline: OneHotEncoder + HistGradientBoostingRegressor
# ----------------------------
categorical_features = ["DayOfWeek", "Event", "Weather", "Dish"]
numeric_features = ["Guests"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

regressor = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=None,
    max_iter=300,
    l2_regularization=0.0,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", regressor),
])

# ----------------------------
# 4. Chronological Train/Test Split (last 20% for test)
# ----------------------------
num_rows = len(df)
split_index = int(num_rows * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# ----------------------------
# 5. Cross-validated metrics on training set (TimeSeriesSplit)
# ----------------------------
ts_cv = TimeSeriesSplit(n_splits=5)
mae_scores = -cross_val_score(pipeline, X_train, y_train, cv=ts_cv, scoring="neg_mean_absolute_error", n_jobs=None)
r2_scores = cross_val_score(pipeline, X_train, y_train, cv=ts_cv, scoring="r2", n_jobs=None)

print("\nCross-Validated Performance on Training (TimeSeriesSplit):")
print(f"MAE: mean={mae_scores.mean():.2f} ± {mae_scores.std():.2f}")
print(f"R² : mean={r2_scores.mean():.2f} ± {r2_scores.std():.2f}")

# ----------------------------
# 6. Fit on training and evaluate on holdout test
# ----------------------------
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nHoldout Test Performance (chronological 20%):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

# ----------------------------
# 7. Show Sample Predictions
# ----------------------------
results = pd.DataFrame({"Actual": y_test.values[:10], "Predicted": y_pred[:10]})
print("\nSample Predictions:")
print(results)

# ----------------------------
# 8. Persist: Save Pipeline and Category Metadata
# ----------------------------
joblib.dump(pipeline, "greenplatter_pipeline.joblib")
print("\nPipeline saved as greenplatter_pipeline.joblib")

# Save original category options for the app UI
category_options = {
    "DayOfWeek": sorted(pd.Series(X["DayOfWeek"]).dropna().unique().tolist()),
    "Event": sorted(pd.Series(X["Event"]).dropna().unique().tolist()),
    "Weather": sorted(pd.Series(X["Weather"]).dropna().unique().tolist()),
    "Dish": sorted(pd.Series(X["Dish"]).dropna().unique().tolist()),
}
with open("greenplatter_categories.json", "w", encoding="utf-8") as f:
    json.dump(category_options, f, ensure_ascii=False, indent=2)
print("Category options saved as greenplatter_categories.json")



Dataset shape: (12000, 9)
         Date  DayOfWeek  Guests     Event Weather            Dish  \
0  01-12-2024     Sunday     187       NaN    Cold       Veg Pulao   
1  23-11-2024   Saturday     321       NaN   Sunny   Chicken Curry   
2  03-01-2024  Wednesday      65       NaN  Cloudy       Veg Pulao   
3  20-08-2024    Tuesday      53       NaN   Rainy  Veg Manchurian   
4  08-10-2024    Tuesday     338  Festival     Hot   Pasta Alfredo   

   Prepared_Qty  Sold_Qty  Waste_Qty  
0            37        36          7  
1            44        44          3  
2            27        23          4  
3            38        34          4  
4            41        40          7  

Cross-Validated Performance on Training (TimeSeriesSplit):
MAE: mean=6.83 ± 0.15
R² : mean=0.18 ± 0.06

Holdout Test Performance (chronological 20%):
Mean Absolute Error (MAE): 6.55
R² Score: 0.27

Sample Predictions:
   Actual  Predicted
0    47.0  36.551921
1    48.0  46.161429
2    49.0  37.522347
3    50.0  41.57