# Prediction on Test Set

In [None]:
# Clone GitHub Repository
# The later cells chdir into this checkout and read data/ and models/ relative to it.
# NOTE(review): re-running this cell after a successful clone makes git report
# that the destination path already exists.
!git clone https://github.com/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform.git

In [None]:
!pip install -q catboost
!pip install category_encoders

## Load Test Data

In [None]:
# Environment Setup - Import Libraries
import os
import gc
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from category_encoders import TargetEncoder

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Set Project Root
# Work from inside the cloned repository so relative data/ and models/ paths resolve.
project_root = "/content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform"
os.chdir(project_root)
print("Current Directory: ", os.getcwd())

In [None]:
# Load Feature-Engineered Data
DATA_DIR = Path("data/features")

# Read both splits and put each one into (store, family, date) order with a
# fresh 0..n-1 index.
train, test = (
    pd.read_parquet(DATA_DIR / file_name)
    .sort_values(["store_nbr", "family", "date"])
    .reset_index(drop=True)
    for file_name in ("train_features.parquet", "test_features.parquet")
)

print("Train shape:", train.shape)
print("Test shape :", test.shape)

In [None]:
# Load All Model and Ensemble Bundle
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load bundles produced by a trusted training run.
bundle = joblib.load("models/ensemble-stacking/final_ensemble_model.pkl")

# List the keys stored in the bundle so the reader can see what was loaded.
print("Loaded Ensemble Components:")
for component_name in bundle:
    print(" -", component_name)

In [None]:
# Memory Optimization
def reduce_mem_usage(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
    only when every value fits in the int32 range; otherwise the column is left
    untouched, because a blind cast would silently wrap the out-of-range values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int32_range = np.iinfo(np.int32)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == "float64":
            df[col] = df[col].astype("float32")
        elif col_dtype == "int64":
            values = df[col]
            # An empty column has no min/max to check and is always safe to cast.
            fits_int32 = values.empty or (
                values.min() >= int32_range.min and values.max() <= int32_range.max
            )
            if fits_int32:
                df[col] = values.astype("int32")
    return df

# Shrink both splits (train first, then test) and release the freed memory.
train, test = (reduce_mem_usage(frame) for frame in (train, test))
gc.collect()

In [None]:
# Define Lag / Rolls
# Lag offsets and rolling-window lengths for the recursive feature generation.
LAGS, ROLLS = [1, 7, 14, 28], [7, 14, 28]
MAX_LAG = max(LAGS)  # deepest look-back; sizes the history window

In [None]:
# Build Initial History Store
# Keep only the tail of train (MAX_LAG + 30 days back from its last date) and
# only the raw columns the feature generators read, ordered per series.
history_df = (
    train.loc[
        lambda frame: frame["date"]
        >= (frame["date"].max() - pd.Timedelta(days=MAX_LAG + 30)),
        ["store_nbr", "family", "date", "sales", "onpromotion", "dcoilwtico"],
    ]
    .sort_values(["store_nbr", "family", "date"])
    .reset_index(drop=True)
)

In [None]:
# Define Feature Generator
# Columns identifying one time series, and one row within it.
_SERIES_KEYS = ["store_nbr", "family"]
_ROW_KEYS = _SERIES_KEYS + ["date"]


def _attach_features(history, current, features):
    """Copy history-derived feature values onto the matching rows of ``current``.

    ``features`` maps a column name to a Series indexed like ``history``. Rows
    are matched on (store_nbr, family, date) instead of on index labels, because
    ``current`` and ``history`` generally carry unrelated indexes and aligning on
    labels would silently pick values from the wrong rows.

    ``current`` is modified in place and returned. Rows of ``current`` with no
    matching key in ``history`` receive NaN.
    """
    if not features:
        return current
    lookup = history[_ROW_KEYS].copy()
    for name, values in features.items():
        lookup[name] = values  # label-aligned with ``history``'s unique index
    # If a key occurs more than once, the most recently appended row wins.
    lookup = lookup.drop_duplicates(subset=_ROW_KEYS, keep="last")
    # A left merge keeps the row order of ``current``; the lookup is unique per
    # key, so the result has exactly one row per row of ``current``.
    matched = current[_ROW_KEYS].merge(lookup, on=_ROW_KEYS, how="left")
    for name in features:
        current[name] = matched[name].to_numpy()
    return current


def generate_sales_features(history, current, lags, rolls):
    """Add ``sales_lag_*`` and ``sales_roll_mean_*`` / ``sales_roll_std_*`` to ``current``.

    Parameters
    ----------
    history : pandas.DataFrame
        Rows ordered chronologically within each (store_nbr, family) series and
        containing the rows of ``current``. Needs the columns ``store_nbr``,
        ``family``, ``date`` and ``sales``.
    current : pandas.DataFrame
        Rows to receive the features; modified in place.
    lags : iterable of int
        Row offsets for the lag features.
    rolls : iterable of int
        Window lengths for the rolling statistics.

    Returns
    -------
    pandas.DataFrame
        ``current`` with the new columns.
    """
    hist = history.reset_index(drop=True)  # guarantees unique labels for alignment
    sales = hist.groupby(_SERIES_KEYS)["sales"]

    features = {f"sales_lag_{lag}": sales.shift(lag) for lag in lags}
    for r in rolls:
        # NOTE(review): the window ends on the row itself, so a row whose sales
        # is still unknown (NaN) gets NaN statistics. Confirm this matches how
        # the training features were built (e.g. whether a shift(1) preceded
        # the rolling there).
        window = sales.rolling(r)
        features[f"sales_roll_mean_{r}"] = window.mean().reset_index(level=[0, 1], drop=True)
        features[f"sales_roll_std_{r}"] = window.std().reset_index(level=[0, 1], drop=True)
    return _attach_features(hist, current, features)


def generate_promo_features(history, current, lags, rolls):
    """Add promotion lag, rolling-sum, rolling-frequency and flag features to ``current``.

    Parameters
    ----------
    history : pandas.DataFrame
        Rows ordered chronologically within each (store_nbr, family) series and
        containing the rows of ``current``. Needs the columns ``store_nbr``,
        ``family``, ``date`` and ``onpromotion``.
    current : pandas.DataFrame
        Rows to receive the features; must have ``onpromotion``. Modified in place.
    lags : iterable of int
        Row offsets for ``promo_lag_*``.
    rolls : iterable of int
        Window lengths for ``promo_roll_sum_*`` and ``promo_freq_*``.

    Returns
    -------
    pandas.DataFrame
        ``current`` with the new columns, including ``promo_flag``
        (1 when ``onpromotion`` > 0, else 0).
    """
    hist = history.reset_index(drop=True)
    promo = hist.groupby(_SERIES_KEYS)["onpromotion"]

    features = {f"promo_lag_{lag}": promo.shift(lag) for lag in lags}
    for r in rolls:
        window = promo.rolling(r)
        features[f"promo_roll_sum_{r}"] = window.sum().reset_index(level=[0, 1], drop=True)
        features[f"promo_freq_{r}"] = window.mean().reset_index(level=[0, 1], drop=True)
    current = _attach_features(hist, current, features)
    current["promo_flag"] = (current["onpromotion"] > 0).astype(int)
    return current


def generate_oil_features(history, current, lags):
    """Add ``oil_lag_*`` features (lagged ``dcoilwtico``) to ``current``.

    The lag is taken within each (store_nbr, family) series. Grouping by store
    alone would step across the store's other families and across freshly
    appended rows, so ``shift(lag)`` would not move ``lag`` steps back in time.

    Parameters
    ----------
    history : pandas.DataFrame
        Rows ordered chronologically within each (store_nbr, family) series and
        containing the rows of ``current``. Needs the columns ``store_nbr``,
        ``family``, ``date`` and ``dcoilwtico``.
    current : pandas.DataFrame
        Rows to receive the features; modified in place.
    lags : iterable of int
        Row offsets for the lag features.

    Returns
    -------
    pandas.DataFrame
        ``current`` with the new columns.
    """
    hist = history.reset_index(drop=True)
    oil = hist.groupby(_SERIES_KEYS)["dcoilwtico"]
    features = {f"oil_lag_{lag}": oil.shift(lag) for lag in lags}
    return _attach_features(hist, current, features)

In [None]:
# Training Feature List
# Model inputs are all train columns, in their original order, except those
# listed in DROP_COLS.
DROP_COLS = ["id", "date", "sales", "sales_log"]
TRAIN_FEATURES = list(filter(lambda col: col not in DROP_COLS, train.columns))

In [None]:
# Final Prediction Function
def predict_from_bundle(X_raw, bundle):
    """Run the stacked ensemble in ``bundle`` on ``X_raw`` and return sales predictions.

    The random-forest and XGBoost base models receive the target-encoded
    features; the LightGBM and CatBoost base models receive ``X_raw`` as is.
    Their predictions are stacked column-wise and passed to the meta model, whose
    output is treated as log1p-scale sales: the bias is added, values below the
    zero threshold are set to 0, and the result is mapped back with ``expm1``.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        Feature rows, with the columns the bundled models expect.
    bundle : dict
        Must provide ``target_encoder``, ``rf_model``, ``xgb_model``,
        ``lgb_model``, ``cat_model``, ``meta_model``, ``bias`` and
        ``zero_threshold``.

    Returns
    -------
    numpy.ndarray
        One non-negative prediction per row of ``X_raw``.
    """
    X_te = bundle["target_encoder"].transform(X_raw)
    lgb_model = bundle["lgb_model"]

    preds = np.column_stack([
        bundle["rf_model"].predict(X_te),
        bundle["xgb_model"].predict(xgb.DMatrix(X_te)),
        lgb_model.predict(X_raw, num_iteration=lgb_model.best_iteration),
        bundle["cat_model"].predict(X_raw),
    ])

    # Build a new array rather than using ``+=`` so the meta model's output
    # buffer is never modified in place.
    y_log = bundle["meta_model"].predict(preds) + bundle["bias"]
    y_log = np.where(y_log < bundle["zero_threshold"], 0, y_log)

    # A negative log-scale value that survives the threshold would map to
    # negative sales; floor at 0 so callers never feed negative sales back in.
    return np.maximum(np.expm1(y_log), 0)

In [None]:
# Recursive Test Prediction Loop
# Predict one test date at a time; each day's predictions are appended to the
# history so that later days can derive their lag/rolling features from them.
HISTORY_COLS = ["store_nbr", "family", "date", "sales", "onpromotion", "dcoilwtico"]

test_preds = []
test_dates = sorted(test["date"].unique())

for current_date in test_dates:

    test_day = test[test["date"] == current_date].copy()

    # Keep the history to the raw columns the feature generators read, and drop
    # anything dated on/after the day being predicted. On a first run this is a
    # no-op; on a re-run of this cell it discards the predictions appended by
    # the previous run instead of stacking duplicates on top of them.
    history_df = history_df.loc[history_df["date"] < current_date, HISTORY_COLS]

    # Today's rows go last so they sit after their own history in each series;
    # columns missing from test (e.g. the still-unknown sales) become NaN.
    temp_history = pd.concat(
        [history_df, test_day.reindex(columns=HISTORY_COLS)], ignore_index=True
    )

    test_day = generate_sales_features(temp_history, test_day, LAGS, ROLLS)
    test_day = generate_promo_features(temp_history, test_day, [1,7], ROLLS)
    test_day = generate_oil_features(temp_history, test_day, [7,14,28])

    # Safe filling
    # NOTE(review): ffill/bfill runs down the rows of the day, i.e. across
    # different store/family series — confirm that is the intended fallback.
    sales_cols = [c for c in test_day if "sales_" in c]
    test_day[sales_cols] = test_day[sales_cols].fillna(0)
    test_day = test_day.ffill().bfill()

    # Predict
    test_day["sales"] = predict_from_bundle(
        test_day[TRAIN_FEATURES], bundle
    )

    # Update history with predictions (raw columns only, so the history does
    # not widen with every engineered feature column).
    history_df = pd.concat(
        [history_df, test_day.reindex(columns=HISTORY_COLS)], ignore_index=True
    )

    test_preds.append(test_day)

In [None]:
# Final Test Prediction
# Stack the per-day frames under one fresh index and expose the prediction
# under its own column name.
test_final = pd.concat(test_preds, ignore_index=True)
test_final = test_final.assign(sales_pred=test_final["sales"])

print("Final Test Shape:", test_final.shape)
test_final[["date","store_nbr","family","sales_pred"]].head()

In [None]:
# Plot Test Forecast Trend
# Sum the predictions over all stores and families for each date.
daily_forecast = test_final.groupby("date", as_index=False)["sales_pred"].sum()

fig, ax = plt.subplots(figsize=(14,5))
ax.plot(daily_forecast["date"], daily_forecast["sales_pred"])
ax.set_title("Test Forecast Trend (Total Sales)")
ax.set_xlabel("Date")
ax.set_ylabel("Predicted Sales")
ax.grid(True)
plt.show()

In [None]:
# Store-wise Forecast Plot
sample_store = test_final["store_nbr"].iloc[0]

# A store has one row per family per date. Sum over families so the line shows
# the store's daily total, rather than one line zig-zagging through every
# family's value on each date.
store_forecast = (
    test_final[test_final["store_nbr"] == sample_store]
    .groupby("date", as_index=False)["sales_pred"]
    .sum()
)

plt.figure(figsize=(14, 5))
plt.plot(store_forecast["date"], store_forecast["sales_pred"])
plt.title(f"Store {sample_store} – Forecast Trend")
plt.xlabel("Date")
plt.ylabel("Predicted Sales")
plt.grid(True)
plt.show()

In [None]:
# Kaggle submission format
# Two columns: the row id and its prediction (renamed to "sales", floored at 0).
submission = (
    test_final[["id", "sales_pred"]]
    .rename(columns={"sales_pred": "sales"})
    .assign(sales=lambda frame: frame["sales"].clip(lower=0))
)

print(submission.head())
print("Submission shape:", submission.shape)


In [None]:
# Save Submission
# Create the output folder if needed, then write the submission without the index.
os.makedirs("outputs", exist_ok=True)
submission_path = "outputs/kaggle_submission.csv"

submission.to_csv(path_or_buf=submission_path, index=False)
print(f"Saved Kaggle submission → {submission_path}")


In [None]:
# Save Full Forecast
# Write the date/store/family-level predictions as both CSV and Parquet.
forecast_cols = [
    "date",
    "store_nbr",
    "family",
    "sales_pred"
]

forecast_df = test_final[forecast_cols].copy()
# NOTE(review): forecast_path is not used by either write below and names a
# file this cell never creates — confirm whether it should replace the Parquet
# target or be removed.
forecast_path = "outputs/forecast.parquet"

# Make the cell runnable on its own instead of relying on an earlier cell
# having created the output folder.
os.makedirs("outputs", exist_ok=True)

# to_csv
forecast_df.to_csv(
    "outputs/test_forecast_full.csv",
    index=False
)

# to_parquet
forecast_df.to_parquet(
    "outputs/test_forecast_full.parquet",
    index=False
)