In [None]:
import sys
import pickle
from pathlib import Path

# Add parent directory to path to import src module
sys.path.append(str(Path().resolve().parent.parent))

import pandas as pd
from darts import TimeSeries


from src.modules.model_handling.model_catalogue import MODEL_CATALOGUE



In [None]:
# Read the pipeline-run parquet files: the two covariate frames followed by
# the daily train/test target frames. Paths are relative to this notebook.
(
    future_covariates_df,
    past_covariates_df,
    target_train_df,
    target_test_df,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../../data/pipeline_runs/future_covariates.parquet",
        "../../data/pipeline_runs/past_covariates.parquet",
        "../../data/pipeline_runs/train_targets_daily.parquet",
        "../../data/pipeline_runs/test_targets_daily.parquet",
    )
)

In [None]:
# Column roles inside the pipeline-run frames.
future_covariates_columns = ["is_holiday"]
past_covariates_columns = [
    "num_transactions",
    "num_unique_customers",
    "num_unique_articles",
    "avg_basket_size",
    "avg_unit_price",
]
target_columns = ["Quantity"]
time_index_column = "InvoiceDate"


def _to_daily_series(frame, value_columns):
    """Convert ``frame`` into a daily-frequency darts ``TimeSeries``.

    Args:
        frame: DataFrame containing the ``time_index_column`` column and
            every column named in ``value_columns``.
        value_columns: Column names to keep as the series' components.

    Returns:
        The ``TimeSeries`` built by ``TimeSeries.from_dataframe`` with
        ``freq='D'``, ``fill_missing_dates=True`` and ``fillna_value=0``.
    """
    # NOTE(review): fillna_value=0 is applied to every column, including
    # averages such as avg_unit_price - confirm 0 is the intended value for
    # days that have no data.
    return TimeSeries.from_dataframe(
        frame,
        time_col=time_index_column,
        value_cols=value_columns,
        fill_missing_dates=True,
        fillna_value=0,
        freq='D',
    )


# All four series share the same construction; only frame and columns differ.
future_covariates = _to_daily_series(future_covariates_df, future_covariates_columns)
past_covariates = _to_daily_series(past_covariates_df, past_covariates_columns)
target_train = _to_daily_series(target_train_df, target_columns)
target_test = _to_daily_series(target_test_df, target_columns)


## Configuration Parameters

Set model and evaluation parameters here to run experiments with different models.

In [None]:
from darts.metrics import rmse, wmape

# --- Model selection ---------------------------------------------------
# Key looked up in MODEL_CATALOGUE; change it to try a different model.
MODEL_KEY = "random_forest_7777_cyclic_day_month_scaled"

# --- Backtest settings -------------------------------------------------
FORECAST_HORIZON = 7     # days forecast ahead at each backtest step
BACKTEST_STRIDE = 7      # days between consecutive backtest steps
BACKTEST_START = 0.7     # begin backtesting 70% of the way through the training data
BACKTEST_RETRAIN = True  # refit at every step (slower, but more realistic)
METRICS = [rmse, wmape]  # metric functions passed to the backtest

# --- Test-set settings -------------------------------------------------
TEST_FORECAST_DAYS = 30  # number of days predicted on the test set

# Echo the active configuration so it is recorded in the cell output.
print(
    f"Model: {MODEL_KEY}",
    f"Backtest: {FORECAST_HORIZON}-day horizon, stride={BACKTEST_STRIDE}, retrain={BACKTEST_RETRAIN}",
    f"Test forecast: {TEST_FORECAST_DAYS} days",
    sep="\n",
)

## Model Evaluation Strategy

**Backtest (on training data)**: 
- Simulates real-world forecasting by repeatedly training and testing on historical data
- Used to validate model performance and tune hyperparameters
- Safe to run multiple times during development

**Test Set Evaluation (final check)**:
- Holdout data that the model has never seen
- Used ONLY ONCE for final unbiased performance assessment
- Evaluating on it repeatedly and tuning on the results leaks test-set information into model selection (data leakage)

**Proper workflow**:
1. Backtest on training data → tune model
2. Final evaluation on test set → report results

In [None]:
# STEP 1: Backtest on training data (for model validation & tuning)
# mae, smape and mase are not used in this cell; the test-set evaluation
# cell relies on them being imported here.
from darts.metrics import rmse, mae, smape, mase, wmape

print("="*60)
print("STEP 1: BACKTEST ON TRAINING DATA")
print("="*60)
print(f"Model: {MODEL_KEY}")
print("Purpose: Validate model can generalize to unseen time periods")
print()

# MODEL_CATALOGUE maps a key to a zero-argument factory for the model.
model = MODEL_CATALOGUE[MODEL_KEY]()

# Show the model's parameters when it exposes them (empty dict otherwise).
model_config = getattr(model, 'model_params', {})
print(f"Model config: {model_config}")
print()

# Fit on the full training series; the test-set cell calls predict() on
# this same `model` object.
model.fit(
    series=target_train,
    past_covariates=past_covariates,
    future_covariates=future_covariates,
)

# Backtest on training data - simulates forecasting at multiple points in history.
# NOTE(review): confirm the installed darts version leaves `model` untouched
# when backtest() retrains; if it refits in place, refit on target_train
# before predicting on the test set.
print("Running backtest...")
backtest_metrics = model.backtest(
    series=target_train,  # Use TRAINING data for backtest
    past_covariates=past_covariates,
    future_covariates=future_covariates,
    forecast_horizon=FORECAST_HORIZON,
    stride=BACKTEST_STRIDE,
    retrain=BACKTEST_RETRAIN,
    # Only the last point of each forecast is kept, so the scores below
    # measure the error at exactly FORECAST_HORIZON days ahead.
    last_points_only=True,
    metric=METRICS,
    start=BACKTEST_START,
)

# Pair every score with the metric that produced it, so the labels stay
# correct when METRICS is edited in the configuration cell.
for metric_fn, score in zip(METRICS, backtest_metrics):
    print(f"Training Backtest {metric_fn.__name__.upper()} ({FORECAST_HORIZON}-day horizon): {score:.2f}")

# State what the scores mean for the chosen retrain mode.
if BACKTEST_RETRAIN:
    print("Each forecast comes from a model refit on the data preceding its forecast window")
else:
    print("Forecasts come from the model fitted above, which has already seen the backtest period")
print()

In [None]:
import json

# Key the scores by metric name: the metric callables themselves are not
# valid JSON object keys, so json.dumps() raises TypeError on them.
# float() converts NumPy scalar scores into plain Python floats for the encoder.
score_dict = {
    metric_fn.__name__: float(score)
    for metric_fn, score in zip(METRICS, backtest_metrics)
}
print(json.dumps(score_dict, indent=2))

In [None]:
# STEP 2: Final evaluation on test set (ONLY ONCE)
print(
    "="*60,
    "STEP 2: FINAL EVALUATION ON TEST SET (HOLDOUT DATA)",
    "="*60,
    "Purpose: Unbiased assessment of model performance on unseen data",
    "",
    sep="\n",
)

# Forecast TEST_FORECAST_DAYS steps with the model fitted in the backtest cell.
forecast = model.predict(
    n=TEST_FORECAST_DAYS,
    past_covariates=past_covariates,
    future_covariates=future_covariates,
)

# Score the forecast against the holdout targets. The four metrics below
# share the (actual, predicted) call shape; MASE additionally takes the
# training series as its in-sample reference.
test_rmse, test_mae, test_wmape, test_smape = (
    metric_fn(target_test, forecast)
    for metric_fn in (rmse, mae, wmape, smape)
)
test_mase = mase(target_test, forecast, insample=target_train)

print(
    "Test Set Performance:",
    f"  RMSE:   {test_rmse:.2f} (root mean squared error)",
    f"  MAE:    {test_mae:.2f} (mean absolute error)",
    f"  WMAPE:  {test_wmape:.2f}% (weighted MAPE, handles zeros)",
    f"  sMAPE:  {test_smape:.2f}% (symmetric MAPE, 0-200% range)",
    f"  MASE:   {test_mase:.2f} (vs naive forecast, <1 is good)",
    "",
    "⚠️  This is the TRUE model performance - only check this once!",
    "   If you iterate based on test results, you're leaking information",
    sep="\n",
)

In [None]:
# Put the error metrics in context by comparing them with the scale of the data.
# Pull the raw arrays once and reuse them for every statistic below.
train_values = target_train.values()
test_values = target_test.values()
forecast_values = forecast.values()
test_mean = test_values.mean()

print("\n" + "="*60, "DATA SCALE & ERROR CONTEXT", "="*60, sep="\n")

print(
    "\nTarget Variable Statistics:",
    f"  Train mean: {train_values.mean():.2f}",
    f"  Train std:  {train_values.std():.2f}",
    f"  Train range: [{train_values.min():.0f}, {train_values.max():.0f}]",
    f"  Zero values in train: {(train_values == 0).sum()} / {len(target_train)}",
    f"\n  Test mean: {test_mean:.2f}",
    f"  Test std:  {test_values.std():.2f}",
    f"  Zero values in test: {(test_values == 0).sum()} / {len(target_test)}",
    sep="\n",
)

print(
    "\nForecast Statistics:",
    f"  Forecast mean: {forecast_values.mean():.2f}",
    f"  Forecast std:  {forecast_values.std():.2f}",
    sep="\n",
)

# Express the absolute errors relative to the average test-set value.
print(
    "\nError in Context:",
    f"  RMSE as % of test mean: {(test_rmse / test_mean) * 100:.1f}%",
    f"  MAE as % of test mean:  {(test_mae / test_mean) * 100:.1f}%",
    sep="\n",
)

In [None]:
# Visualize forecast vs actual
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 6))
target_test.plot(label="Actual Test Data")
# Take the legend text from the forecast itself so it stays correct when the
# forecast length is changed (the label used to be hard-coded to 30 days).
forecast.plot(label=f"{len(forecast)}-Day Forecast")
plt.title("Forecast vs Actual Test Data")
plt.xlabel("Date")
plt.ylabel("Quantity")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Check forecast statistics (values extracted once and reused).
forecast_values = forecast.values()
print("\n=== Forecast Statistics ===")
print(f"Forecast mean: {forecast_values.mean():.2f}")
print(f"Forecast std: {forecast_values.std():.2f}")
print(f"Forecast min: {forecast_values.min():.2f}")
print(f"Forecast max: {forecast_values.max():.2f}")