<a href="https://colab.research.google.com/github/htnphu/retail-demand-forecasting/blob/main/m5_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt

from matplotlib import pyplot as plt
plt.style.use('seaborn-v0_8')

import gc

import logging
import warnings

warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Fall_2025/CPSC_5305_Intro_to_DS/data/"

In [None]:
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype

        # --- FIX: Skip Datetime Columns ---
        if pd.api.types.is_datetime64_any_dtype(col_type):
            continue
        # -----------------------------------

        # Only process numeric columns for min/max
        if col_type != object and not isinstance(col_type, pd.CategoricalDtype):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                # This is where the error occurred previously for datetime types
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        # Handle categorical columns without min/max
        elif isinstance(col_type, pd.CategoricalDtype):
            df[col] = df[col].cat.as_unordered()  # Ensure no ordering assumption
        else:
            df[col] = df[col].astype('category')  # Convert objects to category
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage decreased from {start_mem:.2f} Mb to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

In [None]:
print("Loading and reducing memory usage for data files...")

# load the datasets and apply memory reduction
df = pd.read_csv(f'{DATA_PATH}m5_processed.csv')
df = reduce_mem_usage(df)

print("\nInitial data loading complete.")

Loading and reducing memory usage for data files...
Memory usage decreased from 13093.96 Mb to 2767.02 Mb (78.9% reduction)

Initial data loading complete.


In [None]:

print(df.head().to_markdown())

|    | id                          | item_id     | dept_id   | cat_id   | store_id   | state_id   | d   |   sales |   wm_yr_wk |   sell_price | date       | weekday   |   wday |   month |   year |   snap_CA |   snap_TX |   snap_WI |   day_of_week |   day_of_month |   day_of_year |   weekend | event_name_1   | event_type_1   | event_name_2   | event_type_2   |   price_mean |   price_relative_to_mean |   price_change_lag |
|---:|:----------------------------|:------------|:----------|:---------|:-----------|:-----------|:----|--------:|-----------:|-------------:|:-----------|:----------|-------:|--------:|-------:|----------:|----------:|----------:|--------------:|---------------:|--------------:|----------:|:---------------|:---------------|:---------------|:---------------|-------------:|-------------------------:|-------------------:|
|  0 | FOODS_1_001_CA_1_evaluation | FOODS_1_001 | FOODS_1   | FOODS    | CA_1       | CA         | d_1 |       3 |      11101 |            2 | 2011-0

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 29 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   id                      category
 1   item_id                 category
 2   dept_id                 category
 3   cat_id                  category
 4   store_id                category
 5   state_id                category
 6   d                       category
 7   sales                   int16   
 8   wm_yr_wk                int16   
 9   sell_price              float32 
 10  date                    category
 11  weekday                 category
 12  wday                    int8    
 13  month                   int8    
 14  year                    int16   
 15  snap_CA                 int8    
 16  snap_TX                 int8    
 17  snap_WI                 int8    
 18  day_of_week             int8    
 19  day_of_month            int8    
 20  day_of_year             int16   
 21  weeken

In [None]:
print(df.shape)

(59181090, 29)


In [None]:
print(df.isnull().sum().to_markdown())

|                        |   0 |
|:-----------------------|----:|
| id                     |   0 |
| item_id                |   0 |
| dept_id                |   0 |
| cat_id                 |   0 |
| store_id               |   0 |
| state_id               |   0 |
| d                      |   0 |
| sales                  |   0 |
| wm_yr_wk               |   0 |
| sell_price             |   0 |
| date                   |   0 |
| weekday                |   0 |
| wday                   |   0 |
| month                  |   0 |
| year                   |   0 |
| snap_CA                |   0 |
| snap_TX                |   0 |
| snap_WI                |   0 |
| day_of_week            |   0 |
| day_of_month           |   0 |
| day_of_year            |   0 |
| weekend                |   0 |
| event_name_1           |   0 |
| event_type_1           |   0 |
| event_name_2           |   0 |
| event_type_2           |   0 |
| price_mean             |   0 |
| price_relative_to_mean |   0 |
| price_ch

In [None]:
df.sort_values(by=['id', 'date'], inplace=True)

In [None]:
print(df.head().to_markdown())

|    | id                          | item_id     | dept_id   | cat_id   | store_id   | state_id   | d   |   sales |   wm_yr_wk |   sell_price | date       | weekday   |   wday |   month |   year |   snap_CA |   snap_TX |   snap_WI |   day_of_week |   day_of_month |   day_of_year |   weekend | event_name_1   | event_type_1   | event_name_2   | event_type_2   |   price_mean |   price_relative_to_mean |   price_change_lag |
|---:|:----------------------------|:------------|:----------|:---------|:-----------|:-----------|:----|--------:|-----------:|-------------:|:-----------|:----------|-------:|--------:|-------:|----------:|----------:|----------:|--------------:|---------------:|--------------:|----------:|:---------------|:---------------|:---------------|:---------------|-------------:|-------------------------:|-------------------:|
|  0 | FOODS_1_001_CA_1_evaluation | FOODS_1_001 | FOODS_1   | FOODS    | CA_1       | CA         | d_1 |       3 |      11101 |            2 | 2011-0

In [None]:
gc.collect()

71

# Modelling

In [None]:
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, mean_squared_error

In [None]:
# features engineering
# create features grouped by item ('id')
# .shift(28) looks 28 days back *for that item
df['sales_lag_7'] = df.groupby('id')['sales'].shift(7)
df['sales_lag_28'] = df.groupby('id')['sales'].shift(28)

# .rolling(28) gets a 28-day window for that item
# .shift(1) ensures we only use data from before the current day
df['sales_rolling_mean_28'] = df.groupby('id')['sales'].shift(1).rolling(28).mean()
print("New features created.")

New features created.


In [None]:
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                        'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

numeric_features = ['wday', 'month', 'year', 'day_of_week', 'day_of_month', 'day_of_year',
                    'weekend', 'snap_CA', 'snap_TX', 'snap_WI',
                    'sell_price', 'price_mean', 'price_relative_to_mean', 'price_change_lag', 'sales_lag_7', 'sales_lag_28', 'sales_rolling_mean_28']

features = categorical_features + numeric_features

target = 'sales'

print(f"Original shape before dropna: {df.shape}")
df.dropna(inplace=True)
print(f"New shape after dropna: {df.shape}")

X = df[features]
y = df[target]

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

print(f"\nStarting {n_splits}-Fold Time Series Cross-Validation with Tweedie...")

rmse_scores = []
fold = 1

for train_index, val_index in tscv.split(X):
    print(f"--- Fold {fold}/{n_splits} ---")

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    print(f"Training data size: {len(X_train)}")
    print(f"Validation data size: {len(X_val)}")

    model = lgb.LGBMRegressor(
        # core params
        objective='tweedie',  # use 'tweedie': perfect for sales data (counts with many zeros)
        metric='rmse',        # tell the model to optimize for RMSE (Root Mean Squared Error)
        # device='gpu',
        # max_bin=255,

        # speed and performance
        n_estimators=1000,    # build up to 1000 simple "decision trees" (will stop early)
        learning_rate=0.03,   # how quickly the model learns
        n_jobs=-1,            # use all available CPU cores to train faster

        # overfitting guardrails
        subsample=0.8,        # use 80% of rows for each tree
        colsample_bytree=0.8, # use 80% of features for each tree

        random_state=42       # Ensures getting the exact same results every time run this
    )

    print("Training model...")
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(50, verbose=False)],
              # specify LightGBM which features are categorical (very very important part)
              categorical_feature=categorical_features
             )

    val_preds = model.predict(X_val)

    val_preds[val_preds < 0] = 0 # sales cannot be negative

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}\n")
    rmse_scores.append(rmse)

    fold += 1

print("--- Cross-Validation Complete ---")
print(f"Mean RMSE across {n_splits} folds: {np.mean(rmse_scores):.4f}")
print(f"Std Dev of RMSE across {n_splits} folds: {np.std(rmse_scores):.4f}")
print("\nIndividual Fold RMSEs:")
print(rmse_scores)

Original shape before dropna: (58327370, 32)
New shape after dropna: (58327370, 32)

Starting 5-Fold Time Series Cross-Validation with Tweedie...
--- Fold 1/5 ---
Training data size: 9721230
Validation data size: 9721228
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.536613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2106
[LightGBM] [Info] Number of data points in the train set: 9721230, number of used features: 27
[LightGBM] [Info] Start training from score 0.083667
Fold 1 RMSE: 4.3457

--- Fold 2/5 ---
Training data size: 19442458
Validation data size: 9721228
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.068622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [None]:
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                        'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

numeric_features = ['wday', 'month', 'year', 'day_of_week', 'day_of_month', 'day_of_year',
                    'weekend', 'snap_CA', 'snap_TX', 'snap_WI',
                    'sell_price', 'price_mean', 'price_relative_to_mean', 'price_change_lag', 'sales_lag_7', 'sales_lag_28', 'sales_rolling_mean_28']

features = categorical_features + numeric_features

target = 'sales'

print(f"Original shape before dropna: {df.shape}")
df.dropna(inplace=True)
print(f"New shape after dropna: {df.shape}")

X = df[features]
y = df[target]

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

print(f"\nStarting {n_splits}-Fold Time Series Cross-Validation with Tweedie...")

rmse_scores = []
fold = 1

for train_index, val_index in tscv.split(X):
    print(f"--- Fold {fold}/{n_splits} ---")

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    print(f"Training data size: {len(X_train)}")
    print(f"Validation data size: {len(X_val)}")

    model = lgb.LGBMRegressor(
        # core params
        objective='tweedie',  # use 'tweedie': perfect for sales data (counts with many zeros)
        metric='rmse',        # tell the model to optimize for RMSE (Root Mean Squared Error)
        # device='gpu',
        # max_bin=255,

        # speed and performance
        n_estimators=1000,    # build up to 1000 simple "decision trees" (will stop early)
        learning_rate=0.05,   # how quickly the model learns -> 0.05 is a good choice.
        n_jobs=-1,            # use all available CPU cores to train faster

        # overfitting guardrails
        subsample=0.8,        # use 80% of rows for each tree
        colsample_bytree=0.8, # use 80% of features for each tree

        random_state=42       # Ensures getting the exact same results every time run this
    )

    print("Training model...")
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(50, verbose=False)],
              # specify LightGBM which features are categorical (very very important part)
              categorical_feature=categorical_features
             )

    val_preds = model.predict(X_val)

    val_preds[val_preds < 0] = 0 # sales cannot be negative

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}\n")
    rmse_scores.append(rmse)

    fold += 1

print("--- Cross-Validation Complete ---")
print(f"Mean RMSE across {n_splits} folds: {np.mean(rmse_scores):.4f}")
print(f"Std Dev of RMSE across {n_splits} folds: {np.std(rmse_scores):.4f}")
print("\nIndividual Fold RMSEs:")
print(rmse_scores)

Original shape before dropna: (59181090, 32)
New shape after dropna: (58327370, 32)

Starting 5-Fold Time Series Cross-Validation with Tweedie...
--- Fold 1/5 ---
Training data size: 9721230
Validation data size: 9721228
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.550373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2106
[LightGBM] [Info] Number of data points in the train set: 9721230, number of used features: 27
[LightGBM] [Info] Start training from score 0.083667
Fold 1 RMSE: 4.3262

--- Fold 2/5 ---
Training data size: 19442458
Validation data size: 9721228
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.841234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [None]:
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                        'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

numeric_features = ['wday', 'month', 'year', 'day_of_week', 'day_of_month', 'day_of_year',
                    'weekend', 'snap_CA', 'snap_TX', 'snap_WI',
                    'sell_price', 'price_mean', 'price_relative_to_mean', 'price_change_lag']

features = categorical_features + numeric_features

target = 'sales'

print(f"Original shape before dropna: {df.shape}")
df.dropna(inplace=True)
print(f"New shape after dropna: {df.shape}")

X = df[features]
y = df[target]

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

print(f"\nStarting {n_splits}-Fold Time Series Cross-Validation with Tweedie...")

rmse_scores = []
fold = 1

for train_index, val_index in tscv.split(X):
    print(f"--- Fold {fold}/{n_splits} ---")

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    print(f"Training data size: {len(X_train)}")
    print(f"Validation data size: {len(X_val)}")

    model = lgb.LGBMRegressor(
        # core params
        objective='tweedie',  # use 'tweedie': perfect for sales data (counts with many zeros)
        metric='rmse',        # tell the model to optimize for RMSE (Root Mean Squared Error)
        # device='gpu',
        # max_bin=255,

        # speed and performance
        n_estimators=1000,    # build up to 1000 simple "decision trees" (will stop early)
        learning_rate=0.05,   # how quickly the model learns -> 0.05 is a good choice.
        n_jobs=-1,            # use all available CPU cores to train faster

        # overfitting guardrails
        subsample=0.8,        # use 80% of rows for each tree
        colsample_bytree=0.8, # use 80% of features for each tree

        random_state=42       # Ensures getting the exact same results every time run this
    )

    print("Training model...")
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(50, verbose=False)],
              # specify LightGBM which features are categorical (very very important part)
              categorical_feature=categorical_features
             )

    val_preds = model.predict(X_val)

    val_preds[val_preds < 0] = 0 # sales cannot be negative

    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold} RMSE: {rmse:.4f}\n")
    rmse_scores.append(rmse)

    fold += 1

print("--- Cross-Validation Complete ---")
print(f"Mean RMSE across {n_splits} folds: {np.mean(rmse_scores):.4f}")
print(f"Std Dev of RMSE across {n_splits} folds: {np.std(rmse_scores):.4f}")
print("\nIndividual Fold RMSEs:")
print(rmse_scores)

Original shape before dropna: (59181090, 29)
New shape after dropna: (59181090, 29)

Starting 5-Fold Time Series Cross-Validation with Tweedie...
--- Fold 1/5 ---
Training data size: 9863515
Validation data size: 9863515
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.355888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 9863515, number of used features: 24
[LightGBM] [Info] Start training from score 0.080907
Fold 1 RMSE: 6.0155

--- Fold 2/5 ---
Training data size: 19727030
Validation data size: 9863515
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.668574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

### 24 features (before adding lag and rolling features)

Original shape before dropna: (59181090, 29)
New shape after dropna: (59181090, 29)

Starting 5-Fold Time Series Cross-Validation with Tweedie...
--- Fold 1/5 ---
Training data size: 9863515
Validation data size: 9863515
Training model...
[LightGBM] [Warning] Categorical features with more bins than the configured maximum bin number found.
[LightGBM] [Warning] For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.355888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1744
[LightGBM] [Info] Number of data points in the train set: 9863515, number of used features: 24
[LightGBM] [Info] Start training from score 0.080907
Fold 1 RMSE: 6.0155

--- Fold 2/5 ---
Training data size: 19727030
Validation data size: 9863515
Training model...
[LightGBM] [Warning] Categorical features with more bins than the configured maximum bin number found.
[LightGBM] [Warning] For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.668574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2236
[LightGBM] [Info] Number of data points in the train set: 19727030, number of used features: 24
[LightGBM] [Info] Start training from score 0.362749
Fold 2 RMSE: 5.6752

--- Fold 3/5 ---
Training data size: 29590545
Validation data size: 9863515
Training model...
[LightGBM] [Warning] Categorical features with more bins than the configured maximum bin number found.
[LightGBM] [Warning] For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.037235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2737
[LightGBM] [Info] Number of data points in the train set: 29590545, number of used features: 24
[LightGBM] [Info] Start training from score 0.467928
Fold 3 RMSE: 1.9722

--- Fold 4/5 ---
Training data size: 39454060
Validation data size: 9863515
Training model...
[LightGBM] [Warning] Categorical features with more bins than the configured maximum bin number found.
[LightGBM] [Warning] For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.901304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3234
[LightGBM] [Info] Number of data points in the train set: 39454060, number of used features: 24
[LightGBM] [Info] Start training from score 0.290435
Fold 4 RMSE: 2.6805

--- Fold 5/5 ---
Training data size: 49317575
Validation data size: 9863515
Training model...
[LightGBM] [Warning] Categorical features with more bins than the configured maximum bin number found.
[LightGBM] [Warning] For categorical features, max_bin and max_bin_by_feature may be ignored with a large number of categories.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.003314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3741
[LightGBM] [Info] Number of data points in the train set: 49317575, number of used features: 24
[LightGBM] [Info] Start training from score 0.259362
Fold 5 RMSE: 0.8889

--- Cross-Validation Complete ---
Mean RMSE across 5 folds: 3.4464
Std Dev of RMSE across 5 folds: 2.0430

Individual Fold RMSEs:
[np.float64(6.015495228490148), np.float64(5.675159014221771), np.float64(1.9721585384492206), np.float64(2.6804973653766853), np.float64(0.8888765510898942)]


Mean RMSE across 5 folds: 3.4464: This is the main performance score. On average, across all 5 folds, the model's prediction of sales for any given item on any given day was off by 3.45 units.

Std Dev of RMSE... 2.0430: This number is quite high, and it's explained by the individual fold scores. It means the model's performance wasn't consistent across the different time periods.

Individual Fold RMSEs: [6.01, 5.67, 1.97, 2.68, 0.88]: This is the most important part.

Model performed poorly on the early data (Folds 1 and 2, RMSE of ~6.0). It had less data to learn from and was trying to predict farther into the future.

It got significantly better as it got more data.

The last fold (Fold 5) is the most important one. It used the most data (49 million rows) to predict the most recent time period. An RMSE of 0.89 is an excellent score. It means that by the end, the model was, on average, off by less than one unit.

-> model is learning very effectively. The high overall average (3.44) is just skewed by the early, "dumber" folds. The "true" model performance, when trained on all this data, is likely closer to the 0.89 RMSE.

In [None]:
# features = [
#     'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
#     'wday', 'month', 'year', 'day_of_week', 'day_of_month', 'day_of_year',
#     'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
#     'snap_CA', 'snap_TX', 'snap_WI',
#     'sell_price', 'price_relative_to_mean', 'price_change_lag'
# ]

# target = 'sales'

# df_train = df.dropna(subset=['sales'])
# X = df_train[features]
# y = df_train[target]

# n_splits = 10
# tscv = TimeSeriesSplit(n_splits=n_splits)

# print(f"Starting {n_splits}-Fold Time Series Cross-Validation...")

# rmse_scores = []
# fold = 1

# for train_index, val_index in tscv.split(X):
#     print(f"--- Fold {fold}/{n_splits} ---")

#     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#     print(f"Training data size: {len(X_train)}")
#     print(f"Validation data size: {len(X_val)}")

#     # LightGBM model with Tweedie regression.
#     # because it is excellent for 'count' data with many zeros, like sales.
#     model = lgb.LGBMRegressor(
#         objective='tweedie',
#         metric='rmse',
#         n_estimators=1000,
#         learning_rate=0.05,
#         num_leaves=31,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         random_state=42,
#         n_jobs=-1
#     )

#     print("Training model...")
#     model.fit(X_train, y_train,
#               eval_set=[(X_val, y_val)],
#               eval_metric='rmse',
#               callbacks=[lgb.early_stopping(50, verbose=False)])

#     val_preds = model.predict(X_val)

#     val_preds[val_preds < 0] = 0

#     rmse = np.sqrt(mean_squared_error(y_val, val_preds))
#     print(f"Fold {fold} RMSE: {rmse}\n")
#     rmse_scores.append(rmse)

#     fold += 1

# print("\n--- Cross-Validation Complete ---")
# print(f"Mean RMSE across {n_splits} folds: {np.mean(rmse_scores):.4f}")
# print(f"Std Dev of RMSE across {n_splits} folds: {np.std(rmse_scores):.4f}")
# print("\nIndividual Fold RMSEs:")
# print(rmse_scores)

In [None]:
# --- Load Data (Assuming this is already done) ---
# calendar_df = pd.read_csv('calendar.csv')
# prices_df = pd.read_csv('sell_prices.csv')
# sales_df = pd.read_csv('sales_train_validation.csv')
# ... (all your data merging and feature engineering from above) ...
# df = ... (your final merged and cleaned DataFrame)

# Let's assume 'df' is the DataFrame created in your previous steps.
# We'll drop rows with NaNs from the lag/rolling features you created.
# This also removes the first 27 days for which sales_lag_28 is undefined.
df.dropna(inplace=True)

# Define feature columns (X) and target column (y)
# We will exclude 'id', 'd', 'date', and 'wm_yr_wk' (since 'year' and 'month' capture time)
target = 'sales'
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                        'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
numeric_features = ['wday', 'month', 'year', 'day_of_week', 'day_of_month', 'day_of_year',
                    'weekend', 'snap_CA', 'snap_TX', 'snap_WI',
                    'sell_price', 'price_mean', 'price_relative_to_mean', 'price_change_lag']

# Combine all features
features = categorical_features + numeric_features

X = df[features]
y = df[target]

# Create a preprocessor
# OneHotEncoder for categorical features
# 'passthrough' for numeric features (they are already scaled or are simple counts/flags)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('num', 'passthrough', numeric_features)
    ],
    remainder='drop'  # Drop any columns not specified
)

# Create the model pipeline
# 1. Preprocess the data (OneHotEncoding)
# 2. Apply Ridge Regression (a good linear baseline)
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Ridge(alpha=1.0, random_state=42))])

# --- Time Series Cross-Validation ---
# The user's request for "80% training and 20% testing with cross-validation"
# is best implemented using TimeSeriesSplit.
# With n_splits=5, the first fold is ~17% train, 83% test.
# The last fold is ~83% train, 17% test, which is close to the 80/20 split requested.

n_splits = 5  # Using 5 splits instead of 10 to manage computational time
tscv = TimeSeriesSplit(n_splits=n_splits)

print(f"Starting {n_splits}-Fold Time Series Cross-Validation with Ridge Regression...")

rmse_scores = []
fold = 1

for train_index, val_index in tscv.split(X):
    print(f"--- Fold {fold}/{n_splits} ---")

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    print(f"Training data size: {len(X_train)}")
    print(f"Validation data size: {len(X_val)}")

    # Fit the pipeline
    print("Training model...")
    model.fit(X_train, y_train)

    # Predict on validation data
    print("Evaluating model...")
    val_preds = model.predict(X_val)

    # Ensure predictions are non-negative
    val_preds[val_preds < 0] = 0

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold} RMSE: {rmse}\n")
    rmse_scores.append(rmse)

    fold += 1

print("\n--- Cross-Validation Complete ---")
print(f"Mean RMSE across {n_splits} folds: {np.mean(rmse_scores):.4f}")
print(f"Std Dev of RMSE across {n_splits} folds: {np.std(rmse_scores):.4f}")
print("\nIndividual Fold RMSEs:")
print(rmse_scores)

Starting 5-Fold Time Series Cross-Validation with Ridge Regression...
--- Fold 1/5 ---
Training data size: 9863515
Validation data size: 9863515
Training model...
Evaluating model...
Fold 1 RMSE: 5.968871816043665

--- Fold 2/5 ---
Training data size: 19727030
Validation data size: 9863515
Training model...
