In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from lightgbm import LGBMRegressor

from data_helpers import MaskedData, Data, Model, Feature, Features, TARGET, DATE_COLUMN

### Set parameters

In [2]:
# LightGBM hyper-parameters; the same settings are passed to every regressor
# created in this notebook.
params = dict(
    boosting_type="gbdt",
    objective="tweedie",
    tweedie_variance_power=1.1,
    metric="rmse",
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2047,  # 2**11 - 1
    min_data_in_leaf=4095,  # 2**12 - 1
    feature_fraction=0.5,
    n_estimators=1400,
    boost_from_average=False,
)

# Evaluation set-up.
# We assume that on the forecast date the sales of that date have already been observed.
first_forecast_date = pd.Timestamp("2013-01-01")
# Models are refitted every 60 days.
retrain_frequency = 60
# Forecast horizons handed to the data helpers (presumably in days -- confirm in data_helpers).
horizons = [1, 7, 14]

### Read features

In [3]:
def _with_parsed_dates(frame):
    """Return a new frame whose "date" column has been run through `pd.to_datetime`."""
    return frame.assign(date=pd.to_datetime(frame["date"]))


# Load the pre-computed feature table and index it by date, in ascending order.
data = (
    pd.read_parquet("../data/features.parquet")
    .pipe(_with_parsed_dates)
    .set_index("date")
    .sort_index()
)

# Every remaining column that is neither the target nor the date column is a model input.
non_feature_columns = (TARGET, DATE_COLUMN)
feature_names = [name for name in data.columns if name not in non_feature_columns]
features = Features(feature_names=feature_names)

### Initialize models

In [4]:
# Direct method: one independent regressor per horizon, each wrapped together
# with a data view restricted to that single horizon.
direct_method_models = {}
for horizon in horizons:
    regressor = LGBMRegressor(verbose=-1, **params)
    horizon_data = Data(data, horizons=[horizon], features=features)
    direct_method_models[horizon] = Model(model=regressor, data=horizon_data)

# Multi-horizon alternative: a single regressor with the same hyper-parameters,
# fed by a MaskedData view that spans all horizons at once.
multi_horizon_model = Model(
    model=LGBMRegressor(verbose=-1, **params),
    data=MaskedData(data, horizons=horizons, features=features),
)

### Run cross validation
We forecast every day, but retrain only once every `retrain_frequency` days.

In [5]:
# Walk-forward evaluation: produce forecasts once per day for `n_forecast_days`
# consecutive days starting at `first_forecast_date`, and refit all models only
# on every `retrain_frequency`-th day (the very first day included).
n_forecast_days = 365

# One frame of predictions per (forecast date, model) that had non-empty test data.
results = []

for i, forecast_date in enumerate(pd.date_range(first_forecast_date, periods=n_forecast_days, freq="D")):

    if i % retrain_frequency == 0:
        print(f"retraining model on {forecast_date}")

        # refit the separate per-horizon (direct method) models
        for horizon, model in direct_method_models.items():
            X_train, y_train = model.data.get_train_data(forecast_date)
            model.model.fit(X_train, y_train)

        # refit the one model that covers all horizons
        X_train, y_train = multi_horizon_model.data.get_train_data(forecast_date)
        multi_horizon_model.model.fit(X_train, y_train)

    # predict with the separate per-horizon models
    for horizon, model in direct_method_models.items():
        # NOTE(review): three values are unpacked here, but only two for the
        # multi-horizon model further down -- confirm that Data.get_test_data
        # and MaskedData.get_test_data really return differently sized tuples.
        X_test, y_test, _ = model.data.get_test_data(forecast_date)

        if X_test.empty:
            # nothing to predict for this horizon on this date
            continue

        y_pred = model.model.predict(X_test)
        result = X_test.assign(y_pred=y_pred, y_test=y_test, forecast_date=forecast_date, model="single-horizon")
        results.append(result)

    # predict with the multi-horizon model
    X_test, y_test = multi_horizon_model.data.get_test_data(forecast_date)

    # Guard with a positive condition instead of a trailing `continue`, so that
    # code appended to the loop body later is not skipped by accident.
    if not X_test.empty:
        y_pred = multi_horizon_model.model.predict(X_test)
        result = X_test.assign(y_pred=y_pred, y_test=y_test, forecast_date=forecast_date, model="multi-horizon")
        results.append(result)

retraining model on 2013-01-01 00:00:00
retraining model on 2013-03-02 00:00:00
retraining model on 2013-05-01 00:00:00
retraining model on 2013-06-30 00:00:00
retraining model on 2013-08-29 00:00:00
retraining model on 2013-10-28 00:00:00
retraining model on 2013-12-27 00:00:00


In [6]:
# Persist all collected forecasts in one parquet file. The file name embeds the
# current time and the retrain frequency that was used for this run.
if not results:
    # pd.concat would raise a generic "No objects to concatenate" error; fail
    # with a message that points at the actual cause instead.
    raise ValueError("No forecasts were produced; there is nothing to write.")

results_dir = Path("../results")
# Create the output folder up front so a missing directory cannot discard the
# outcome of the cross-validation run at the very last step.
results_dir.mkdir(parents=True, exist_ok=True)

ts = datetime.now().strftime("%Y%m%d_%H_%M_%S")
file_name = f"{ts}_retrain_freq_{retrain_frequency}.parquet"
pd.concat(results).to_parquet(results_dir / file_name)
print(f"Written results to {file_name}")

Written results to 20240222_13_40_36_retrain_freq_60.parquet
