### Cross Validation
- **LightGBM**
- **63 Folds (~ 1 month)** 

In [1]:
import pandas as pd
import numpy as np
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error

import warnings

warnings.filterwarnings("ignore")

In [2]:
import sys
sys.path.append("../../../utils/")

In [3]:
from lightgbm import LGBMRegressor
from sktime_custom_pipeline import ForecastingPipeline, TransformedTargetForecaster
from sktime_custom_reduce import make_reduction

### Notebook to run hyperparameter optimization for the model

As the ForecastingGridSearchCV and ForecastingRandomizedSearchCV of sktime are not capable of utilizing the warm initialization feature of LightGBM, we have to implement our own hyperparameter optimization. 

We're relying on an expanding window approach here. The initial training window is defined by `train_start` and `train_end` in the configuration cell below (currently Jan 1st 2022 to Dec 31st 2022). We then expand the training window by 12 hours and retrain the model. We repeat this process until we reach the end of the validation data. We'll try out different hyperparameter combinations and evaluate their performance on every validation fold. The best performing hyperparameter combination will be used for the final model.

In [4]:
# Specify the date boundaries of the training data. The rest of the code will automatically create the necessary validation folds.
# The results will be saved as a pickle file in the same folder as this notebook. They can also be seen in the output of the notebook.
# train_start: rows before this date are dropped right after loading.
# train_end: last day (inclusive) of the initial training window.
# create_validation_from: first day of the hold-out data that the validation folds are cut from.
# device: intended LightGBM device type.
train_start = "2022-01-01"
train_end = "2022-12-31"
create_validation_from = "2023-01-01"
device = "gpu"

In [5]:
def _load_hourly_frame(csv_url, start):
    """Read a CSV indexed by its ``date`` column and put it on an hourly grid.

    Parameters
    ----------
    csv_url : str
        Location of the CSV file; it must contain a ``date`` column.
    start : str
        Rows with a timestamp earlier than this label are dropped.

    Returns
    -------
    pandas.DataFrame
        Frame sorted by timestamp, starting at ``start``, re-indexed to an
        hourly frequency (hours missing from the file become NaN rows).
    """
    frame = pd.read_csv(csv_url, parse_dates=["date"], index_col="date")
    # Sort *before* slicing: label-based slicing is only reliable on a
    # monotonic DatetimeIndex.
    return frame.sort_index().loc[start:].asfreq("H")


X_train = _load_hourly_frame(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/X_train.csv",
    train_start,
)

y_train = _load_hourly_frame(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/y_train.csv",
    train_start,
)

# Features eligible for the log transform: every column whose values never
# drop below 3, except "weekly_profile". The check runs on the full frame
# (training and hold-out period) because it has not been split yet.
# NOTE(review): the threshold 3 presumably keeps log inputs safely positive - confirm.
# A list comprehension (instead of set arithmetic) keeps the column order
# deterministic across kernel restarts.
has_value_below_3 = X_train.lt(3).any()
cols_for_log_transform = [
    col
    for col in X_train.columns
    if not has_value_below_3[col] and col != "weekly_profile"
]

In [6]:
# Hold-out slice: every row from `create_validation_from` onward.
# This cell has to run before X_train / y_train are truncated at `train_end`,
# otherwise the slices below come back empty.
X_test = X_train.loc[create_validation_from:]
y_test = y_train.loc[create_validation_from:]

In [7]:
# Restrict the training frames to the initial window, ending at `train_end`
# (inclusive). The names are rebound, so the untruncated frames are gone after
# this cell.
X_train = X_train.loc[:train_end]
y_train = y_train.loc[:train_end]

In [8]:
from sklearn.preprocessing import StandardScaler
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.transformations.compose import ColumnwiseTransformer
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import TimeSeriesSplit
from sktime.transformations.series.boxcox import LogTransformer


def initialize_lgbm_forecaster(**lgbm_params):
    """Build an unfitted LightGBM forecasting pipeline.

    Exogenous features: the columns in ``cols_for_log_transform`` are
    log-transformed, then all features are standardised. Target: it is
    log-transformed and standardised, then forecast by a reduction of
    ``LGBMRegressor`` using the "direct" strategy over a 24-step window.

    Parameters
    ----------
    **lgbm_params
        Optional keyword arguments forwarded to ``LGBMRegressor``. They
        override the defaults: ``device`` from the notebook configuration
        cell, ``num_threads=12`` and ``n_estimators=200``. Calling the
        function without arguments reproduces the default configuration.

    Returns
    -------
    ForecastingPipeline
        A new, unfitted pipeline instance.
    """
    # Take the device from the configuration cell instead of hard-coding it,
    # so switching between GPU and CPU only needs one edit.
    regressor_params = {"device": device, "num_threads": 12, "n_estimators": 200}
    regressor_params.update(lgbm_params)

    target_forecaster = TransformedTargetForecaster(
        [
            ("log_column_transformer", LogTransformer()),
            ("std_column_transformer", TabularToSeriesAdaptor(StandardScaler())),
            (
                "forecast",
                make_reduction(
                    LGBMRegressor(**regressor_params),
                    window_length=24,
                    strategy="direct",
                ),
            ),
        ]
    )

    pipe = ForecastingPipeline(
        steps=[
            ("log_column_transformer", ColumnwiseTransformer(LogTransformer(), columns=cols_for_log_transform)),
            ("std_column_transformer", TabularToSeriesAdaptor(StandardScaler())),
            ("forecaster", target_forecaster),
        ]
    )

    return pipe


# Relative forecasting horizon: steps 1 through 12 ahead of the cutoff.
fh = ForecastingHorizon(np.arange(1, 12 + 1))

In [9]:
# Candidate pipelines to evaluate, one entry per hyperparameter configuration.
# Only the default LightGBM configuration is listed here.
pipelines = [initialize_lgbm_forecaster()]

In [10]:
# Rolling-origin evaluation. Each candidate pipeline is fitted once on the
# training window. It then repeatedly receives the next block of hold-out
# observations, is updated, and forecasts the horizon `fh` again.
update_step_hours = 12  # number of hourly hold-out rows fed to the model per fold

pipeline_assets = []
for pipeline_idx, pipeline in enumerate(pipelines):

    print(f"Training pipeline {pipeline_idx+1}...")

    pipeline.fit(y=y_train, X=X_train, fh=fh)

    # First forecast: issued from the end of the training window.
    y_pred = pipeline.predict(fh, X=X_train.tail(1))
    y_pred.columns = [f"cutoff_hour_{pipeline.cutoff.hour[0]}"]

    # Collect one single-column frame per forecast and concatenate once after
    # the loop, instead of growing a DataFrame inside it.
    fold_forecasts = [y_pred]

    for fold_start in range(0, len(y_test), update_step_hours):

        print(f"Predicting fold {fold_start//update_step_hours+1}...")

        fold_end = fold_start + update_step_hours
        new_observation_y = y_test[fold_start:fold_end].asfreq('H')
        new_observation_X = X_test[fold_start:fold_end].asfreq('H')

        # Feed the newly observed block to the model and refit its parameters.
        pipeline.update(y=new_observation_y, X=new_observation_X, update_params=True)

        # NOTE(review): the cutoff frequency is forced to hourly here;
        # presumably it is lost during update - confirm against the custom pipeline.
        pipeline.cutoff.freq = 'H'

        y_pred = pipeline.predict(fh, X=new_observation_X)
        # Column labels carry only the cutoff hour, so they repeat across
        # folds; columns are therefore addressed by position below.
        y_pred.columns = [f"cutoff_hour_{pipeline.cutoff.hour[0]}"]

        fold_forecasts.append(y_pred)

    # One column per forecast; rows are the union of all forecast timestamps.
    rolling_prediction_df = pd.concat(fold_forecasts, axis=1)

    rmse_list = []
    fold_actuals = []
    fold_predictions_list = []

    # The last column is the forecast made after the final update and is
    # skipped: presumably its timestamps lie beyond the end of y_test, so
    # there are no actuals to score it against.
    for col in range(rolling_prediction_df.shape[1] - 1):

        # Dropping NaN leaves only the timestamps this fold actually forecast.
        fold_predictions = rolling_prediction_df.iloc[:, col].dropna()

        y_test_subset = y_test.loc[fold_predictions.index]

        rmse = np.sqrt(mean_squared_error(y_test_subset, fold_predictions))

        rmse_list.append(rmse)
        fold_actuals.append(y_test_subset)
        fold_predictions_list.append(fold_predictions)

    print(f"Average RMSE for each fold: {np.mean(rmse_list)}")
    print(f"STD RMSE for each fold: {np.std(rmse_list)}")
    print(f"MIN RMSE for each fold: {np.min(rmse_list)}")
    print(f"MAX RMSE for each fold: {np.max(rmse_list)}")

    asset_dict = {"actuals": fold_actuals, "predictions": fold_predictions_list, "rmse": rmse_list, "pipeline": pipeline}

    pipeline_assets.append(asset_dict)

Training pipeline 1...
Predicting fold 1...
Predicting fold 2...
Predicting fold 3...
Predicting fold 4...
Predicting fold 5...
Predicting fold 6...
Predicting fold 7...
Predicting fold 8...
Predicting fold 9...
Predicting fold 10...
Predicting fold 11...
Predicting fold 12...
Predicting fold 13...
Predicting fold 14...
Predicting fold 15...
Predicting fold 16...
Predicting fold 17...
Predicting fold 18...
Predicting fold 19...
Predicting fold 20...
Predicting fold 21...
Predicting fold 22...
Predicting fold 23...
Predicting fold 24...
Predicting fold 25...
Predicting fold 26...
Predicting fold 27...
Predicting fold 28...
Predicting fold 29...
Predicting fold 30...
Predicting fold 31...
Predicting fold 32...
Predicting fold 33...
Predicting fold 34...
Predicting fold 35...
Predicting fold 36...
Predicting fold 37...
Predicting fold 38...
Predicting fold 39...
Predicting fold 40...
Predicting fold 41...
Predicting fold 42...
Predicting fold 43...
Predicting fold 44...
Predicting fold 45