### Cross Validation
- **LightGBM**
- **63 Folds (~ 1 month)** 

In [1]:
import pandas as pd
import numpy as np
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error

import warnings

warnings.filterwarnings("ignore")

In [2]:
import sys
sys.path.append("../../utils/")

In [3]:
from lightgbm import LGBMRegressor
from sktime_custom_pipeline import ForecastingPipeline, TransformedTargetForecaster
from sktime_custom_reduce import make_reduction

### Notebook to run hyperparameter optimization for the model

As the ForecastingGridSearchCV and ForecastingRandomizedSearchCV of sktime are not capable of utilizing the warm initialization feature of LightGBM, we have to implement our own hyperparameter optimization. 

We rely on an expanding-window approach here. The initial training window runs from the start of the training data up to `train_end` (set to 2022-12-31 in the configuration cell below). We then expand the training window by 12 hours, update the model, and forecast the next 12 hours. We repeat this process until we reach the end of the validation data. We try out different hyperparameter combinations and evaluate the performance of each on every validation fold. The best-performing hyperparameter combination will be used for the final model.

In [4]:
# Specify the end date of the training data. The rest of the code will automatically create the necessary validation folds.
# The results will be saved as pickle files in the same folder as this notebook. They can also be seen in the output of the notebook.
train_end = "2022-12-31"  # upper bound of the training slice (`X_train[:train_end]`)
create_validation_from = "2023-01-01"  # lower bound of the slice used to build the validation folds
device = "gpu"  # forwarded to `LGBMRegressor(device=...)` by the pipeline builders

In [5]:
# Load the processed features and target, indexed by the parsed `date` column,
# then sort chronologically and put both frames on an hourly frequency.
X_train = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/X_train.csv",
        parse_dates=["date"],
        index_col="date",
    )
    .sort_values(by="date")
    .asfreq("H")
)

y_train = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/y_train.csv",
        parse_dates=["date"],
        index_col="date",
    )
    .sort_values(by="date")
    .asfreq("H")
)

In [6]:
# Everything from `create_validation_from` onwards is held out for the validation folds.
X_test = X_train.loc[create_validation_from:]
y_test = y_train.loc[create_validation_from:]

In [7]:
# Keep only the rows up to `train_end` for the initial fit.
# NOTE: this overwrites X_train / y_train, so the validation slices must
# already have been taken from the full frames before this cell runs.
X_train = X_train.loc[:train_end]
y_train = y_train.loc[:train_end]

In [8]:
# Columns that get a log transform: every feature except those that contain
# any value below 5, and except `weekly_profile`.
# A comprehension over `X_train.columns` keeps the original column order, so
# the list is identical on every run (set arithmetic yields an arbitrary,
# hash-seed-dependent order).
_columns_with_small_values = set(X_train.columns[X_train.lt(5).any()])
cols_for_log_transform = [
    column
    for column in X_train.columns
    if column not in _columns_with_small_values and column != "weekly_profile"
]

In [9]:
from sklearn.preprocessing import StandardScaler
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.transformations.compose import ColumnwiseTransformer
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import TimeSeriesSplit
from sktime.transformations.series.boxcox import LogTransformer

def initialize_log_std_lgbm_forecaster(boosting_type, learning_rate, max_depth, num_leaves=None, reg_alpha=None, reg_lambda=None, min_data_in_leaf=None):
    """Build a LightGBM forecasting pipeline with log and standard-scaling steps.

    Exogenous data: the columns listed in the module-level
    ``cols_for_log_transform`` are log-transformed, then the features are
    passed through a ``StandardScaler``. The target is log-transformed and
    standard-scaled before reaching the forecaster, which is
    ``make_reduction(regressor, window_length=24, strategy="direct")``.

    Parameters
    ----------
    boosting_type, learning_rate, max_depth
        Forwarded unchanged to ``LGBMRegressor``.
    num_leaves, reg_alpha, reg_lambda, min_data_in_leaf : optional
        Forwarded to ``LGBMRegressor`` only when not ``None``; when left as
        ``None`` the argument is not passed at all.

    Returns
    -------
    ForecastingPipeline
        The assembled pipeline; this function does not fit it.

    Notes
    -----
    Reads the module-level ``device`` and ``cols_for_log_transform``.
    """
    # Optional hyperparameters go through the constructor instead of being
    # assigned as attributes afterwards. LightGBM's sklearn wrapper only
    # registers parameters that are not in its __init__ signature (such as
    # `min_data_in_leaf`) when they arrive via __init__/set_params; a plain
    # attribute assignment is invisible to get_params() and therefore lost
    # when the estimator is cloned or fitted.
    optional_params = {
        name: value
        for name, value in (
            ("num_leaves", num_leaves),
            ("reg_alpha", reg_alpha),
            ("reg_lambda", reg_lambda),
            ("min_data_in_leaf", min_data_in_leaf),
        )
        if value is not None
    }

    regressor = LGBMRegressor(
        device=device,
        n_jobs=-1,
        n_estimators=1000,
        boosting_type=boosting_type,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **optional_params,
    )

    pipe = ForecastingPipeline(
        steps=[
            ("log_column_transformer", ColumnwiseTransformer(LogTransformer(), columns=cols_for_log_transform)),
            ("std_column_transformer", TabularToSeriesAdaptor(StandardScaler())),
            (
                "forecaster",
                TransformedTargetForecaster(
                    [
                        ("log_column_transformer", LogTransformer()),
                        ("std_column_transformer", TabularToSeriesAdaptor(StandardScaler())),
                        (
                            "forecast",
                            make_reduction(
                                regressor,
                                window_length=24,
                                strategy="direct",
                            ),
                        ),
                    ]
                ),
            ),
        ]
    )

    return pipe


def initialize_log_lgbm_forecaster(boosting_type, learning_rate, max_depth, num_leaves=None, reg_alpha=None, reg_lambda=None, min_data_in_leaf=None):
    """Build a LightGBM forecasting pipeline with log transforms only.

    Exogenous data: the columns listed in the module-level
    ``cols_for_log_transform`` are log-transformed. The target is
    log-transformed before reaching the forecaster, which is
    ``make_reduction(regressor, window_length=24, strategy="direct")``.

    Parameters
    ----------
    boosting_type, learning_rate, max_depth
        Forwarded unchanged to ``LGBMRegressor``.
    num_leaves, reg_alpha, reg_lambda, min_data_in_leaf : optional
        Forwarded to ``LGBMRegressor`` only when not ``None``; when left as
        ``None`` the argument is not passed at all.

    Returns
    -------
    ForecastingPipeline
        The assembled pipeline; this function does not fit it.

    Notes
    -----
    Reads the module-level ``device`` and ``cols_for_log_transform``.
    """
    # Optional hyperparameters go through the constructor instead of being
    # assigned as attributes afterwards. LightGBM's sklearn wrapper only
    # registers parameters that are not in its __init__ signature (such as
    # `min_data_in_leaf`) when they arrive via __init__/set_params; a plain
    # attribute assignment is invisible to get_params() and therefore lost
    # when the estimator is cloned or fitted.
    optional_params = {
        name: value
        for name, value in (
            ("num_leaves", num_leaves),
            ("reg_alpha", reg_alpha),
            ("reg_lambda", reg_lambda),
            ("min_data_in_leaf", min_data_in_leaf),
        )
        if value is not None
    }

    regressor = LGBMRegressor(
        device=device,
        n_jobs=-1,
        n_estimators=1000,
        boosting_type=boosting_type,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **optional_params,
    )

    pipe = ForecastingPipeline(
        steps=[
            ("log_column_transformer", ColumnwiseTransformer(LogTransformer(), columns=cols_for_log_transform)),
            (
                "forecaster",
                TransformedTargetForecaster(
                    [
                        ("log_column_transformer", LogTransformer()),
                        (
                            "forecast",
                            make_reduction(
                                regressor,
                                window_length=24,
                                strategy="direct",
                            ),
                        ),
                    ]
                ),
            ),
        ]
    )

    return pipe

def initialize_lgbm_forecaster(boosting_type, learning_rate, max_depth, num_leaves=None, reg_alpha=None, reg_lambda=None, min_data_in_leaf=None):
    """Build a LightGBM forecasting pipeline without any data transforms.

    The pipeline has a single step: a ``TransformedTargetForecaster`` whose
    only member is ``make_reduction(regressor, window_length=24,
    strategy="direct")``.

    Parameters
    ----------
    boosting_type, learning_rate, max_depth
        Forwarded unchanged to ``LGBMRegressor``.
    num_leaves, reg_alpha, reg_lambda, min_data_in_leaf : optional
        Forwarded to ``LGBMRegressor`` only when not ``None``; when left as
        ``None`` the argument is not passed at all.

    Returns
    -------
    ForecastingPipeline
        The assembled pipeline; this function does not fit it.

    Notes
    -----
    Reads the module-level ``device``.
    """
    # Optional hyperparameters go through the constructor instead of being
    # assigned as attributes afterwards. LightGBM's sklearn wrapper only
    # registers parameters that are not in its __init__ signature (such as
    # `min_data_in_leaf`) when they arrive via __init__/set_params; a plain
    # attribute assignment is invisible to get_params() and therefore lost
    # when the estimator is cloned or fitted.
    optional_params = {
        name: value
        for name, value in (
            ("num_leaves", num_leaves),
            ("reg_alpha", reg_alpha),
            ("reg_lambda", reg_lambda),
            ("min_data_in_leaf", min_data_in_leaf),
        )
        if value is not None
    }

    regressor = LGBMRegressor(
        device=device,
        n_jobs=-1,
        n_estimators=1000,
        boosting_type=boosting_type,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **optional_params,
    )

    pipe = ForecastingPipeline(
        steps=[
            (
                "forecaster",
                TransformedTargetForecaster(
                    [
                        (
                            "forecast",
                            make_reduction(
                                regressor,
                                window_length=24,
                                strategy="direct",
                            ),
                        ),
                    ]
                ),
            ),
        ]
    )

    return pipe

# Forecasting horizon shared by every pipeline: steps 1 through 12.
fh = ForecastingHorizon(np.arange(1, 12 + 1))

In [10]:
# Candidate pipelines to compare; each entry is one preprocessing variant
# combined with one boosting type (same learning rate and depth throughout).
pipelines = [
    initialize_log_lgbm_forecaster(boosting_type="gbdt", learning_rate=0.01, max_depth=20),
    initialize_log_lgbm_forecaster(boosting_type="dart", learning_rate=0.01, max_depth=20),
    initialize_log_std_lgbm_forecaster(boosting_type="gbdt", learning_rate=0.01, max_depth=20),
    initialize_log_std_lgbm_forecaster(boosting_type="dart", learning_rate=0.01, max_depth=20),
    initialize_lgbm_forecaster(boosting_type="dart", learning_rate=0.01, max_depth=20),
]

In [11]:
# For every candidate pipeline: fit on the training window, then walk through
# the validation data in 12-row steps, updating the model and forecasting the
# next horizon each time. Per-fold RMSE and the fitted pipeline are collected
# in `pipeline_assets`.
pipeline_assets = []
update_step = 12  # number of validation rows fed to `update` per fold

for pipeline_number, pipeline in enumerate(pipelines, start=1):

    print(f"Training pipeline {pipeline_number}...")

    pipeline.fit(y=y_train, X=X_train, fh=fh)

    # First fold: forecast made from the end of the training window.
    y_pred = pipeline.predict(fh, X=X_train.tail(1))
    y_pred.columns = [f"cutoff_hour_{pipeline.cutoff.hour[0]}"]

    # Forecasts are gathered in a list and concatenated once after the loop,
    # instead of re-copying a growing DataFrame on every iteration.
    fold_forecasts = [y_pred]

    # A dedicated loop variable is used here so the outer pipeline counter is
    # not overwritten.
    for fold_start in range(0, len(y_test), update_step):

        new_observation_y = y_test[fold_start:fold_start + update_step].asfreq("H")
        new_observation_X = X_test[fold_start:fold_start + update_step].asfreq("H")

        pipeline.update(y=new_observation_y, X=new_observation_X, update_params=True)

        # NOTE(review): presumably the cutoff loses its frequency after
        # `update` in the custom pipeline and must be restored before
        # predicting - confirm against sktime_custom_pipeline.
        pipeline.cutoff.freq = "H"

        y_pred = pipeline.predict(fh, X=new_observation_X)
        y_pred.columns = [f"cutoff_hour_{pipeline.cutoff.hour[0]}"]

        fold_forecasts.append(y_pred)

    # One column per fold; column labels repeat, so columns are addressed by
    # position below.
    rolling_prediction_df = pd.concat(fold_forecasts, axis=1)

    rmse_list = []
    fold_actuals = []
    fold_predictions_list = []

    # The last column is the forecast made after the final update, i.e. past
    # the end of `y_test`, so there are no actuals to score it against.
    for col in range(rolling_prediction_df.shape[1] - 1):

        fold_predictions = rolling_prediction_df.iloc[:, col].dropna()

        fold_indices = fold_predictions.index

        y_test_subset = y_test.loc[fold_indices]

        rmse = np.sqrt(mean_squared_error(y_test_subset, fold_predictions))

        rmse_list.append(rmse)

        fold_actuals.append(y_test_subset)
        fold_predictions_list.append(fold_predictions)

    print(f"Average RMSE for each fold: {np.mean(rmse_list)}")
    print(f"STD RMSE for each fold: {np.std(rmse_list)}")
    print(f"MIN RMSE for each fold: {np.min(rmse_list)}")
    print(f"MAX RMSE for each fold: {np.max(rmse_list)}")

    asset_dict = {"actuals": fold_actuals, "predictions": fold_predictions_list, "rmse": rmse_list, "pipeline": pipeline}

    pipeline_assets.append(asset_dict)

Training pipeline 1...
Average RMSE for each fold: 85.588833878514
STD RMSE for each fold: 83.14453898529717
MIN RMSE for each fold: 8.55774434890682
MAX RMSE for each fold: 350.1433484136448
Training pipeline 2...
Average RMSE for each fold: 103.78493800112403
STD RMSE for each fold: 94.08296945130428
MIN RMSE for each fold: 14.465232869183872
MAX RMSE for each fold: 401.4911092193406
Training pipeline 3...
Average RMSE for each fold: 86.15834783115243
STD RMSE for each fold: 83.16394163647337
MIN RMSE for each fold: 8.834967190727612
MAX RMSE for each fold: 352.8045049505404
Training pipeline 4...
Average RMSE for each fold: 85.73208521996806
STD RMSE for each fold: 85.97135835862902
MIN RMSE for each fold: 9.111699204323294
MAX RMSE for each fold: 360.04744485556625
Training pipeline 5...
Average RMSE for each fold: 89.39899806519014
STD RMSE for each fold: 79.79654334071667
MIN RMSE for each fold: 9.606605968497064
MAX RMSE for each fold: 344.27675614094903


In [12]:
# Scan the collected results and keep the entry with the smallest mean fold RMSE.
best_model = None
lowest_average_rmse = float("inf")

for asset_dict in pipeline_assets:
    # Mean of the per-fold RMSE values for this pipeline.
    avg_rmse = sum(asset_dict["rmse"]) / len(asset_dict["rmse"])

    # Strict comparison: on a tie the earlier pipeline stays selected.
    if not avg_rmse < lowest_average_rmse:
        continue

    lowest_average_rmse, best_model = avg_rmse, asset_dict

print(f'The model with the lowest average RMSE is: {best_model["pipeline"]}')

The model with the lowest average RMSE is: ForecastingPipeline(steps=[('log_column_transformer',
                            ColumnwiseTransformer(columns=['demand_supply_ratio',
                                                           'gas_tng',
                                                           'calgary_load',
                                                           'rolling_mean',
                                                           'total_reserve_margin',
                                                           'hydro_tng',
                                                           'hydro_reserve_margin',
                                                           'northwest_load',
                                                           'exp_moving_avg',
                                                           'rolling_median',
                                                           'rolling_max',
                                                          

In [13]:
import pickle

# Persist every pipeline's actuals, predictions, fold RMSEs and pipeline object.
with open("tuning_results.pkl", "wb") as results_file:
    pickle.dump(pipeline_assets, results_file)

In [17]:
# Summary statistics of the per-fold RMSE for the reported pipeline.
# NOTE(review): index 4 is the fifth entry of `pipelines` (the "dart" pipeline
# without transforms), which is not the pipeline printed as lowest-RMSE by the
# selection cell - confirm this is the model meant to be reported.
reported_fold_rmse = pipeline_assets[4]["rmse"]

rmse = np.mean(reported_fold_rmse)
rmse_std = np.std(reported_fold_rmse)
rmse_min = np.min(reported_fold_rmse)
rmse_max = np.max(reported_fold_rmse)

# Single-element lists, one per column of the results table.
rmse_cv_results = [rmse]
rmse_cv_std = [rmse_std]
rmse_cv_min = [rmse_min]
rmse_cv_max = [rmse_max]

In [18]:
# 12 step prediction errors
cv_summary_columns = {
    "Model": "LGBM",
    "RMSE_CV": rmse_cv_results,
    "RMSE_CV_STD": rmse_cv_std,
    "RMSE_MIN": rmse_cv_min,
    "RMSE_MAX": rmse_cv_max,
}
rmse_cv_results_df = pd.DataFrame(cv_summary_columns).sort_values(by=["RMSE_CV"])

rmse_cv_results_df

Unnamed: 0,Model,RMSE_CV,RMSE_CV_STD,RMSE_MIN,RMSE_MAX
0,LGBM,89.398998,79.796543,9.606606,344.276756


In [19]:
# Persist the cross-validation summary table.
with open('cv_lgbm_results.pkl', 'wb') as results_file:
    pickle.dump(rmse_cv_results_df, results_file)