In [59]:
import pandas as pd
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.utils.plotting import plot_series

import warnings

warnings.filterwarnings("ignore")

In [60]:
import sys
sys.path.append("../utils/")

In [61]:
from sktime_custom_reduce import make_reduction
from lightgbm import LGBMRegressor
from sktime_custom_pipeline import ForecastingPipeline, TransformedTargetForecaster
from pipeline_helpers import get_fold_predictions, get_plotting_df, get_aeso_predictions, generate_step_predictions, generate_step_errors

### Pipeline to evaluate the default hyperparameters on the test set

In [62]:
X_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/X_train.csv",
    parse_dates=["date"],
    index_col="date",
)

y_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/y_train.csv",
    parse_dates=["date"],
    index_col="date",
)

X_train = X_train.sort_values(by="date")
X_train = X_train.asfreq("H")
y_train = y_train.sort_values(by="date")
y_train = y_train.asfreq("H")

In [63]:
X_train = X_train['2023':]
y_train = y_train['2023':]

In [64]:
X = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/test/X_test.csv",
    parse_dates=["date"],
    index_col="date",
)

y = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/test/y_test.csv",
    parse_dates=["date"],
    index_col="date",
)

X = X.sort_values(by="date")
X = X.asfreq("H")
y = y.sort_values(by="date")
y = y.asfreq("H")

In [65]:
X = X[:'2023-02-10']
y = y[:'2023-02-10']

In [66]:
y_test_full = y.copy()
X_test = X.copy()

forecast_len = 12
y_test = y_test_full[:-forecast_len]

In [67]:
def initialize_lgbm_forecaster():
    pipe = ForecastingPipeline(
        steps=[
            (
                "forecaster",
                TransformedTargetForecaster(
                    [
                        (
                            "forecast",
                            make_reduction(
                                LGBMRegressor(
                                    device="gpu",
                                    num_threads=13,
                                    n_estimators=1,
                                ),
                                window_length=24,
                                strategy="direct",
                            ),
                        ),
                    ]
                ),
            ),
        ]
    )

    return pipe

lgbm_pipeline = initialize_lgbm_forecaster()

fh = ForecastingHorizon(np.arange(1, 12 + 1))

In [68]:
rolling_prediction_df = pd.DataFrame(index=y_test_full.index)

In [69]:
lgbm_pipeline.fit(y=y_train, X=X_train, fh=fh)



In [70]:
y_pred = lgbm_pipeline.predict(fh, X=X_train.tail(1))
y_pred.columns = [f"cutoff_hour_{lgbm_pipeline.cutoff.hour[0]}"]
rolling_prediction_df = pd.concat([rolling_prediction_df, y_pred], axis=1)

In [71]:
rolling_prediction_df

Unnamed: 0,cutoff_hour_23
2023-02-01 00:00:00,127.522971
2023-02-01 01:00:00,127.417081
2023-02-01 02:00:00,119.742964
2023-02-01 03:00:00,120.163915
2023-02-01 04:00:00,121.513006
...,...
2023-02-10 19:00:00,
2023-02-10 20:00:00,
2023-02-10 21:00:00,
2023-02-10 22:00:00,


In [72]:
# emulating the rolling prediction for the next hours

for i in range(0, len(y_test), 12):

        new_observation_y, new_observation_X  = y_test_full[i:i+12], X_test[i:i+12]
        
        new_observation_y = new_observation_y.asfreq('H')
        new_observation_X = new_observation_X.asfreq('H')

        print(f'Updating with actual values at {new_observation_y.index[0]}')
        print(f'Cut off before update: {lgbm_pipeline.cutoff}')

        lgbm_pipeline.update(y=new_observation_y, X=new_observation_X, update_params=True)

        print(f'Cut off after update: {lgbm_pipeline.cutoff}')

        lgbm_pipeline.cutoff.freq = 'H'

        cutoff_time = lgbm_pipeline.cutoff
        prediction_for = cutoff_time + pd.DateOffset(hours=i)

        print(f'Predicting for {prediction_for}')
        
        y_pred = lgbm_pipeline.predict(fh, X=new_observation_X)
        
        y_pred.columns = [f"cutoff_hour_{lgbm_pipeline.cutoff.hour[0]}"]
        
        rolling_prediction_df = pd.concat([rolling_prediction_df, y_pred], axis=1)
        
        print(f'Update and prediction done for {new_observation_y.index[0]}')
        print(f'----------------------------------------------------------------------------------')

Updating with actual values at 2023-02-01 00:00:00
Cut off before update: DatetimeIndex(['2023-01-31 23:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-02-01 00:00:00
----------------------------------------------------------------------------------
Updating with actual values at 2023-02-01 12:00:00
Cut off before update: DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-02-01 23:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-02-02 11:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-02-01 12:00:00
---------------------------------------------------------------------------

In [73]:
rolling_prediction_df

Unnamed: 0,cutoff_hour_23,cutoff_hour_11,cutoff_hour_23.1,cutoff_hour_11.1,cutoff_hour_23.2,cutoff_hour_11.2,cutoff_hour_23.3,cutoff_hour_11.3,cutoff_hour_23.4,cutoff_hour_11.4,cutoff_hour_23.5,cutoff_hour_11.5,cutoff_hour_23.6,cutoff_hour_11.6,cutoff_hour_23.7,cutoff_hour_11.7,cutoff_hour_23.8,cutoff_hour_11.8,cutoff_hour_23.9,cutoff_hour_11.9
2023-02-01 00:00:00,127.522971,,,,,,,,,,,,,,,,,,,
2023-02-01 01:00:00,127.417081,,,,,,,,,,,,,,,,,,,
2023-02-01 02:00:00,119.742964,,,,,,,,,,,,,,,,,,,
2023-02-01 03:00:00,120.163915,,,,,,,,,,,,,,,,,,,
2023-02-01 04:00:00,121.513006,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-10 19:00:00,,,,,,,,,,,,,,,,,,,,131.230228
2023-02-10 20:00:00,,,,,,,,,,,,,,,,,,,,125.492980
2023-02-10 21:00:00,,,,,,,,,,,,,,,,,,,,121.527282
2023-02-10 22:00:00,,,,,,,,,,,,,,,,,,,,121.804517


In [74]:
fold_actuals, fold_predictions_list, rmse_list = get_fold_predictions(rolling_prediction_df, y_test_full)

In [75]:
# Print Average RMSE of all folds
print(f"Average RMSE for each fold: {np.mean(rmse_list)}")

Average RMSE for each fold: 83.4223864754236


In [76]:
# Print max RMSE
print(f"Max RMSE for each fold: {np.max(rmse_list)}")

Max RMSE for each fold: 230.6956040746861


In [77]:
# Print hightst 5 RMSE
print(f"Top 5 RMSE for each fold: {np.sort(rmse_list)[-5:]}")

Top 5 RMSE for each fold: [ 79.1014519   83.83795642  83.91754    161.92818857 230.69560407]


In [78]:
y_hist = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/filtered_target_medium.csv",
    parse_dates=["date"],
    index_col="date",
)

y_hist = y_hist.sort_values(by="date")
y_hist = y_hist.asfreq("H")

In [79]:
ddf = get_plotting_df(fold_actuals=fold_actuals, fold_predictions_list=fold_predictions_list, y_hist=y_hist)

In [80]:
import plotly.express as px
fig = px.line(ddf, x="periodstep", y=["HistoricalPrice", "FuturePrice", "Predicted"], animation_frame="timestep")
fig.show()

In [81]:
aeso_predictions_df = get_aeso_predictions(y_test_full.index[0], y_test_full.index[-1])
rmse_aeso_predictions = mean_squared_error(aeso_predictions_df['actual'], aeso_predictions_df['forecast'], squared=False)
print(f"RMSE for the predictions by AESO for the same time period as the test set: {round(rmse_aeso_predictions, 2)} CAD/MWh")

RMSE for the predictions by AESO for the same time period as the test set: 47.66 CAD/MWh
