In [1]:
import pandas as pd
import numpy as np
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
import plotly.express as px
from sktime.forecasting.model_selection import temporal_train_test_split
import time

import warnings

warnings.filterwarnings("ignore")

In [2]:
import sys
sys.path.append("../utils/")
import pipeline_helpers as ph

### Pipeline to evaluate the optimized hyperparameters on the dates where AESO predictions are available.
- The forecaster is updated with the actual pool price one hour at a time.
- Complete 6-hour AESO predictions were logged for the dates from 26 May 2023 to 31 May 2023.

In [3]:
# Load the feature matrix indexed by its parsed "date" column, order the rows
# by date, and conform the index to an hourly frequency.
X = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/complete_data/features.csv",
        index_col="date",
        parse_dates=["date"],
    )
    .sort_values(by="date")
    .asfreq("H")
)

# Same treatment for the target.
y = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/complete_data/target.csv",
        index_col="date",
        parse_dates=["date"],
    )
    .sort_values(by="date")
    .asfreq("H")
)

In [4]:
# Train/test split

# Number of hourly steps forecast from each cutoff.
forecast_len = 12

# Select a test size equal to the window for which actual AESO predictions
# are available (4 days of hourly observations).
test_size = 24 * 4

# Hold out the last `test_size + forecast_len` rows as the test set.
# NOTE(review): the extra `forecast_len` rows presumably let the final cutoff
# still be scored over a full horizon -- confirm against the rolling helper.
y_train, y_test, X_train, X_test = temporal_train_test_split(
    y, X, test_size=test_size + forecast_len
)

# Re-apply the hourly frequency to every split.
y_train = y_train.asfreq("H")
y_test = y_test.asfreq("H")
X_train = X_train.asfreq("H")
X_test = X_test.asfreq("H")

In [5]:
# Build the forecasting pipeline via the project helper (per its name, an
# LGBM forecaster with the optimized hyperparameters).
lgbm_pipeline = ph.initialize_optimized_lgbm_forecaster()

# Relative forecasting horizon 1..forecast_len. Derived from `forecast_len`
# rather than a hard-coded 12 so the horizon and the split stay in sync.
fh = ForecastingHorizon(np.arange(1, forecast_len + 1))

In [6]:
# `forecast_len` is already defined in the train/test split cell; it is not
# re-assigned here so the value has a single source of truth.

# Number of hours the rolling window advances between successive updates.
step_length = 1

In [7]:
# Time the model fit. `time.perf_counter()` is a monotonic, high-resolution
# clock, so the measured duration is unaffected by system clock adjustments
# (unlike `time.time()`).
start_time = time.perf_counter()
lgbm_pipeline.fit(y=y_train, X=X_train, fh=fh)
end_time = time.perf_counter()
print(f"Fit time: {end_time - start_time} seconds")

Fit time: 1081.1804971694946 seconds


In [8]:
# Keep the fit duration (seconds) under its own name: `start_time`/`end_time`
# are overwritten when the prediction step is timed.
fit_time = end_time - start_time

In [9]:
# Time the rolling update/predict loop over the test window (this measures
# prediction, not training). `time.perf_counter()` is monotonic, so the
# duration is unaffected by system clock adjustments.
start_time = time.perf_counter()
# `step_length` (= 1, the value previously hard-coded here) is passed instead
# of a literal. NOTE(review): assumes the sixth positional parameter of
# `ph.get_rolling_predictions` is the step length -- confirm in the helper.
rolling_prediction_df = ph.get_rolling_predictions(
    lgbm_pipeline,
    X_train,
    X_test,
    y_test,
    fh,
    step_length,
    forecast_len,
    verbose=True,
)
end_time = time.perf_counter()
print(f"Generating Predictions Time: {end_time - start_time} seconds")

Updating with actual values at 2023-05-26 13:00:00
Cut off before update: DatetimeIndex(['2023-05-26 12:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-05-26 13:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-05-26 13:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-05-26 13:00:00
----------------------------------------------------------------------------------
Updating with actual values at 2023-05-26 14:00:00
Cut off before update: DatetimeIndex(['2023-05-26 13:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-05-26 14:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-05-26 15:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-05-26 14:00:00
---------------------------------------------------------------------------

In [10]:
# Duration (seconds) of the rolling prediction step timed in the previous cell.
prediction_time = end_time - start_time

In [11]:
# Derive per-fold actuals, predictions and RMSE values from the rolling
# predictions; the helper also prints the mean and standard deviation of the
# fold RMSEs. NOTE(review): what constitutes a "fold" is defined inside
# `ph.get_fold_predictions` -- presumably one forecast per cutoff; confirm.
fold_actuals, fold_predictions_list, rmse_list = ph.get_fold_predictions(rolling_prediction_df, y_test)

Average RMSE for each fold: 115.04575413921445
STD RMSE for each fold: 61.14944235715754


In [12]:
# Regroup the rolling predictions by forecast step.
# NOTE(review): presumably one entry per step 1..forecast_len -- confirm
# against `ph.generate_step_predictions`.
predictions = ph.generate_step_predictions(rolling_prediction_df, y_test, forecast_len)

In [13]:
# Compute the error of the step-wise predictions against `y_test`; the helper
# prints one RMSE line per step. `rmses` is used below to build the results
# table.
actuals, rmses = ph.generate_step_errors(predictions, y_test, forecast_len)

1 Step RMSE for model: 94.63643995276831
2 Step RMSE for model: 117.42258844823421
3 Step RMSE for model: 115.68593069587409
4 Step RMSE for model: 126.4747788558844
5 Step RMSE for model: 136.80022124983512
6 Step RMSE for model: 143.051882588718
7 Step RMSE for model: 144.72724439932472
8 Step RMSE for model: 140.81093188406712
9 Step RMSE for model: 137.47604171527522
10 Step RMSE for model: 131.59741121679005
11 Step RMSE for model: 135.71407415942423
12 Step RMSE for model: 128.14616875478313


In [14]:
# Report the AESO forecast error for the span from the first to the last
# timestamp of `y_test`, as a baseline for comparison.
ph.get_aeso_predictions(y_test.index[0], y_test.index[-1])

One step prediction errors for AESO forecasts: 120.57 CAD/MWh.
As these are one step predictions, the error should be lesser than ours since ours is 12 step prediction errors.


In [15]:
# Collect the per-step RMSEs as single-row columns named "<step>_step_rmse",
# pairing steps 1..forecast_len with the values in `rmses`.
data = {}
for step, step_rmse in zip(range(1, forecast_len + 1), rmses):
    data[f"{step}_step_rmse"] = [step_rmse]

# Add the summary columns: mean fold RMSE (rounded to 2 decimals) and the two
# timings measured earlier in the notebook.
error_df = pd.DataFrame(data).assign(
    avg_fold_rmse=round(np.mean(rmse_list), 2),
    fit_time=fit_time,
    prediction_time=prediction_time,
)

# Persist the results table next to the notebook.
error_df.to_csv("aeso_test_results.csv", index=False)