### Cross Validation
- **ARIMA Model**
- **63 Folds (~ 1 month)** 

In [1]:
import pandas as pd
import numpy as np
from sktime.forecasting.compose import TransformedTargetForecaster
from sklearn.metrics import mean_squared_error
import requests
from sktime.forecasting.model_selection import (
    ExpandingWindowSplitter,
)
from sktime.transformations.series.boxcox import LogTransformer
from sktime.forecasting.model_evaluation import evaluate
from sktime.performance_metrics.forecasting import MeanSquaredError

import warnings
import pickle

warnings.filterwarnings("ignore")

In [None]:
# Specify the end date of the training data (a partial date string such as "2023-01").
# The rest of the code will automatically create the necessary validation folds.
# The results will be saved as a pickle file in the same folder as this notebook.
# They can also be seen in the output of the notebook.
train_end = "2023-01"

In [2]:
# Load the training target, order it chronologically and place it on a regular
# hourly grid (asfreq leaves NaN rows for any hour that is absent from the file).
y = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/y_train.csv",
        index_col="date",
        parse_dates=["date"],
    )
    .sort_values(by="date")
    .asfreq("H")
)

In [3]:
from sktime.forecasting.arima import AutoARIMA
from sktime.transformations.series.detrend import Deseasonalizer


def initialize_arima_forecaster():
    """Assemble the (unfitted) forecasting pipeline evaluated in this notebook.

    The target is passed through three named steps, in order:

    1. ``log_transformer``: a ``LogTransformer``.
    2. ``deseasonalizer_daily``: an additive ``Deseasonalizer`` with a
       seasonal period of 24.
    3. ``residual_forecaster``: an ``AutoARIMA`` with ``d`` fixed at 0, whose
       order search starts at p=0 / q=0 and is capped at p=24 / q=12.

    Returns
    -------
    TransformedTargetForecaster
        A fresh pipeline instance; nothing is fitted here.
    """
    steps = [
        ("log_transformer", LogTransformer()),
        ("deseasonalizer_daily", Deseasonalizer(sp=24, model="additive")),
        (
            "residual_forecaster",
            AutoARIMA(start_p=0, max_p=24, d=0, start_q=0, max_q=12),
        ),
    ]
    return TransformedTargetForecaster(steps)


def get_aeso_predictions(start_date, end_date, api_key=None, timeout=30):
    """Fetch AESO's pool price report for a date range.

    Calls the AESO ``poolPrice`` report endpoint and returns the
    "Pool Price Report" records as a DataFrame, with two numeric columns added:
    ``actual`` (from ``pool_price``) and ``forecast`` (from
    ``forecast_pool_price``).

    Parameters
    ----------
    start_date, end_date : datetime-like
        Objects exposing ``.date()`` (e.g. ``pd.Timestamp``). Only the calendar
        date is sent to the API, formatted as ``YYYY-MM-DD``; the time of day
        is dropped.
    api_key : str, optional
        AESO API key. When omitted, the ``AESO_API_KEY`` environment variable
        is used. The key is deliberately not stored in the notebook.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per record in the report, including the raw API fields plus
        the ``actual`` and ``forecast`` columns.

    Raises
    ------
    RuntimeError
        If no API key is supplied and ``AESO_API_KEY`` is not set.
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    import os  # local import keeps this cell self-contained

    if api_key is None:
        api_key = os.environ.get("AESO_API_KEY")
    if not api_key:
        raise RuntimeError(
            "AESO API key not provided: pass `api_key` or set the "
            "AESO_API_KEY environment variable."
        )

    url = "https://api.aeso.ca/report/v1.1/price/poolPrice"
    headers = {
        "accept": "application/json",
        "X-API-Key": api_key,
    }
    params = {
        "startDate": start_date.date().strftime("%Y-%m-%d"),
        "endDate": end_date.date().strftime("%Y-%m-%d"),
    }

    # A timeout prevents the notebook from hanging forever on a stalled
    # connection; raise_for_status surfaces auth/quota errors directly instead
    # of failing later with an opaque KeyError on the JSON payload.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()

    data = response.json()["return"]["Pool Price Report"]
    df = pd.DataFrame(data)
    df["actual"] = pd.to_numeric(df["pool_price"])
    df["forecast"] = pd.to_numeric(df["forecast_pool_price"])
    return df

In [4]:
# Build the unfitted log -> daily deseasonalizer -> AutoARIMA pipeline that is
# handed to the cross-validation below.
arima_pipeline = initialize_arima_forecaster()

In [5]:
# Keep only observations up to and including `train_end`. With a partial date
# string such as "2023-01", label slicing on the datetime index includes the
# whole month (i.e. through 2023-01-31 23:00).
y = y.loc[:train_end]

In [6]:
# Expanding-window CV: the first 92% of the series is the initial training
# window; each fold forecasts the next 12 steps (fh = 1..12) and the window
# then advances by 12 observations.
cv = ExpandingWindowSplitter(
    fh=np.arange(1, 13),
    step_length=12,
    initial_window=int(len(y) * 0.92),
)

n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")

Number of Folds = 63


In [7]:
# Materialize every fold's split so folds can be looked up by position; each
# entry is indexed below as [0] = training positions (used with y.iloc).
train_test_idx = list(cv.split(y))

In [8]:
# Last timestamp of the first fold's training window, and the last timestamp in the data.
first_train_positions = train_test_idx[0][0]
cv_start, cv_end = y.index[first_train_positions[-1]], y.index[-1]
print(f"We'll start predicting from: {cv_start} until {cv_end} with 12 steps ahead for each hour")

We'll start predicting from: 2022-12-31 08:00:00 until 2023-01-31 23:00:00 with 12 steps ahead for each hour


In [9]:
# Benchmark: RMSE of AESO's own published forecasts over the CV period.
# NOTE(review): get_aeso_predictions sends only calendar dates to the API, so
# the returned window may start earlier in the day than the first test hour —
# confirm the alignment before comparing the numbers too closely.
aeso_predictions_df = get_aeso_predictions(y.iloc[train_test_idx[0][0]].index[-1], y.index[-1])

# `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root of the MSE gives the same RMSE on every version.
rmse_aeso_predictions = np.sqrt(
    mean_squared_error(aeso_predictions_df["actual"], aeso_predictions_df["forecast"])
)
print(f"RMSE for the predictions by AESO for the same time period as the test set: {round(rmse_aeso_predictions, 2)} CAD/MWh")

RMSE for the predictions by AESO for the same time period as the test set: 87.33 CAD/MWh


In [10]:
# Run the cross-validation. The pipeline is refit on every fold, each fold is
# scored with RMSE, and the per-fold data (y_test / y_pred) is kept because the
# one-step analysis further down reads it. Any fold failure is raised
# immediately rather than being recorded as a score.
results = evaluate(
    forecaster=arima_pipeline,
    y=y,
    cv=cv,
    scoring=MeanSquaredError(square_root=True),
    strategy="refit",
    return_data=True,
    error_score="raise",
    backend="loky",
)

In [13]:
# Persist the CV results next to the notebook so they can be reloaded later
# without re-running the evaluation.
with open("cv_arima_results.pkl", "wb") as results_file:
    pickle.dump(results, results_file)

In [2]:
# Reload the cached CV results. Only unpickle files produced by this notebook:
# pickle can execute arbitrary code when loading untrusted data.
with open("cv_arima_results.pkl", "rb") as results_file:
    results = pickle.load(results_file)

# Per-fold RMSE (the scorer was MeanSquaredError with square_root=True).
fold_rmse = results["test_MeanSquaredError"]
rmse = fold_rmse.mean()
rmse_std = fold_rmse.std()
rmse_min = fold_rmse.min()
rmse_max = fold_rmse.max()

# Single-element lists: one entry per model, consumed by the summary table below.
rmse_cv_results = [rmse]
rmse_cv_std = [rmse_std]
rmse_cv_min = [rmse_min]
rmse_cv_max = [rmse_max]

In [3]:
# One-step-ahead comparison: take only the first forecast step of every fold.
one_step_actuals = [fold_actuals.iloc[0] for fold_actuals in results["y_test"]]
one_step_predictions = [fold_preds.iloc[0] for fold_preds in results["y_pred"]]

# `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root of the MSE gives the same RMSE on every version.
our_one_step_rmse = np.sqrt(mean_squared_error(one_step_actuals, one_step_predictions))
print(f"Our one-step RMSE: {our_one_step_rmse} CAD/MWh")

Our one-step RMSE: 73.27568961121742 CAD/MWh


In [4]:
# 12 step prediction errors
rmse_cv_results_df = pd.DataFrame(
    {"Model": "ARIMA", "RMSE_CV": rmse_cv_results, "RMSE_CV_STD": rmse_cv_std, "RMSE_MIN": rmse_cv_min, "RMSE_MAX": rmse_cv_max}
).sort_values(by=["RMSE_CV"])

rmse_cv_results_df

Unnamed: 0,Model,RMSE_CV,RMSE_CV_STD,RMSE_MIN,RMSE_MAX
0,ARIMA,83.850093,73.117051,9.47205,307.5202


In [6]:
# Uncomment to plot the 63 folds of the 12 step predictions

# from sktime.utils.plotting import plot_series

# for i in range(len(results["y_pred"])):
#     fig, ax = plot_series(
#         results["y_pred"].iloc[i], results["y_test"].iloc[i], labels=["Predictions", "Actuals"]
#     )