In [1]:
import pandas as pd
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNetCV
from sktime.forecasting.model_selection import (
    ForecastingGridSearchCV,
    ExpandingWindowSplitter,
)
from sktime.forecasting.compose import MultiplexForecaster
from sklearn.neighbors import KNeighborsRegressor
from sktime.forecasting.ets import AutoETS
from sktime.transformations.series.boxcox import LogTransformer


import warnings

data_df = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/supply_load_price.csv",
    parse_dates=["Date (MST)"],
    index_col="Date (MST)",
)

data_df = data_df.sort_index()
# select dates after dec 1 2022
data_df = data_df.loc["2022-06-01":]

In [2]:
X = data_df.drop(columns=["price"])
y = data_df["price"]

In [3]:
window = 24
X['rolling_mean'] = y.rolling(window).mean().rolling(2).mean().shift(-window // 2)
X['rolling_std'] = y.rolling(window).std().rolling(2).mean().shift(-window // 2)
X['rolling_min'] = y.rolling(window).min().rolling(2).mean().shift(-window // 2)
X['rolling_max'] = y.rolling(window).max().rolling(2).mean().shift(-window // 2)
X['rolling_median'] = y.rolling(window).median().rolling(2).mean().shift(-window // 2)

In [4]:
X['exp_moving_avg'] = y.ewm(span=24).mean()

In [5]:
X = X.dropna()
y = y[X.index]

In [6]:
selected_features = [
    "ail",
    "gas_price",
    "gas_tng",
    "coal_tng",
    "wind_tng",
    "gas_avail",
    "wind_avail",
    "gas_reserve_margin",
    "coal_reserve_margin",
    "wind_reserve_margin",
    "other_reserve_margin",
    "gas_supply_mix",
    "coal_supply_mix",
    "wind_supply_mix",
    "other_supply_mix",
    "total_reserve_margin",
    "demand_supply_ratio",
    "fossil_fuel_ratio",
    "rolling_mean",
    "rolling_std",
    "rolling_min",
    "rolling_max",
    "rolling_median",
    "exp_moving_avg",
]

In [7]:
X = X.loc[
    "2023-01-01":"2023-01-31",
    selected_features,
]
y = y.loc["2023-01-01":"2023-01-31"]

In [8]:
cv = ExpandingWindowSplitter(
    initial_window=int(len(X) * 0.94), step_length=1, fh=np.arange(1, 13)
)

n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")

Number of Folds = 34


In [9]:
from sklearn.preprocessing import StandardScaler
from sktime.forecasting.compose import ForecastingPipeline
from sktime.transformations.series.adapt import TabularToSeriesAdaptor

pipe = ForecastingPipeline(
    steps=[
        ("standardize", TabularToSeriesAdaptor(StandardScaler())),
        (
            "forecaster",
            TransformedTargetForecaster(
                [
                    ("log_transformer", LogTransformer()),
                    # ("deseasonalizer_weekly", Deseasonalizer(sp=24*7, model="additive")),
                    (
                        "forecast",
                        make_reduction(
                            RandomForestRegressor(n_estimators=200, n_jobs=-1),
                            window_length=24,
                            strategy="direct",
                        ),
                    ),
                ]
            ),
        ),
    ]
)

In [10]:
pipe.get_params()

{'steps': [('standardize',
   TabularToSeriesAdaptor(transformer=StandardScaler())),
  ('forecaster',
   TransformedTargetForecaster(steps=[('log_transformer', LogTransformer()),
                                      ('forecast',
                                       DirectTabularRegressionForecaster(estimator=RandomForestRegressor(n_estimators=200, n_jobs=-1),
                                                                         window_length=24))]))],
 'standardize': TabularToSeriesAdaptor(transformer=StandardScaler()),
 'forecaster': TransformedTargetForecaster(steps=[('log_transformer', LogTransformer()),
                                    ('forecast',
                                     DirectTabularRegressionForecaster(estimator=RandomForestRegressor(n_estimators=200, n_jobs=-1),
                                                                       window_length=24))]),
 'standardize__fit_in_transform': False,
 'standardize__transformer': StandardScaler(),
 'standardize__t

In [11]:
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_regression
from scipy.stats import uniform, randint
from sktime.forecasting.model_selection import ForecastingRandomizedSearchCV
from sktime.performance_metrics.forecasting import MeanSquaredError

# Define the parameter distributions for randomized search
param_dist = {
    'forecaster__forecast__estimator__min_samples_split': [2, 5, 10],  
    'forecaster__forecast__estimator__min_samples_leaf': [1, 2, 4], 
    'forecaster__forecast__estimator__max_depth': [None, 10, 20, 30],
    'forecaster__forecast__estimator__max_features': ['auto', 'sqrt']
}

random_search = ForecastingGridSearchCV(pipe, cv=cv, param_grid=param_dist, scoring=MeanSquaredError(square_root=True), n_jobs=-1, verbose=1, error_score="raise")

In [27]:
random_search.fit(y, X, fh=np.arange(1, 13))

Fitting 11 folds for each of 1 candidates, totalling 11 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [28]:
random_search.best_score_

203.66191734454935

In [29]:
random_search.best_params_

{'forecaster__forecast__estimator__alphas': [0.1, 0.01, 0.001],
 'forecaster__forecast__estimator__cv': 8,
 'forecaster__forecast__estimator__l1_ratio': 0.9971848109388686,
 'forecaster__forecast__estimator__max_iter': 1000,
 'forecaster__forecast__estimator__n_alphas': 187}

In [30]:
random_search.n_best_scores_

[203.66191734454935]

In [31]:
random_search.cv_results_

Unnamed: 0,mean_test_MeanSquaredError,mean_fit_time,mean_pred_time,params,rank_test_MeanSquaredError
0,203.661917,8.729878,0.007478,{'forecaster__forecast__estimator__alphas': [0...,1.0
