In [None]:
import pandas as pd
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNetCV
from sktime.forecasting.model_selection import (
    ForecastingGridSearchCV,
    ExpandingWindowSplitter,
)
from sktime.forecasting.compose import MultiplexForecaster
from sklearn.neighbors import KNeighborsRegressor
from sktime.forecasting.ets import AutoETS
from sktime.transformations.series.boxcox import LogTransformer


import warnings

# Load the processed supply/load/price table, using its "Date (MST)" column
# (parsed as datetimes) as the index.
data_df = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/supply_load_price.csv",
    index_col="Date (MST)",
    parse_dates=["Date (MST)"],
)

# Put the rows in chronological order, then keep everything from 2022-06-01 onward.
data_df = data_df.sort_index().loc["2022-06-01":]

In [None]:
# Target series is the "price" column; every remaining column becomes an
# exogenous feature.
y = data_df["price"]
X = data_df.drop(columns="price")

In [None]:
# Rolling summary statistics of the price, used as model features.
#
# Each statistic is computed over `window` consecutive prices, smoothed with a
# 2-term mean, and then lagged by one step (`shift(1)`), so the value stored at
# time t is built only from prices strictly before t.
#
# The previous version used `shift(-window // 2)` (= -12), which centred the
# window on t and therefore baked the current price and roughly the next 12
# prices (the values being forecast) into the features.
#
# NOTE(review): if the forecaster is given X for the future horizon timestamps
# at predict time, a lag of at least the maximum forecast horizon is required
# for these features to be available in production -- confirm against how the
# reducer consumes X.
window = 24
price_windows = y.rolling(window)
for stat in ("mean", "std", "min", "max", "median"):
    X[f"rolling_{stat}"] = price_windows.agg(stat).rolling(2).mean().shift(1)

In [None]:
# Exponentially weighted mean of the price (span of 24 observations), lagged by
# one step so the feature at time t is built only from prices before t. Without
# the lag, the largest weight in the average sits on the price at t itself,
# i.e. on the value the model is asked to predict.
X['exp_moving_avg'] = y.ewm(span=24).mean().shift(1)

In [None]:
# Discard every feature row that has a missing value in any column, then
# restrict the target to the timestamps that survived so X and y stay aligned.
X = X.dropna()
y = y.loc[X.index]

In [None]:
# Columns fed to the models, in the order they will appear in X.
# The first groups are presumably columns of the loaded table (verify against
# the CSV); the last group is the set of price-derived columns added to X in
# the cells above.
selected_features = (
    ["ail", "gas_price"]
    + ["gas_tng", "coal_tng", "wind_tng"]
    + ["gas_avail", "wind_avail"]
    + [
        "gas_reserve_margin",
        "coal_reserve_margin",
        "wind_reserve_margin",
        "other_reserve_margin",
    ]
    + ["gas_supply_mix", "coal_supply_mix", "wind_supply_mix", "other_supply_mix"]
    + ["total_reserve_margin", "demand_supply_ratio", "fossil_fuel_ratio"]
    + [
        "rolling_mean",
        "rolling_std",
        "rolling_min",
        "rolling_max",
        "rolling_median",
        "exp_moving_avg",
    ]
)

In [None]:
# Restrict both the features and the target to January 2023 (label-based slice,
# both end dates inclusive) and keep only the selected feature columns.
january_2023 = slice("2023-01-01", "2023-01-31")
X = X.loc[january_2023, selected_features]
y = y.loc[january_2023]

In [None]:
# Expanding-window cross-validation: the first training window covers 94% of
# the rows, the window grows by one observation per fold, and every fold is
# scored on the next 1..12 steps.
cv = ExpandingWindowSplitter(
    fh=np.arange(1, 13),
    initial_window=int(0.94 * len(X)),
    step_length=1,
)

# Report how many folds this splitter produces for the target series.
n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")

In [None]:
from sklearn.preprocessing import StandardScaler
from sktime.forecasting.compose import ForecastingPipeline
from sktime.transformations.series.adapt import TabularToSeriesAdaptor

# ElasticNetCV turned into a forecaster through tabular reduction: "direct"
# strategy with a window of 24 observations.
elasticnet_reducer = make_reduction(
    ElasticNetCV(n_jobs=-1),
    strategy="direct",
    window_length=24,
)

# Target-side chain: log-transform the series, then forecast it with the
# reduced ElasticNetCV model.
elasticnet_target_chain = TransformedTargetForecaster(
    steps=[
        ("log_transformer", LogTransformer()),
        ("forecast", elasticnet_reducer),
    ]
)

# Full pipeline: a StandardScaler step (wrapped for series input) followed by
# the target-side forecaster chain.
pipe = ForecastingPipeline(
    steps=[
        ("standardize", TabularToSeriesAdaptor(StandardScaler())),
        ("forecaster", elasticnet_target_chain),
    ]
)

In [None]:
# Tag the index of both the target and the feature frame with an hourly
# frequency ("H").
for indexed_obj in (y, X):
    indexed_obj.index.freq = "H"

In [None]:
from sktime.forecasting.model_evaluation import evaluate
from sktime.performance_metrics.forecasting import MeanSquaredError

# Labels for the rows of the results table built in the next cell.
list_models = ["elasticnet_pipeline"]

# Per-model mean and standard deviation of the fold-level RMSE.
rmse_cv_results, rmse_cv_std = [], []

# RMSE scorer: mean squared error with the square root applied.
rmse_metric = MeanSquaredError(square_root=True)

for model_name in list_models:
    print(model_name)
    # Refit `pipe` on every expanding-window fold; any failure inside a fold is
    # re-raised instead of being recorded as a score.
    results = evaluate(
        forecaster=pipe,
        y=y,
        X=X,
        cv=cv,
        strategy="refit",
        scoring=rmse_metric,
        return_data=True,
        backend="loky",
        error_score='raise',
    )

    fold_scores = results["test_MeanSquaredError"]
    rmse = fold_scores.mean()
    rmse_std = fold_scores.std()
    rmse_cv_results.append(rmse)
    rmse_cv_std.append(rmse_std)

In [None]:
# One row per evaluated model: its label, mean CV RMSE and the standard
# deviation across folds, ordered from lowest to highest mean RMSE.
rmse_summary = {
    "Model": list_models,
    "RMSE_CV": rmse_cv_results,
    "RMSE_CV_STD": rmse_cv_std,
}
rmse_cv_results_df = pd.DataFrame(rmse_summary).sort_values("RMSE_CV")
rmse_cv_results_df

In [None]:
# Random-forest forecasting pipeline.
#
# Steps:
#   1. "standardize": a StandardScaler wrapped for series input.
#   2. "forecaster": a target-side chain that log-transforms the series,
#      removes an additive seasonal component with period 24 * 7 observations
#      (one week if the data is hourly), and forecasts the remainder with a
#      RandomForestRegressor reduced to tabular regression ("direct" strategy,
#      window of 24 observations).
#
# `random_state` is fixed so that repeated runs of the notebook (Restart &
# Run All) produce the same forests and therefore the same CV scores; without
# it every run reports different RMSE values.
re_pipe = ForecastingPipeline(
    steps=[
        ("standardize", TabularToSeriesAdaptor(StandardScaler())),
        (
            "forecaster",
            TransformedTargetForecaster(
                [
                    ("log_transformer", LogTransformer()),
                    (
                        "deseasonalizer_weekly",
                        Deseasonalizer(sp=24 * 7, model="additive"),
                    ),
                    (
                        "forecast",
                        make_reduction(
                            RandomForestRegressor(
                                n_estimators=100,
                                n_jobs=-1,
                                random_state=42,
                            ),
                            window_length=24,
                            strategy="direct",
                        ),
                    ),
                ]
            ),
        ),
    ]
)

In [None]:
# Fit the random-forest pipeline on the whole y / X window for forecasting
# horizons 1 through 12 steps ahead.
re_pipe.fit(y=y, X=X, fh=np.arange(1, 13))

In [None]:
from sktime.forecasting.model_evaluation import evaluate
from sktime.performance_metrics.forecasting import MeanSquaredError

# Bind every display label to the forecaster it describes. The label used here
# was previously "elasticnet_pipeline" even though the forecaster evaluated is
# `re_pipe` (the RandomForestRegressor pipeline), so the results table
# attributed the random-forest scores to the wrong model.
models = {"random_forest_pipeline": re_pipe}

# Labels for the rows of the results table built in the next cell.
list_models = list(models)

# Per-model mean and standard deviation of the fold-level RMSE.
rmse_cv_results = []
rmse_cv_std = []
for model_name, forecaster in models.items():
    print(model_name)
    # Refit the forecaster on every expanding-window fold; any failure inside a
    # fold is re-raised instead of being recorded as a score.
    results = evaluate(
        forecaster=forecaster,
        y=y,
        X=X,
        cv=cv,
        strategy="refit",
        return_data=True,
        scoring=MeanSquaredError(square_root=True),
        backend="loky",
        error_score='raise'
    )

    rmse = results["test_MeanSquaredError"].mean()
    rmse_std = results["test_MeanSquaredError"].std()
    rmse_cv_results.append(rmse)
    rmse_cv_std.append(rmse_std)

In [None]:
# Collect the label, mean CV RMSE and fold-to-fold standard deviation of each
# evaluated model into one table, best (lowest mean RMSE) first.
rmse_cv_results_df = pd.DataFrame(
    {
        "Model": list_models,
        "RMSE_CV": rmse_cv_results,
        "RMSE_CV_STD": rmse_cv_std,
    }
)
rmse_cv_results_df = rmse_cv_results_df.sort_values(by="RMSE_CV")
rmse_cv_results_df