In [8]:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import TransformedTargetForecaster, DirectTimeSeriesRegressionForecaster, DirectTabularRegressionForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import YfromX
from sktime.transformations.series.boxcox import LogTransformer
from sktime.transformations.compose import ColumnwiseTransformer
from sktime.forecasting.compose import ForecastingPipeline

import warnings
warnings.filterwarnings('ignore')

## Direct Strategy V3
This version adds more lags of the target series to the model.


In [9]:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import (
    TransformedTargetForecaster,
    DirectTimeSeriesRegressionForecaster,
    DirectTabularRegressionForecaster,
)
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNetCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import YfromX
from sktime.transformations.series.boxcox import LogTransformer
from sktime.transformations.compose import ColumnwiseTransformer
from sktime.forecasting.compose import ForecastingPipeline

import warnings

warnings.filterwarnings("ignore")
import my_reduce
import my_pipelinee

class TimeSeriesForecasting:
    """Direct-strategy forecaster that keeps one regressor per forecast step.

    ``fit`` trains ``len_models`` independent ``YfromX(RandomForestRegressor)``
    forecasters.  Model ``j`` (1-based) is trained on the lag columns
    ``lag_j`` .. ``lag_{j + num_lags - 1}`` plus every exogenous column of
    ``X``; ``predict`` labels its output ``j`` hours after the model cutoff.

    The class assumes hourly data: frames are re-indexed with ``asfreq`` at
    an hourly frequency and the lag window is advanced by one hour at a time.

    Parameters
    ----------
    len_models : int, default=12
        Number of per-step models to train.
    num_lags : int, default=24
        Number of lagged target values given to each model.
    random_state : int or None, default=None
        Seed forwarded to every ``RandomForestRegressor``.  ``None`` keeps
        the unseeded behaviour.

    Attributes
    ----------
    models : dict
        Fitted forecasters keyed ``"model_1"`` .. ``"model_{len_models}"``.
    X_sub_train : dict
        Training feature frames keyed ``"X_train_1"`` ..
    selected_cols : list
        Exogenous column names, captured from ``X`` in ``transform``.
    next_y_window : pandas.DataFrame or None
        One-row frame of lag values indexed one hour after the last target.
    cutoff : object
        Cutoff reported by ``models["model_1"]`` after ``fit`` / ``update``.
    """

    # Lower-case hourly alias: the upper-case "H" spelling is deprecated in
    # recent pandas releases, while "h" is accepted by older ones as well.
    FREQ = "h"

    def __init__(
        self,
        len_models=12,
        num_lags=24,
        random_state=None,
    ):
        self.len_models = len_models
        self.num_lags = num_lags
        self.random_state = random_state
        self.models = {}
        self.X_sub_train = {}
        # Filled in by transform(); defined here so the attribute always exists.
        self.selected_cols = []
        self.forecast_len = 1
        # Every per-step model is asked for a single step ahead of its cutoff.
        self.fh = ForecastingHorizon(np.arange(1, 2))
        self.next_y_window = None
        self.cutoff = None

    def _lag_columns(self, j):
        """Return the ``num_lags`` lag column names used by model ``j``."""
        return [f"lag_{i}" for i in range(j, j + self.num_lags)]

    def _model_inputs(self, X, j):
        """Select model ``j``'s lag + exogenous columns and set hourly freq."""
        columns = self._lag_columns(j) + self.selected_cols
        return X[columns].copy().asfreq(self.FREQ)

    def create_sliding_window_data_refined(self, ts, window_length):
        """Turn a series into a supervised table of lags and a target.

        Parameters
        ----------
        ts : pandas.Series
            Target series; its index entries must support ``to_pydatetime``.
        window_length : int
            Number of lagged values placed in each row.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``"date"`` (the timestamp of the target) with columns
            ``lag_{window_length}`` .. ``lag_1`` followed by ``target``.
            ``lag_1`` is the most recent value of the window.

        Raises
        ------
        ValueError
            If the splitter yields no window (series too short).
        """
        splitter = SlidingWindowSplitter(
            fh=[1], window_length=window_length, step_length=1
        )
        # Oldest value first, so the first window element is lag_{window_length}.
        lag_names = [f"lag_{i}" for i in range(window_length, 0, -1)]

        # Collect plain records and build the frame once; concatenating a
        # one-row frame per window is quadratic in the number of windows.
        records = []
        for train, test in splitter.split_series(ts):
            window = ts.loc[train.index]
            target = ts.loc[test.index]
            record = {"date": pd.to_datetime(target.index[0].to_pydatetime())}
            record.update(zip(lag_names, window.to_numpy()))
            # .iloc: plain [0] on a datetime-indexed Series is a deprecated
            # positional fallback.
            record["target"] = target.iloc[0]
            records.append(record)

        if not records:
            raise ValueError(
                "Series is too short to build a sliding window of length "
                f"{window_length}."
            )

        return pd.DataFrame(records).set_index("date")

    def fit(self, y, X):
        """Train one forecaster per step on lagged targets and ``X``.

        Parameters
        ----------
        y : pandas.Series
            Target series.
        X : pandas.DataFrame
            Exogenous features, joined to the lag table on the index.
        """
        X_transformed, y_transformed = self.transform(y, X)

        for j in range(1, self.len_models + 1):
            model_name = f"model_{j}"
            X_train_name = f"X_train_{j}"

            self.X_sub_train[X_train_name] = self._model_inputs(X_transformed, j)

            self.models[model_name] = YfromX(
                RandomForestRegressor(
                    n_estimators=200,
                    n_jobs=-1,
                    random_state=self.random_state,
                )
            )
            self.models[model_name].fit(
                y=y_transformed, X=self.X_sub_train[X_train_name], fh=self.fh
            )

        self.cutoff = self.models["model_1"].cutoff

    def transform(self, y, X):
        """Build the training table and remember the next lag window.

        Side effects: sets ``self.next_y_window`` (the lag row for the hour
        after the last target) and ``self.selected_cols`` (columns of ``X``).

        Returns
        -------
        tuple
            ``(X_transformed, y_transformed)``: lag columns ``lag_1`` ..
            ``lag_{num_lags + len_models - 1}`` plus the columns of ``X``,
            and the ``target`` column.
        """
        # Model j needs lags j .. j + num_lags - 1, so the widest model
        # reaches back num_lags + len_models - 1 steps.
        window_length = self.num_lags + self.len_models - 1

        # Use this instance rather than constructing a throwaway forecaster.
        data_df = self.create_sliding_window_data_refined(y, window_length)
        data_df = pd.merge(data_df, X, left_index=True, right_index=True)

        lag_names = [f"lag_{i}" for i in range(1, window_length + 1)]

        # Next window: every lag moves one slot older and the last target
        # becomes lag_1; the row is stamped one hour later.
        last_row = data_df[lag_names].iloc[[-1]].shift(1, axis=1)
        last_row.iloc[:, 0] = data_df["target"].iloc[-1]
        last_row.index = last_row.index + pd.Timedelta(hours=1)
        self.next_y_window = last_row

        self.selected_cols = X.columns.tolist()

        X_transformed = data_df[lag_names + self.selected_cols]
        y_transformed = data_df["target"]
        return X_transformed, y_transformed

    def predict(self, X):
        """Produce one prediction per model for the hours after the cutoff.

        Parameters
        ----------
        X : pandas.DataFrame
            Exogenous features only.  It is concatenated column-wise with
            ``self.next_y_window``, so its index is expected to match that
            window's index.

        Returns
        -------
        pandas.DataFrame
            One column named ``cutoff_hour_<hour of model_1's cutoff>`` and
            one row per model, indexed by ``"date"`` (model cutoff plus ``j``
            hours).

        Notes
        -----
        NOTE(review): model ``j`` is trained on ``lag_j`` .. relative to the
        target timestamp, and here it receives the same-named columns from
        the window stamped one hour after the cutoff while its output is
        labelled ``cutoff + j`` hours.  Confirm that this alignment is the
        intended one.
        """
        X = pd.concat([self.next_y_window, X], axis=1)

        rows = []
        for j in range(1, self.len_models + 1):
            model = self.models[f"model_{j}"]
            prediction_for = model.cutoff + pd.DateOffset(hours=j)

            y_pred = model.predict(fh=self.fh, X=self._model_inputs(X, j))

            column = f"cutoff_hour_{self.models['model_1'].cutoff.hour[0]}"
            rows.append(
                pd.DataFrame(
                    {column: y_pred.iloc[0]},
                    index=pd.Index(prediction_for),
                )
            )

        # DataFrame.append was removed in pandas 2.0; concatenate once.
        predictions_df = pd.concat(rows) if rows else pd.DataFrame()
        predictions_df.index.name = "date"
        return predictions_df

    def update(self, new_observation_y, new_observation_X):
        """Feed one new observation to every model and roll the lag window.

        Parameters
        ----------
        new_observation_y : pandas.Series
            Newly observed target value(s); the first value becomes ``lag_1``.
        new_observation_X : pandas.DataFrame
            Exogenous features for the same timestamp.

        Models are updated with ``update_params=False``.
        """
        X = pd.concat([self.next_y_window, new_observation_X], axis=1)

        # Loop-invariant, so done once instead of once per model.
        new_observation_y = new_observation_y.asfreq(self.FREQ)

        for j in range(1, self.len_models + 1):
            self.models[f"model_{j}"].update(
                y=new_observation_y,
                X=self._model_inputs(X, j),
                update_params=False,
            )

        # update the cutoff
        self.cutoff = self.models["model_1"].cutoff

        # Roll the window: shift lags one slot older, insert the new value
        # as lag_1 and advance the timestamp by one hour.
        self.next_y_window = self.next_y_window.shift(1, axis=1)
        self.next_y_window.iloc[:, 0] = new_observation_y.iloc[0]
        self.next_y_window.index = self.next_y_window.index + pd.Timedelta(hours=1)
        

In [19]:
# Load the processed supply / load / price table, indexed by the "Date (MST)"
# timestamp column, put it on an hourly grid and order it by timestamp.
price_old_df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/supply_load_price.csv",
        parse_dates=["Date (MST)"],
        index_col="Date (MST)",
    )
    .asfreq("H")
    .sort_values(by="Date (MST)")
)

# Exogenous features handed to the forecaster.
selected_cols = [
    "ail", "gas_price",
    "gas_reserve_margin", "coal_reserve_margin",
    "wind_reserve_margin", "other_reserve_margin",
    "gas_supply_mix", "coal_supply_mix",
    "wind_supply_mix", "other_supply_mix",
    "total_reserve_margin", "demand_supply_ratio",
]

# Restrict to the study period and to the feature + target columns.
price_old_df_filtered = price_old_df.loc[
    "2022-12-01":"2023-03-28", selected_cols + ["price"]
]

# Target series (renamed to "target") and feature frame, both on an hourly grid.
y = price_old_df_filtered["price"].rename("target").asfreq("H")
X = price_old_df_filtered[selected_cols].asfreq("H")

# Train / test split used by the manual rolling-forecast cells further down.
# It is currently disabled, so those cells cannot run on a fresh kernel.
#
# test_size = 48
# forcast_len = 1
#
# total_forecast_len = 12
#
# y_train, y_test_full, X_train, X_test = temporal_train_test_split(y, X, test_size=test_size+total_forecast_len)
#
# y_test = y_test_full[:-total_forecast_len] # We need X for the last forcast_len hours to make predictions.
#
# y_train = y_train.asfreq('H')
# X_train = X_train.asfreq('H')
# X_test = X_test.asfreq('H')
# y_test = y_test.asfreq('H')
#
# forecasting = TimeSeriesForecasting(
#     len_models=12,
#     num_lags=24,
# )

In [20]:
import my_functions
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from sktime.performance_metrics.forecasting import (
    MeanSquaredError,
    MeanSquaredScaledError,
)

# Expanding-window cross-validation: the first fold trains on the first 90 %
# of the rows, every later fold adds 12 more, and each fold is scored on the
# following 12 steps.
cv = ExpandingWindowSplitter(
    initial_window=int(len(X) * 0.9),
    step_length=12,
    fh=np.arange(1, 13),
)

n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")

Number of Folds = 23


In [21]:
# Accumulators for the summary table (one entry per evaluated model).
rmse_cv_results = []
rmse_cv_std = []

# Score the forecaster on every CV fold with RMSE; the forecaster is passed
# as a class and `error_score="raise"` surfaces any fold failure.
results = my_functions.evaluate(
    forecaster=TimeSeriesForecasting,
    y=y,
    X=X,
    cv=cv,
    strategy="refit",
    return_data=True,
    scoring=MeanSquaredError(square_root=True),
    backend="loky",
    error_score="raise",
)

# Mean and spread of the per-fold RMSE.
fold_rmse = results["test_MeanSquaredError"]
rmse = fold_rmse.mean()
rmse_std = fold_rmse.std()

rmse_cv_results.append(rmse)
rmse_cv_std.append(rmse_std)

In [22]:
# Summary table of the cross-validated RMSE, best (lowest) model first.
summary_columns = {
    "Model": ["Direct"],
    "RMSE_CV": rmse_cv_results,
    "RMSE_CV_STD": rmse_cv_std,
}
rmse_cv_results_df = pd.DataFrame(summary_columns).sort_values(by=["RMSE_CV"])
rmse_cv_results_df

Unnamed: 0,Model,RMSE_CV,RMSE_CV_STD
0,Direct,174.913185,122.402127


In [7]:
# NOTE(review): this raises ZeroDivisionError on purpose or by accident — it
# looks like a manual "stop here" guard so that Run All does not reach the
# cells below, which use names (`forecasting`, `y_train`, `X_test`, ...) that
# are only defined in commented-out code. Confirm intent before removing.
1/0

ZeroDivisionError: division by zero

In [None]:
# Train the per-step models on the training split.
# NOTE(review): `forecasting`, `y_train` and `X_train` are only assigned in the
# commented-out part of the data-loading cell, so this cell needs that code
# re-enabled (or equivalent definitions) to run on a fresh kernel.
forecasting.fit(y=y_train, X=X_train)

In [None]:
# Initial forecast right after fitting, using the first row of the test
# features as the exogenous input.
# NOTE(review): `X_test` and `forecasting` are only assigned in commented-out
# code earlier in the notebook.
predictions_df = forecasting.predict(X_test.iloc[[0]])
predictions_df

In [None]:
# Start the rolling table: an empty frame spanning the whole test index, with
# the initial forecast attached as its first column.
rolling_prediction_df = pd.concat(
    [pd.DataFrame(index=y_test_full.index), predictions_df],
    axis=1,
)
rolling_prediction_df

In [None]:
# Emulate the rolling prediction for the next hours: for every observed test
# hour, feed the observation to the forecaster, then forecast again from the
# new cutoff. Each forecast becomes one more column of rolling_prediction_df.
#
# Forecast frames are collected in a list and concatenated once after the
# loop (concatenating inside the loop copies the growing table every hour).
# The loop no longer overwrites `predictions_df`, so the initial forecast
# computed earlier stays available.
# X_test must hold at least one more row than y_test, because the forecast
# made after observing row i uses the features of row i + 1.
rolling_frames = [rolling_prediction_df]

for i in range(len(y_test)):  # Each iteration emulates one hour passing by.
    new_observation_y = y_test.iloc[[i]]
    new_observation_X = X_test.iloc[[i]]

    print(f'Cut off before update: {forecasting.models["model_1"].cutoff}')

    forecasting.update(new_observation_y, new_observation_X)

    print(f'Cut off after update: {forecasting.models["model_1"].cutoff}')

    cutoff_time = forecasting.models["model_1"].cutoff
    prediction_for = cutoff_time + pd.DateOffset(hours=1)

    print(f'Predicting for {prediction_for}')

    y_pred = forecasting.predict(X_test.iloc[[i + 1]])
    rolling_frames.append(y_pred)

    print(f'Update and prediction done for {new_observation_y.index[0]}')
    print('----------------------------------------------------------------------------------')

rolling_prediction_df = pd.concat(rolling_frames, axis=1)

In [None]:
# Display the accumulated rolling forecasts: one column per predict() call,
# rows indexed by the timestamps of the full test window.
rolling_prediction_df

In [None]:
import pandas as pd
import numpy as np

def generate_step_predictions(rolling_prediction_df, y_test_full, num_steps):
    """Split a rolling-forecast table into one frame per forecast step.

    The ``k``-th sub-diagonal of ``rolling_prediction_df`` (``k = 0`` is the
    main diagonal) is taken as the ``k + 1``-step-ahead predictions and is
    labelled with ``y_test_full.index[k : k + len(diagonal)]``.

    Parameters
    ----------
    rolling_prediction_df : pandas.DataFrame
        Table whose columns are successive forecasts.
    y_test_full : pandas.Series or pandas.DataFrame
        Object whose index supplies the timestamps of the predictions.
    num_steps : int
        Number of forecast steps (sub-diagonals) to extract.

    Returns
    -------
    list of pandas.DataFrame
        One single-column frame per step, named ``"<k+1>_step_prediction"``.
        A step is skipped, with a printed message, when ``y_test_full`` does
        not provide as many timestamps as the diagonal has values.
    """
    values = rolling_prediction_df.to_numpy()
    step_predictions = []

    for step in range(num_steps):
        diag_values = np.diag(values, -step)
        index_range = y_test_full.index[step:step + len(diag_values)]

        # Compare lengths before building the frame: comparing the index with
        # a slice of itself can never fail, and a real mismatch would raise
        # inside the DataFrame constructor instead of being reported.
        if len(index_range) != len(diag_values):
            print(f"Error: Index mismatch for {step + 1}-step prediction.")
            continue

        step_predictions.append(
            pd.DataFrame(
                diag_values,
                index=index_range,
                columns=[f"{step + 1}_step_prediction"],
            )
        )

    return step_predictions

In [None]:
# One frame per forecast step (1-step, 2-step, ...) extracted from the rolling
# forecast table.
# NOTE(review): `total_forecast_len` and `y_test_full` are only assigned in
# commented-out code earlier in the notebook.
predictions = generate_step_predictions(rolling_prediction_df, y_test_full, total_forecast_len)

In [None]:
# predictions[0] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [None]:
# predictions[1] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [None]:
# predictions[2] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [None]:
# RMSE of the k-step-ahead predictions for every forecast step k.
# The step count follows the number of prediction frames instead of a
# hard-coded 12.
step_sizes = np.arange(1, len(predictions) + 1)
for step, prediction_series in zip(step_sizes, predictions):
    # Actual values for the window the k-step predictions are expected to cover.
    y_true = y_test_full.iloc[step - 1:step + test_size]

    if not y_true.index.equals(prediction_series.index):
        # Report the mismatch instead of silently dropping the step.
        print(f"{step} Step RMSE skipped: prediction index does not match the test window.")
        continue

    # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_true, prediction_series))
    print(f"{step} Step RMSE for model: {rmse}")

In [None]:
# Rebuild the cross-validation summary table (sorted by RMSE) for display at
# the end of the notebook.
rmse_cv_results_df = (
    pd.DataFrame(
        {
            "Model": ['direct'],
            "RMSE_CV": rmse_cv_results,
            "RMSE_CV_STD": rmse_cv_std,
        }
    )
    .sort_values(by=["RMSE_CV"])
)
rmse_cv_results_df