In [1]:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import TransformedTargetForecaster, DirectTimeSeriesRegressionForecaster, DirectTabularRegressionForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import YfromX
from sktime.transformations.series.boxcox import LogTransformer
from sktime.transformations.compose import ColumnwiseTransformer
from sktime.forecasting.compose import ForecastingPipeline

import warnings
warnings.filterwarnings('ignore')

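## Direct Strategy V3
This version adds more lagged target values as model inputs: each of the `len_models` horizon-specific models is trained on `num_lags` consecutive lags of the price, together with the exogenous feature columns.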


In [2]:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import (
    TransformedTargetForecaster,
    DirectTimeSeriesRegressionForecaster,
    DirectTabularRegressionForecaster,
)
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.forecasting.compose import make_reduction
from sklearn.exceptions import ConvergenceWarning
import requests
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNetCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.compose import YfromX
from sktime.transformations.series.boxcox import LogTransformer
from sktime.transformations.compose import ColumnwiseTransformer
from sktime.forecasting.compose import ForecastingPipeline

import warnings

warnings.filterwarnings("ignore")
import my_reduce
import my_pipelinee

class TimeSeriesForecasting:
    """Direct multi-step forecaster built from one regressor per horizon step.

    ``fit`` trains ``len_models`` forecasters named ``model_1`` ... ``model_N``.
    Model ``j`` is given the lag columns ``lag_j`` ... ``lag_{j + num_lags - 1}``
    plus every column of the exogenous frame. ``predict`` returns one value per
    model, labelled ``cutoff + j`` hours, and ``update`` advances the stored lag
    window by one observed hour.

    Parameters
    ----------
    len_models : int, default 12
        Number of horizon-specific models.
    num_lags : int, default 24
        Number of consecutive lag columns handed to each model.
    random_state : int or None, default None
        Forwarded to every ``RandomForestRegressor``. ``None`` keeps the
        previous unseeded behaviour.

    Attributes
    ----------
    models : dict
        Fitted forecasters keyed ``"model_<j>"``.
    X_sub_train : dict
        Training design matrices keyed ``"X_train_<j>"``.
    selected_cols : list of str
        Exogenous column names, recorded by ``transform``.
    next_y_window : pandas.DataFrame or None
        Single-row frame of lag values for the hour after the last observation.
    """

    # Hourly sampling step. An offset object is used instead of the "H" string
    # alias, which recent pandas releases deprecate.
    _HOURLY = pd.offsets.Hour()

    def __init__(
        self,
        len_models=12,
        num_lags=24,
        random_state=None,
    ):
        self.len_models = len_models
        self.num_lags = num_lags
        self.random_state = random_state
        self.models = {}
        self.X_sub_train = {}
        self.selected_cols = []
        self.forecast_len = 1
        # Every underlying forecaster is asked for a single step ahead.
        self.fh = ForecastingHorizon(np.arange(1, 2))
        self.next_y_window = None

    def _step_features(self, X, step):
        """Return the hourly-indexed columns of ``X`` used by model ``step``."""
        lag_cols = [f"lag_{i}" for i in range(step, step + self.num_lags)]
        return X[lag_cols + self.selected_cols].asfreq(self._HOURLY)

    def _require_fitted(self):
        """Raise a clear error when ``fit`` has not populated the models yet."""
        if self.next_y_window is None or not self.models:
            raise RuntimeError(
                "TimeSeriesForecasting must be fitted before calling predict or update."
            )

    def create_sliding_window_data_refined(self, ts, window_length):
        """Turn a series into a table of lagged windows and their next value.

        Parameters
        ----------
        ts : pandas.Series
            Series with a datetime index.
        window_length : int
            Number of past observations per row.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``date`` (timestamp of the target), with columns
            ``lag_<window_length>`` ... ``lag_1`` (oldest to newest) and
            ``target``.

        Raises
        ------
        ValueError
            If the splitter yields no window for ``ts``.
        """
        splitter = SlidingWindowSplitter(
            fh=[1], window_length=window_length, step_length=1
        )
        # The first value of a window is the oldest, hence the largest lag.
        lag_names = [f"lag_{k}" for k in range(window_length, 0, -1)]

        # Collect plain records and build the frame once; concatenating inside
        # the loop is quadratic in the number of windows.
        records = []
        for window, target in splitter.split_series(ts):
            record = {"date": pd.Timestamp(target.index[0])}
            record.update(zip(lag_names, window.to_numpy()))
            record["target"] = target.iloc[0]
            records.append(record)

        if not records:
            raise ValueError(
                f"Series of length {len(ts)} yields no window of length {window_length}."
            )

        return pd.DataFrame(records).set_index("date")

    def fit(self, y, X):
        """Fit one forecaster per horizon step.

        Parameters
        ----------
        y : pandas.Series
            Target series.
        X : pandas.DataFrame
            Exogenous features, joined to the lagged windows on the index.
        """
        X_transformed, y_transformed = self.transform(y, X)

        for j in range(1, self.len_models + 1):
            model_name = f"model_{j}"
            X_train_name = f"X_train_{j}"

            self.X_sub_train[X_train_name] = self._step_features(X_transformed, j)

            self.models[model_name] = my_reduce.YfromX(
                RandomForestRegressor(
                    n_estimators=200, n_jobs=-1, random_state=self.random_state
                )
            )

            self.models[model_name].fit(
                y=y_transformed, X=self.X_sub_train[X_train_name], fh=self.fh
            )

    def transform(self, y, X):
        """Build the lagged design matrix and remember the next lag window.

        Side effects: sets ``self.selected_cols`` to the columns of ``X`` and
        ``self.next_y_window`` to the lag row for the hour after the last
        target.

        Returns
        -------
        tuple
            ``(X_transformed, y_transformed)``: lag and exogenous columns, and
            the ``target`` column.
        """
        # Model j reads lags j ... j + num_lags - 1, so the widest window must
        # reach back num_lags + len_models - 1 observations.
        window_length = self.num_lags + self.len_models - 1

        data_df = self.create_sliding_window_data_refined(y, window_length)
        data_df = pd.merge(data_df, X, left_index=True, right_index=True)

        x_feature_names = [f"lag_{i}" for i in range(1, window_length + 1)]

        # Roll the newest row forward by one hour: every lag moves one slot
        # older and the last known target becomes lag_1.
        last_row = data_df[x_feature_names].iloc[[-1]]
        last_row = last_row.shift(1, axis=1)
        last_row.iloc[:, 0] = data_df["target"].iloc[-1]
        last_row.index = last_row.index + pd.Timedelta(hours=1)
        self.next_y_window = last_row

        self.selected_cols = X.columns.tolist()
        x_feature_names += self.selected_cols

        X_transformed = data_df[x_feature_names]
        y_transformed = data_df["target"]
        return X_transformed, y_transformed

    def predict(self, X):
        """Predict one value per horizon model.

        Parameters
        ----------
        X : pandas.DataFrame
            Exogenous features only; they are joined column-wise with the
            stored lag window.

        Returns
        -------
        pandas.DataFrame
            One row per model, indexed by ``date`` (model cutoff plus ``j``
            hours), with a single column ``cutoff_hour_<hour of model_1 cutoff>``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.

        Notes
        -----
        NOTE(review): model ``j`` is trained on ``lag_j`` ... of rows indexed by
        the target timestamp, while here it receives ``lag_j`` ... of the row
        for ``cutoff + 1`` and its output is labelled ``cutoff + j``. Confirm
        that this horizon alignment is intended.
        """
        self._require_fitted()

        X = pd.concat([self.next_y_window, X], axis=1)

        # model.cutoff is indexed with [0], so it is expected to be index-like.
        column_name = f"cutoff_hour_{self.models['model_1'].cutoff.hour[0]}"

        rows = []
        for j in range(1, self.len_models + 1):
            model = self.models[f"model_{j}"]
            prediction_for = model.cutoff + pd.DateOffset(hours=j)

            y_pred = model.predict(fh=self.fh, X=self._step_features(X, j))

            # Read the single forecast positionally; y_pred[0] on a
            # datetime-indexed Series relies on a deprecated fallback.
            value = np.asarray(y_pred).ravel()[0]
            rows.append(
                pd.DataFrame({column_name: value}, index=pd.Index(prediction_for))
            )

        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        predictions_df = pd.concat(rows) if rows else pd.DataFrame()
        predictions_df.index.name = "date"
        return predictions_df

    def update(self, new_observation_y, new_observation_X):
        """Feed one newly observed hour to every model and roll the lag window.

        Parameters
        ----------
        new_observation_y : pandas.Series
            The observed target; its first value becomes the new ``lag_1``.
        new_observation_X : pandas.DataFrame
            Exogenous features for the same timestamp.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._require_fitted()

        X = pd.concat([self.next_y_window, new_observation_X], axis=1)
        # Loop-invariant, so the frequency is set once.
        new_observation_y = new_observation_y.asfreq(self._HOURLY)

        for j in range(1, self.len_models + 1):
            self.models[f"model_{j}"].update(
                y=new_observation_y, X=self._step_features(X, j), update_params=False
            )

        # Shift every lag one slot older, insert the observation as lag_1 and
        # move the window's timestamp forward by one hour.
        window = self.next_y_window.shift(1, axis=1)
        window.iloc[:, 0] = new_observation_y.iloc[0]
        window.index = window.index + pd.Timedelta(hours=1)
        self.next_y_window = window
        

In [3]:
# Hourly sampling step, given as an offset object rather than the "H" string
# alias, which recent pandas releases deprecate.
HOURLY = pd.offsets.Hour()

# Processed hourly supply / load / price table, indexed by its timestamp column.
price_old_df = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/supply_load_price.csv",
    parse_dates=["Date (MST)"],
    index_col="Date (MST)",
)

price_old_df = price_old_df.asfreq(HOURLY)
price_old_df = price_old_df.sort_values(by="Date (MST)")
# Keep only the rows from December 2022 onward.
price_old_df = price_old_df["2022-12":]

# rename() returns a new Series, so the frame's own "price" column keeps its name.
y = price_old_df["price"].rename("target")

selected_cols = ['ail', 'gas_reserve_margin', 'wind_reserve_margin', 'other_reserve_margin', 'load_on_gas_reserve', 'gas_price', 'gas_supply_mix', 'demand_supply_ratio', 'avail_gen_ratio', 'fossil_fuel_ratio', 'gas_tng_ratio']
X = price_old_df[selected_cols]

X = X.asfreq(HOURLY)
y = y.asfreq(HOURLY)

test_size = 48
forcast_len = 1  # not referenced by the cells shown here; kept in case later cells read it

# Number of hours forecast at every cutoff; also the number of per-step models.
total_forecast_len = 12

# Hold out the rolling test window plus the final forecast horizon.
y_train, y_test_full, X_train, X_test = temporal_train_test_split(
    y, X, test_size=test_size + total_forecast_len
)

# The last total_forecast_len hours are only forecast, never fed back as
# observations, so they are excluded from the update loop's targets.
y_test = y_test_full[:-total_forecast_len]

y_train = y_train.asfreq(HOURLY)
X_train = X_train.asfreq(HOURLY)
X_test = X_test.asfreq(HOURLY)
y_test = y_test.asfreq(HOURLY)

# len_models is tied to total_forecast_len so the two cannot drift apart.
forecasting = TimeSeriesForecasting(
    len_models=total_forecast_len,
    num_lags=24,
)

In [4]:
# Train the per-step forecasters on the training window.
forecasting.fit(y=y_train, X=X_train)

In [5]:
# Initial forecast made right after fitting, using the exogenous features of
# the first held-out row.
predictions_df = forecasting.predict(X_test.iloc[[0]])
predictions_df

Unnamed: 0_level_0,cutoff_hour_10
date,Unnamed: 1_level_1
2023-03-29 11:00:00,64.7814
2023-03-29 12:00:00,66.52925
2023-03-29 13:00:00,64.08235
2023-03-29 14:00:00,66.1434
2023-03-29 15:00:00,66.4867
2023-03-29 16:00:00,65.34695
2023-03-29 17:00:00,70.0118
2023-03-29 18:00:00,63.9582
2023-03-29 19:00:00,64.75385
2023-03-29 20:00:00,68.3769


In [6]:
# Start the rolling table on the full held-out index and add the initial
# forecast as its first column.
rolling_prediction_df = pd.DataFrame(index=y_test_full.index)
rolling_prediction_df = pd.concat([rolling_prediction_df, predictions_df], axis=1)
rolling_prediction_df

Unnamed: 0,cutoff_hour_10
2023-03-29 11:00:00,64.7814
2023-03-29 12:00:00,66.52925
2023-03-29 13:00:00,64.08235
2023-03-29 14:00:00,66.1434
2023-03-29 15:00:00,66.4867
2023-03-29 16:00:00,65.34695
2023-03-29 17:00:00,70.0118
2023-03-29 18:00:00,63.9582
2023-03-29 19:00:00,64.75385
2023-03-29 20:00:00,68.3769


In [7]:
# Emulate the rolling forecast: each iteration stands for one more hour passing.
# The newly observed hour is fed to the forecaster, which then issues a fresh
# multi-step forecast from the new cutoff.

# Forecast frames are collected here and concatenated once after the loop;
# concatenating inside the loop copies the growing table on every iteration.
rolling_frames = [rolling_prediction_df]

for i in range(len(y_test)):
    new_observation_y = y_test.iloc[[i]]
    new_observation_X = X_test.iloc[[i]]

    print(f'Cut off before update: {forecasting.models["model_1"].cutoff}')

    forecasting.update(new_observation_y, new_observation_X)

    cutoff_time = forecasting.models["model_1"].cutoff
    print(f'Cut off after update: {cutoff_time}')

    prediction_for = cutoff_time + pd.DateOffset(hours=1)
    print(f'Predicting for {prediction_for}')

    # Exogenous features of the next hour drive the new forecast.
    y_pred = forecasting.predict(X_test.iloc[[i + 1]])
    rolling_frames.append(y_pred)

    print(f'Update and prediction done for {new_observation_y.index[0]}')
    print('----------------------------------------------------------------------------------')

rolling_prediction_df = pd.concat(rolling_frames, axis=1)

Cut off before update: DatetimeIndex(['2023-03-29 10:00:00'], dtype='datetime64[ns]', freq='H')
Cut off after update: DatetimeIndex(['2023-03-29 11:00:00'], dtype='datetime64[ns]', name='Date (MST)', freq='H')
Predicting for DatetimeIndex(['2023-03-29 11:00:00'], dtype='datetime64[ns]', name='Date (MST)', freq=None)
Update and prediction done for 2023-03-29 11:00:00
----------------------------------------------------------------------------------
Cut off before update: DatetimeIndex(['2023-03-29 11:00:00'], dtype='datetime64[ns]', name='Date (MST)', freq='H')
Cut off after update: DatetimeIndex(['2023-03-29 12:00:00'], dtype='datetime64[ns]', name='Date (MST)', freq='H')
Predicting for DatetimeIndex(['2023-03-29 13:00:00'], dtype='datetime64[ns]', name='Date (MST)', freq=None)
Update and prediction done for 2023-03-29 12:00:00
----------------------------------------------------------------------------------
Cut off before update: DatetimeIndex(['2023-03-29 12:00:00'], dtype='datetime

In [8]:
# One column per forecast cutoff, one row per held-out hour.
rolling_prediction_df

Unnamed: 0,cutoff_hour_10,cutoff_hour_11,cutoff_hour_12,cutoff_hour_13,cutoff_hour_14,cutoff_hour_15,cutoff_hour_16,cutoff_hour_17,cutoff_hour_18,cutoff_hour_19,...,cutoff_hour_1,cutoff_hour_2,cutoff_hour_3,cutoff_hour_4,cutoff_hour_5,cutoff_hour_6,cutoff_hour_7,cutoff_hour_8,cutoff_hour_9,cutoff_hour_10.1
2023-03-29 11:00:00,64.7814,,,,,,,,,,...,,,,,,,,,,
2023-03-29 12:00:00,66.52925,108.9586,,,,,,,,,...,,,,,,,,,,
2023-03-29 13:00:00,64.08235,166.1248,133.94615,,,,,,,,...,,,,,,,,,,
2023-03-29 14:00:00,66.1434,167.5386,127.91195,79.85305,,,,,,,...,,,,,,,,,,
2023-03-29 15:00:00,66.4867,162.64645,112.73835,76.0995,72.0922,,,,,,...,,,,,,,,,,
2023-03-29 16:00:00,65.34695,164.22035,120.89315,73.35135,69.4526,51.5917,,,,,...,,,,,,,,,,
2023-03-29 17:00:00,70.0118,163.3719,129.24885,77.49365,72.33935,54.2081,86.45115,,,,...,,,,,,,,,,
2023-03-29 18:00:00,63.9582,163.00575,118.06905,74.9809,70.52345,54.5837,80.17,165.9093,,,...,,,,,,,,,,
2023-03-29 19:00:00,64.75385,143.58705,134.54835,76.5769,67.33115,53.4105,86.77095,155.31635,112.4148,,...,,,,,,,,,,
2023-03-29 20:00:00,68.3769,137.2979,116.23645,75.94105,69.774,53.42755,85.6857,159.6797,113.54465,61.8899,...,,,,,,,,,,


In [9]:
import pandas as pd
import numpy as np

def generate_step_predictions(rolling_prediction_df, y_test_full, num_steps):
    """Collect the k-step-ahead forecasts from the rolling prediction table.

    For every ``step`` in ``0 .. num_steps - 1`` the ``step``-th sub-diagonal of
    the table's values is read and wrapped in a one-column frame named
    ``"<step+1>_step_prediction"``, indexed by the slice of ``y_test_full.index``
    that starts at position ``step``.

    Parameters
    ----------
    rolling_prediction_df : pandas.DataFrame
        Table whose sub-diagonals are read.
    y_test_full : pandas.Series or pandas.DataFrame
        Supplies the index for the returned frames.
    num_steps : int
        Number of sub-diagonals to extract.

    Returns
    -------
    list of pandas.DataFrame
        One frame per step whose index check passed, in step order.
    """
    forecast_matrix = rolling_prediction_df.values
    collected = []

    for offset in range(num_steps):
        diagonal = np.diag(forecast_matrix, -offset)
        stop = offset + len(diagonal)

        frame = pd.DataFrame(
            diagonal,
            index=y_test_full.index[offset:stop],
            columns=[f'{offset+1}_step_prediction'],
        )

        # Only keep frames whose index lines up with the held-out series.
        expected_index = y_test_full[offset:offset + len(frame)].index
        if not expected_index.equals(frame.index):
            print(f"Error: Index mismatch for {offset}-step prediction.")
            continue

        collected.append(frame)

    return collected

In [10]:
# One frame per forecast step (1-step-ahead first), aligned to the held-out index.
predictions = generate_step_predictions(rolling_prediction_df, y_test_full, total_forecast_len)

In [11]:
# predictions[0] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [12]:
# predictions[1] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [13]:
# predictions[2] # 1 initial prediction + 48 steps of update and prediction from the test set.

In [14]:
# Score every forecast step against the matching slice of held-out targets.
# The number of steps follows total_forecast_len instead of a literal 12.
step_sizes = np.arange(1, total_forecast_len + 1)
for step, prediction_series in zip(step_sizes, predictions):
    actuals = y_test_full[step - 1:step + test_size]
    if not actuals.index.equals(prediction_series.index):
        # Report the skipped step rather than silently leaving it out.
        print(f"{step} Step RMSE skipped: prediction index does not match the test index.")
        continue
    # The square root is taken explicitly because the `squared=False` argument
    # of mean_squared_error is deprecated in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(actuals, prediction_series))
    print(f"{step} Step RMSE for model: {rmse}")

1 Step RMSE for model: 26.873944212930382
2 Step RMSE for model: 32.192773984286696
3 Step RMSE for model: 32.10796971710036
4 Step RMSE for model: 35.87653897393983
5 Step RMSE for model: 36.166545484732644
6 Step RMSE for model: 37.03738021516168
7 Step RMSE for model: 36.71256754955673
8 Step RMSE for model: 38.48953552564711
9 Step RMSE for model: 41.6672234327553
10 Step RMSE for model: 41.04732256979469
11 Step RMSE for model: 38.082117376534526
12 Step RMSE for model: 32.4862783332566
