In [1]:
import pandas as pd
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from my_pipeline import TransformedTargetForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.utils.plotting import plot_series
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore")

### Binning - Cleaned

Selective Binning

In [2]:
X_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/X_train.csv",
    parse_dates=["date"],
    index_col="date",
)

y_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/train/y_train.csv",
    parse_dates=["date"],
    index_col="date",
)

X_train = X_train.sort_values(by="date")
X_train = X_train.asfreq("H")
y_train = y_train.sort_values(by="date")
y_train = y_train.asfreq("H")

In [3]:
X = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/test/X_test.csv",
    parse_dates=["date"],
    index_col="date",
)

y = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/test/y_test.csv",
    parse_dates=["date"],
    index_col="date",
)

X = X.sort_values(by="date")
X = X.asfreq("H")
y = y.sort_values(by="date")
y = y.asfreq("H")

In [4]:
# Drop weekly_profile from X_train and X
X_train_wp = X_train["weekly_profile"]
X_wp = X["weekly_profile"]

X_train = X_train.drop(columns=["weekly_profile"])
X = X.drop(columns=["weekly_profile"])

In [5]:
# do the same for system_marginal_price_avg, rolling_std, rolling_median, rolling_min, rolling_max, exp_moving_avg, rolling_mean

X_train_smp_avg = X_train["system_marginal_price_avg"]
X_smp_avg = X["system_marginal_price_avg"]
X_train_rolling_std = X_train["rolling_std"]
X_rolling_std = X["rolling_std"]
X_train_rolling_median = X_train["rolling_median"]
X_rolling_median = X["rolling_median"]
X_train_rolling_min = X_train["rolling_min"]
X_rolling_min = X["rolling_min"]
X_train_rolling_max = X_train["rolling_max"]
X_rolling_max = X["rolling_max"]
X_train_exp_moving_avg = X_train["exp_moving_avg"]
X_exp_moving_avg = X["exp_moving_avg"]
X_train_rolling_mean = X_train["rolling_mean"]
X_rolling_mean = X["rolling_mean"]

X_train = X_train.drop(columns=["system_marginal_price_avg", "rolling_std", "rolling_median", "rolling_min", "rolling_max", "exp_moving_avg", "rolling_mean"])
X = X.drop(columns=["system_marginal_price_avg", "rolling_std", "rolling_median", "rolling_min", "rolling_max", "exp_moving_avg", "rolling_mean"])

In [6]:
import pickle

with open('my_dict.pickle', 'rb') as file:
    opt = pickle.load(file)

optimum_bins_per_column = opt

In [7]:
X_train_transformed = pd.DataFrame()

# Loop over the columns
for column in X_train.columns:
    # Retrieve the optimum number of bins for this column
    n_bins = optimum_bins_per_column[column]

    # Create a discretizer with the optimum number of bins
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Fit and transform the column
    X_column = np.array(X_train[column]).reshape(-1,1)
    result = est.fit_transform(X_column)

    # Add the transformed column to the new DataFrame
    X_train_transformed[column] = result.ravel()

X_train_transformed.index = X_train.index

In [8]:
X_train = X_train_transformed

In [9]:
X_transformed = pd.DataFrame()

# Loop over the columns
for column in X.columns:
    # Retrieve the optimum number of bins for this column
    n_bins = optimum_bins_per_column[column]

    # Create a discretizer with the optimum number of bins
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Fit and transform the column
    X_column = np.array(X[column]).reshape(-1,1)
    result = est.fit_transform(X_column)

    # Add the transformed column to the new DataFrame
    X_transformed[column] = result.ravel()

X_transformed.index = X.index

In [10]:
X = X_transformed

In [11]:
len(X_transformed.columns)

21

In [12]:
# Convert all dtyes to category
X_train = X_train.astype("int64")
X = X.astype("int64")

In [13]:
# Add weekly_profile back to X_train and X
X_train["weekly_profile"] = X_train_wp
X["weekly_profile"] = X_wp
X_train["system_marginal_price_avg"] = X_train_smp_avg
X["system_marginal_price_avg"] = X_smp_avg
X_train["rolling_std"] = X_train_rolling_std
X["rolling_std"] = X_rolling_std
X_train["rolling_median"] = X_train_rolling_median
X["rolling_median"] = X_rolling_median
X_train["rolling_min"] = X_train_rolling_min
X["rolling_min"] = X_rolling_min
X_train["rolling_max"] = X_train_rolling_max
X["rolling_max"] = X_rolling_max
X_train["exp_moving_avg"] = X_train_exp_moving_avg
X["exp_moving_avg"] = X_exp_moving_avg
X_train["rolling_mean"] = X_train_rolling_mean
X["rolling_mean"] = X_rolling_mean

In [14]:
# Move weekly_profile, system_marginal_price_avg, rolling_std, rolling_median, rolling_min, rolling_max, exp_moving_avg, rolling_mean to the front of the dataframe

columns_to_move = [
    'weekly_profile',
    'system_marginal_price_avg',
    'rolling_std',
    'rolling_median',
    'rolling_min',
    'rolling_max',
    'exp_moving_avg',
    'rolling_mean'
]

# Create a new DataFrame with the desired column order
X_train = X_train[columns_to_move + [col for col in X_train.columns if col not in columns_to_move]]
X = X[columns_to_move + [col for col in X_train.columns if col not in columns_to_move]]

In [15]:
X_test = X[:"2023-03-30"]
y_test_full = y[:"2023-03-30"]
forecast_len = 12

In [16]:
X_train.columns == X_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [20]:
from cat_reduce import make_reduction
from sktime.transformations.series.boxcox import LogTransformer
from lightgbm import LGBMRegressor
from my_pipeline import ForecastingPipeline
from sktime.transformations.compose import ColumnwiseTransformer
from sklearn.preprocessing import StandardScaler
from sktime.transformations.series.adapt import TabularToSeriesAdaptor


def initialize_lgbm_forecaster():
    pipe = ForecastingPipeline(
        steps=[
            (
                "forecaster",
                TransformedTargetForecaster(
                    [
                        (
                            "forecast",
                            make_reduction(
                                LGBMRegressor(
                                    # device="gpu",
                                    num_threads=11,
                                    n_estimators=500,
                                ),
                                window_length=24,
                                strategy="direct",
                            ),
                        ),
                    ]
                ),
            ),
        ]
    )

    return pipe


lgbm_pipeline = initialize_lgbm_forecaster()

fh = ForecastingHorizon(np.arange(1, 12 + 1))

In [21]:
rolling_prediction_df = pd.DataFrame(index=y_test_full.index)
rolling_prediction_low_df = pd.DataFrame(index=y_test_full.index)
rolling_prediction_high_df = pd.DataFrame(index=y_test_full.index)

In [22]:
lgbm_pipeline.fit(y=y_train, X=X_train, fh=fh)



In [23]:
y_pred = lgbm_pipeline.predict(fh, X=X_train.tail(1))
y_pred.columns = [f"cutoff_hour_{lgbm_pipeline.cutoff.hour[0]}"]
rolling_prediction_df = pd.concat([rolling_prediction_df, y_pred], axis=1)

In [24]:
rolling_prediction_df

Unnamed: 0,cutoff_hour_23
2023-02-01 00:00:00,97.916575
2023-02-01 01:00:00,103.685346
2023-02-01 02:00:00,133.966891
2023-02-01 03:00:00,108.863609
2023-02-01 04:00:00,87.001920
...,...
2023-03-30 19:00:00,
2023-03-30 20:00:00,
2023-03-30 21:00:00,
2023-03-30 22:00:00,


In [25]:
# emulating the rolling prediction for the next hours

for i in range(0, len(y_test_full), 12):

        new_observation_y, new_observation_X  = y_test_full[i:i+12], X_test[i:i+12]
        
        new_observation_y = new_observation_y.asfreq('H')
        new_observation_X = new_observation_X.asfreq('H')

        print(f'Updating with actual values at {new_observation_y.index[0]}')
        print(f'Cut off before update: {lgbm_pipeline.cutoff}')

        lgbm_pipeline.update(y=new_observation_y, X=new_observation_X, update_params=True)

        print(f'Cut off after update: {lgbm_pipeline.cutoff}')

        lgbm_pipeline.cutoff.freq = 'H'

        cutoff_time = lgbm_pipeline.cutoff
        prediction_for = cutoff_time + pd.DateOffset(hours=i)

        print(f'Predicting for {prediction_for}')
        
        y_pred = lgbm_pipeline.predict(fh, X=new_observation_X)
        
        y_pred.columns = [f"cutoff_hour_{lgbm_pipeline.cutoff.hour[0]}"]
        
        rolling_prediction_df = pd.concat([rolling_prediction_df, y_pred], axis=1)
        
        print(f'Update and prediction done for {new_observation_y.index[0]}')
        print(f'----------------------------------------------------------------------------------')

Updating with actual values at 2023-02-01 00:00:00
Cut off before update: DatetimeIndex(['2023-01-31 23:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-02-01 00:00:00
----------------------------------------------------------------------------------
Updating with actual values at 2023-02-01 12:00:00
Cut off before update: DatetimeIndex(['2023-02-01 11:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Cut off after update: DatetimeIndex(['2023-02-01 23:00:00'], dtype='datetime64[ns]', name='date', freq='H')
Predicting for DatetimeIndex(['2023-02-02 11:00:00'], dtype='datetime64[ns]', name='date', freq=None)
Update and prediction done for 2023-02-01 12:00:00
---------------------------------------------------------------------------

In [26]:
rmse_list = []
fold_actuals = []
fold_predictions_list = []
fold_predictions_low_list = []
fold_predictions_high_list = []

for col in range(rolling_prediction_df.shape[1]-1):
    
    fold_predictions = rolling_prediction_df.iloc[:, col].dropna()
    
    fold_indices = fold_predictions.index  

    y_test_subset = y_test_full.loc[fold_indices]  
    
    rmse = np.sqrt(mean_squared_error(y_test_subset, fold_predictions))  
    
    rmse_list.append(rmse)

    fold_actuals.append(y_test_subset)
    fold_predictions_list.append(fold_predictions)

In [27]:
# Print Average RMSE of all folds
print(f"Average RMSE for each fold: {np.mean(rmse_list)}")

Average RMSE for each fold: 145.6520300990141


In [28]:
# Print hightst 5 RMSE
print(f"Top 5 RMSE for each fold: {np.sort(rmse_list)[-5:]}")

Top 5 RMSE for each fold: [315.58778221 321.77499991 381.10641123 435.12383961 521.82515233]


In [29]:
y_hist = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/filtered_target_medium.csv",
    parse_dates=["date"],
    index_col="date",
)

y_hist = y_hist.sort_values(by="date")
y_hist = y_hist.asfreq("H")

In [30]:
results_df = pd.DataFrame(columns=['Date','Data', 'RMSE'])

ddf = pd.DataFrame(columns=['HistoricalPrice',	'FuturePrice',	'Predicted', 'timestep'])

for i in range(len(fold_actuals)):

    df = y_hist[y_hist.index < fold_predictions_list[i].index[0]]

    df = df.iloc[-24:,:]
    
    predictions = np.array(fold_predictions_list[i])
    
    date_index = fold_actuals[i].index
    
    hist = pd.DataFrame(df.iloc[-12:,:]['price']).rename(columns={'price':'HistoricalPrice'})
    
    fitu = pd.DataFrame(fold_actuals[i]).rename(columns={'price':'FuturePrice'})
    
    pred = pd.DataFrame(predictions, index=date_index).rename(columns={0:'Predicted'})

    histfitu = pd.merge(hist, fitu, how='outer', left_index=True, right_index=True)
    
    hfp = pd.merge(histfitu, pred, how='outer', left_index=True, right_index=True)

    hfp['timestep'] = i
    
    hfp['periodstep'] = range(1, len(hfp)+1)
    
    hfp = hfp.reset_index()
    
    results_df = results_df.append({'Date':df.index[-1],
                                    'Data' : hfp
                                    }, ignore_index=True)
                                    
    ddf = pd.concat([ddf,hfp], axis=0)

In [31]:
len(fold_actuals), len(fold_predictions_list)

(116, 116)

In [35]:
import plotly.express as px

fig = px.line(ddf, x="periodstep", y=["HistoricalPrice", "FuturePrice", "Predicted"], animation_frame="timestep")
fig.update_layout(height=700)  
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000
fig.show()

In [33]:
import requests

def get_aeso_predictions(start_date, end_date):
    url = "https://api.aeso.ca/report/v1.1/price/poolPrice"
    headers = {
        "accept": "application/json",
        "X-API-Key": "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ6MHo4MnIiLCJpYXQiOjE2ODM1NzQyMTh9.Gbod9kjeDwP4SOJibSFof63X7GGZxbZdBmBVrgE409w",
    }
    params = {
        "startDate": start_date.date().strftime("%Y-%m-%d"),
        "endDate": end_date.date().strftime("%Y-%m-%d"),
    }

    response = requests.get(url, headers=headers, params=params)

    data = response.json()["return"]["Pool Price Report"]
    df = pd.DataFrame(data)
    df["actual"] = pd.to_numeric(df["pool_price"])
    df["forecast"] = pd.to_numeric(df["forecast_pool_price"])
    return df

In [34]:
aeso_predictions_df = get_aeso_predictions(y_test_full.index[0], y_test_full.index[-1])
rmse_aeso_predictions = mean_squared_error(aeso_predictions_df['actual'], aeso_predictions_df['forecast'], squared=False)
print(f"RMSE for the predictions by AESO for the same time period as the test set: {round(rmse_aeso_predictions, 2)} CAD/MWh")

RMSE for the predictions by AESO for the same time period as the test set: 112.51 CAD/MWh
