# Avg RMSE of predictions by AESO

### 6 hour predictions by AESO

We have obtained the six-hour predictions made by AESO every hour from 25th May 15:00:00 to 31st May 23:00:00. They were saved as JSON files, which are preprocessed below.

In [47]:
import os
import re
from datetime import datetime, timedelta
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

def get_current_time(file_path):
    """
    Get the current time from a file path.

    The path must contain a timestamp of the form ``YYYY-MM-DD_HH-MM``. The
    returned value is that timestamp moved back by one hour.

    Args:
        file_path (str): The path of the file.

    Returns:
        datetime: The timestamp found in the file path, minus one hour.

    Raises:
        ValueError: If no ``YYYY-MM-DD_HH-MM`` timestamp is present in
            ``file_path``.
    """
    match = re.search(r"\d{4}-\d{2}-\d{2}_\d{2}-\d{2}", file_path)
    if match is None:
        # Fail with a clear message here; otherwise the subtraction below
        # would be attempted on None and raise an unrelated TypeError.
        raise ValueError(f"No date and time found in the string: {file_path!r}")

    # Convert the matched text to a datetime object
    date_time = datetime.strptime(match.group(0), "%Y-%m-%d_%H-%M")
    return date_time - timedelta(hours=1)


def process_pool_price_data(file_path):
    """
    Load a pool price report from a JSON file and keep its forecast-only rows.

    A row is kept when its ``pool_price`` and ``rolling_30day_avg`` are both
    empty strings while its ``forecast_pool_price`` is not.

    Args:
        file_path (str): The path of the JSON file.

    Returns:
        pandas.DataFrame: The kept rows, restricted to the
        ``begin_datetime_mpt`` and ``forecast_pool_price`` columns.
    """
    raw = pd.read_json(file_path)
    report = pd.DataFrame(raw['return']['Pool Price Report'])

    # Build the three conditions separately so each one is readable.
    no_actual_price = report['pool_price'] == ''
    no_rolling_avg = report['rolling_30day_avg'] == ''
    has_forecast = report['forecast_pool_price'] != ''
    keep = no_actual_price & no_rolling_avg & has_forecast

    return report.loc[keep, ['begin_datetime_mpt', 'forecast_pool_price']]


def create_wider_df(filtered_df, file_path):
    """
    Create a one-row wide DataFrame with current time and forecasted pool prices.

    The row holds ``current_alberta_time`` (derived from ``file_path`` via
    ``get_current_time``) followed by columns ``T0`` .. ``T5`` containing the
    first six ``forecast_pool_price`` values of ``filtered_df`` in row order.
    When ``filtered_df`` has fewer than six rows, the remaining columns are
    filled with an empty string so every returned frame has the same columns.

    Args:
        filtered_df (pandas.DataFrame): The filtered DataFrame; must have a
            ``forecast_pool_price`` column.
        file_path (str): The path of the file.

    Returns:
        pandas.DataFrame: A single-row DataFrame with columns
        ``current_alberta_time``, ``T0`` .. ``T5``.
    """
    n_steps = 6

    forecasts = filtered_df['forecast_pool_price'].tolist()[:n_steps]
    # Pad short inputs. The original loop was bounded by min(6, len), which
    # made its '' padding branch unreachable and produced a ragged schema.
    forecasts += [''] * (n_steps - len(forecasts))

    row = {'current_alberta_time': [get_current_time(file_path)]}
    for step, value in enumerate(forecasts):
        row[f'T{step}'] = [value]

    return pd.DataFrame(row)


def process_folder(folder_path):
    """
    Process a folder containing JSON files.

    Every file whose name ends with ``.json`` is run through
    ``process_pool_price_data`` and ``create_wider_df``; the resulting
    one-row frames are concatenated in sorted file-name order.

    Args:
        folder_path (str): The path of the folder.

    Returns:
        pandas.DataFrame: The concatenated DataFrame of processed files,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.json`` files.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    json_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )
    if not json_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No .json files found in folder: {folder_path!r}")

    frames = []
    for file_name in json_files:
        file_path = os.path.join(folder_path, file_name)
        filtered_df = process_pool_price_data(file_path)
        frames.append(create_wider_df(filtered_df, file_path))

    return pd.concat(frames, ignore_index=True)


def sort_and_save_data(df, output_file):
    """
    Write the DataFrame to CSV, ordered by ``current_alberta_time``.

    The input frame is not modified; the CSV is written without the index.

    Args:
        df (pandas.DataFrame): The DataFrame to be sorted and saved.
        output_file (str): The path of the output CSV file.
    """
    ordered = df.sort_values('current_alberta_time', ignore_index=True)
    ordered.to_csv(output_file, index=False)


# Main execution: build the table of forecasts from the raw JSON files
folder_path = '../../data/raw/pool_price/'
output_file = 'formatted_data.csv'

# Concatenate all processed files and index the result by the time column
df = process_folder(folder_path).set_index('current_alberta_time')
# sort_and_save_data(df, output_file)


### Actual pool price from AESO 

We have the actual pool price for every hour in the mentioned timeframe, as published by AESO, which is obtained using an API.

In [48]:
import os

import requests

url = 'https://api.aeso.ca/report/v1.1/price/poolPrice'

# SECURITY: the API key is read from the environment instead of being stored
# in the source. The key that was previously committed here should be treated
# as exposed and revoked.
api_key = os.environ.get('AESO_API_KEY')
if not api_key:
    raise RuntimeError('Set the AESO_API_KEY environment variable to your AESO API key.')

headers = {
    'accept': 'application/json',
    'X-API-Key': api_key,
}
params = {
    'startDate': '2023-05-25',
    'endDate': '2023-06-17'
}

# A timeout keeps the cell from hanging indefinitely; raise_for_status turns
# an HTTP error into an exception instead of a confusing KeyError below.
response = requests.get(url, headers=headers, params=params, timeout=30)
response.raise_for_status()
actual_df = pd.DataFrame(response.json()['return']['Pool Price Report'])

# NOTE(review): begin_datetime_mpt is not converted to datetime before this
# filter, so if the API returns it as text this is a lexicographic string
# comparison - confirm the boundary row is handled as intended.
actual_df = actual_df.loc[actual_df['begin_datetime_mpt'] >= '2023-05-25 23:00:00']
actual_df

Unnamed: 0,begin_datetime_utc,begin_datetime_mpt,pool_price,forecast_pool_price,rolling_30day_avg
24,2023-05-26 06:00,2023-05-26 00:00,87.53,68.53,147.78
25,2023-05-26 07:00,2023-05-26 01:00,86.98,86.26,147.83
26,2023-05-26 08:00,2023-05-26 02:00,64.87,59.28,147.83
27,2023-05-26 09:00,2023-05-26 03:00,59.22,58.86,147.67
28,2023-05-26 10:00,2023-05-26 04:00,112.05,92.64,147.47
...,...,...,...,...,...
571,2023-06-18 01:00,2023-06-17 19:00,38.36,43.8,181.02
572,2023-06-18 02:00,2023-06-17 20:00,42.32,34.57,180.99
573,2023-06-18 03:00,2023-06-17 21:00,40.46,47.26,180.98
574,2023-06-18 04:00,2023-06-17 22:00,45.25,48.15,180.99


### Transformed table for actual prices

We are creating a dataframe in such a way that at a timestep T, we have the actual price for the next 6 hours including the current hour in each row.

In [49]:


import numpy as np

# Build one row per hour of actual_df: P0 is the actual pool price of that
# hour and P1..P5 are the prices of the following five hours. Rows near the
# end of actual_df, where fewer than five later hours exist, get NaN in the
# columns that run past the data.
#
# Each Pk is produced with a single shift(-k) instead of concatenating one-row
# frames in a loop, which copied the growing frame on every iteration.
prices = actual_df['pool_price'].reset_index(drop=True)
new_df = pd.DataFrame({'begin_datetime_mpt': actual_df['begin_datetime_mpt'].to_numpy()})
for step in range(6):
    new_df[f'P{step}'] = prices.shift(-step)

# Keep only rows before the cut-off, then display the new dataframe
new_df = new_df.loc[new_df['begin_datetime_mpt'] < '2023-06-14 20:00:00']
new_df

Unnamed: 0,begin_datetime_mpt,P0,P1,P2,P3,P4,P5
0,2023-05-26 00:00,87.53,86.98,64.87,59.22,112.05,261.51
1,2023-05-26 01:00,86.98,64.87,59.22,112.05,261.51,700.22
2,2023-05-26 02:00,64.87,59.22,112.05,261.51,700.22,699.60
3,2023-05-26 03:00,59.22,112.05,261.51,700.22,699.60,458.33
4,2023-05-26 04:00,112.05,261.51,700.22,699.60,458.33,64.29
...,...,...,...,...,...,...,...
472,2023-06-14 16:00,34.76,31.38,27.50,26.02,38.56,40.59
473,2023-06-14 17:00,31.38,27.50,26.02,38.56,40.59,29.27
474,2023-06-14 18:00,27.50,26.02,38.56,40.59,29.27,21.29
475,2023-06-14 19:00,26.02,38.56,40.59,29.27,21.29,18.47


In [50]:
# Check if there are any hourly missing rows in the dataframe

# Rebuild the frame with the converted column rather than assigning into
# new_df in place: new_df comes from a filtered selection, and in-place column
# assignment on such a frame triggers pandas' SettingWithCopyWarning.
new_df = new_df.assign(begin_datetime_mpt=pd.to_datetime(new_df['begin_datetime_mpt']))

# Create a complete hourly range from start to end. The frequency is given as
# a Timedelta because the 'H' string alias is deprecated in recent pandas.
start_date = new_df['begin_datetime_mpt'].min()
end_date = new_df['begin_datetime_mpt'].max()
complete_date_range = pd.date_range(start=start_date, end=end_date, freq=pd.Timedelta(hours=1))

# Hours of the complete range that do not appear in the DataFrame
missing_dates = complete_date_range[~complete_date_range.isin(new_df['begin_datetime_mpt'])]

print(f"Number of missing dates: {len(missing_dates)}")
if len(missing_dates) > 0:
    print("Missing dates are:")
    print(missing_dates)

Number of missing dates: 0


### Transformed table for predicted prices + actual price for that hour

We have now created a new table of the AESO predictions such that, for each timestep T, each row holds the predictions made at T - 1 for the next 6 hours. For example, the first row below, for T = 2023-05-26 00:00:00, contains the predictions made by AESO at 2023-05-25 23:00:00 for the next 6 hours.

In [51]:
# Actual pool price per hour, indexed by begin_datetime_mpt as datetimes
valid_df = actual_df[['begin_datetime_mpt', 'pool_price']].set_index('begin_datetime_mpt')
valid_df.index = pd.to_datetime(valid_df.index)

# Left-join on the index so every forecast row of df is kept, then turn the
# index back into a regular column.
merged_df = df.merge(valid_df, how='left', left_index=True, right_index=True).reset_index()
merged_df

Unnamed: 0,current_alberta_time,T0,T1,T2,T3,T4,T5,pool_price
0,2023-05-26 12:00:00,177.09,65.5,60.88,58.68,58.9,47.16,69.79
1,2023-05-26 13:00:00,65.5,65.5,63.16,58.9,58.97,45.29,61.23
2,2023-05-26 14:00:00,65.5,60.94,58.9,42.96,42.42,39.73,64.64
3,2023-05-26 15:00:00,60.94,701.43,95.37,65.62,45.17,42.16,120.55
4,2023-05-26 16:00:00,701.43,621.94,105.88,57.21,42.62,42.33,427.80
...,...,...,...,...,...,...,...,...
101,2023-05-30 19:00:00,47.15,163.69,132.4,182.97,152.73,45.82,142.40
102,2023-05-30 20:00:00,163.69,159.44,139.15,134.02,42.83,41.65,143.65
103,2023-05-30 21:00:00,159.44,55.36,57.94,42.68,41.65,36.48,66.97
104,2023-05-30 22:00:00,55.36,37.44,28.55,26.28,24.28,22.07,43.54


## Check for any missing dates

In [52]:
start_date = merged_df['current_alberta_time'].min()
end_date = merged_df['current_alberta_time'].max()
# The hourly frequency is given as a Timedelta because the 'H' string alias is
# deprecated in recent pandas.
complete_date_range = pd.date_range(start=start_date, end=end_date, freq=pd.Timedelta(hours=1))

# Reindex the merged DataFrame with the complete date range; hours that have
# no row become all-NaN rows.
merged_df = merged_df.set_index('current_alberta_time').reindex(complete_date_range)

# Convert the value columns to numbers (unparseable entries such as '' become
# NaN), then fill missing values with the mean of each column.
value_cols = ['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'pool_price']
merged_df[value_cols] = merged_df[value_cols].apply(pd.to_numeric, errors='coerce')
merged_df = merged_df.fillna(merged_df.mean())

# Reset the index of the DataFrame with the new name 'current_alberta_time'
merged_df = merged_df.reset_index().rename(columns={'index': 'current_alberta_time'})


## Average of RMSEs of each fold

In [53]:
# Actual pool prices by AESO, restricted to the evaluation window. Both bounds
# are evaluated on new_df itself; the original applied the lower bound to an
# already-filtered frame using a mask built from new_df.
in_window = (
    (new_df['begin_datetime_mpt'] > '2023-05-26 11:00:00')
    & (new_df['begin_datetime_mpt'] < '2023-05-31 00:00:00')
)
actual_df = new_df.loc[in_window]

# Predicted pool prices by AESO, up to the same upper bound.
# NOTE(review): no lower bound is applied here - confirm that pred_df and
# actual_df start at the same hour before comparing them row by row.
pred_df = merged_df.loc[merged_df['current_alberta_time'] < '2023-05-31 00:00:00']

In [54]:

import numpy as np
from sklearn.metrics import mean_squared_error

actual_cols = ['P0', 'P1', 'P2', 'P3', 'P4', 'P5']
pred_cols = ['T0', 'T1', 'T2', 'T3', 'T4', 'T5']

# Rows are paired by position, so the two frames must have the same length.
if len(actual_df) != len(pred_df):
    raise ValueError(
        f"actual_df has {len(actual_df)} rows but pred_df has {len(pred_df)}; "
        "they must cover the same hours to be compared row by row."
    )

actual_values = actual_df[actual_cols].to_numpy(dtype=float)
pred_values = pred_df[pred_cols].to_numpy(dtype=float)

# RMSE of each row's six actual prices against its six forecasts. The square
# root is taken explicitly because the `squared=False` argument of
# mean_squared_error has been removed from recent scikit-learn releases.
rmse_values = [
    float(np.sqrt(mean_squared_error(actual_row, pred_row)))
    for actual_row, pred_row in zip(actual_values, pred_values)
]

# Create new dataframe with begin_datetime_mpt and RMSE
df_rmse = pd.DataFrame({
    'begin_datetime_mpt': actual_df['begin_datetime_mpt'],
    'rmse': rmse_values
})

df_rmse

Unnamed: 0,begin_datetime_mpt,rmse
12,2023-05-26 12:00:00,203.768462
13,2023-05-26 13:00:00,200.866043
14,2023-05-26 14:00:00,316.037891
15,2023-05-26 15:00:00,416.841223
16,2023-05-26 16:00:00,512.201149
...,...,...
115,2023-05-30 19:00:00,88.478267
116,2023-05-30 20:00:00,68.297422
117,2023-05-30 21:00:00,40.034342
118,2023-05-30 22:00:00,6.052561


The above table shows the RMSE for the 6 hour predictions made at `begin_datetime_mpt - 1 hour`. 
The average of these RMSE values, each computed from the predictions and the actual pool prices over a 6-hour window, is calculated below -

In [55]:
# Average only the 'rmse' column: df_rmse also holds the begin_datetime_mpt
# column, which should not take part in the mean. Selecting with a list keeps
# the result a Series labelled 'rmse'.
avg_rmse = df_rmse[['rmse']].mean()
avg_rmse

rmse    98.397752
dtype: float64

## 1 step, 2 step RMSE for dates 2023-05-25 15:00:00 to 2023-05-31 23:00:00

To generate the RMSE for each of the 1, 2, ... 6 step predictions made by AESO, we first create 6 dataframes, each containing the predictions made for that timestep and the actual prices. Then we calculate the average RMSE for each time step.

In [56]:
"""
This code generates lagged DataFrames based on the 'merged_df' DataFrame, shifting the 'pool_price' values by different lags from 0 to 5. 
The resulting lagged DataFrames are stored in 'dataframes', with separate variables assigned to each DataFrame for lags 0 to 5:
 df_1_lag, df_2_lag, df_3_lag, df_4_lag, and df_5_lag.
 
"""

dataframes = []
merged_df = merged_df.loc[merged_df['current_alberta_time'] < '2023-05-31 00:00:00'] 
for lag in range(6):
    df = merged_df.copy()
    df.iloc[:, lag:lag+1] = df.iloc[:, lag:lag+1].shift(lag)
    col_name= f'T{lag}'
    df = df[[col_name, 'pool_price']]
    dataframes.append(df)

df_1_lag = dataframes[0]  # DataFrame with 0 lag
df_2_lag = dataframes[1]  # DataFrame with 1 lag
df_3_lag = dataframes[2]  # DataFrame with 2 lag
df_4_lag = dataframes[3]  # DataFrame with 3 lag
df_5_lag = dataframes[4]  # DataFrame with 4 lag
df_6_lag = dataframes[5]  # DataFrame with 5 lag


The RMSE for each of the 1 step, 2 step, ... 6 step predictions, calculated from the predictions and the actual pool prices, and their average are computed below -

In [57]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt

# df_1_lag .. df_6_lag hold the T0 .. T5 forecasts respectively, each next to
# the actual 'pool_price'.
dataframes = [df_1_lag, df_2_lag, df_3_lag, df_4_lag, df_5_lag, df_6_lag]
table_data = []
rmse_list = []

actual_col = 'pool_price'
for i, lag_df in enumerate(dataframes):
    predicted_col = f'T{i}'

    # Remove rows with missing values
    df_cleaned = lag_df.dropna(subset=[predicted_col, actual_col])

    # Calculate RMSE. The square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error has been removed from
    # recent scikit-learn releases.
    rmse = sqrt(mean_squared_error(df_cleaned[actual_col], df_cleaned[predicted_col]))

    print(f"RMSE for {i + 1} step forecast is {rmse:.2f}")
    table_data.append([f"{i+1} Step RMSE", rmse])
    rmse_list.append(rmse)

# Average of the six per-step RMSE values, appended as the table's last row
avg_rmse = sum(rmse_list) / len(rmse_list)
table_data.append(["Avg Fold RMSE", avg_rmse])
rmse_table = pd.DataFrame(table_data, columns=["", "RMSE (CAD)"])
rmse_table = rmse_table.round(2)
rmse_table.to_csv("aeso_error.csv")

RMSE for 1 step forecast is 104.79
RMSE for 2 step forecast is 116.36
RMSE for 3 step forecast is 143.97
RMSE for 4 step forecast is 152.41
RMSE for 5 step forecast is 191.53
RMSE for 6 step forecast is 217.49
