In [None]:
import pandas as pd
import copy
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import numpy as np
from sktime.forecasting.arima import ARIMA
from sktime.forecasting.compose import TransformedTargetForecaster
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.base import ForecastingHorizon
from sklearn.metrics import mean_squared_error
from sktime.utils.plotting import plot_series
from sktime.forecasting.compose import make_reduction
from sktime.transformations.series.boxcox import LogTransformer
from lightgbm import LGBMRegressor
from sktime.forecasting.compose import ForecastingPipeline
from sktime.transformations.compose import ColumnwiseTransformer
import requests
from datetime import datetime, timedelta
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

In [None]:
# Load the processed exogenous features and the price target; both CSVs are
# indexed by their "date" column.
X_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/filtered_features_medium.csv",
    parse_dates=["date"],
    index_col="date",
)

y_train = pd.read_csv(
    "https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/filtered_target_medium.csv",
    parse_dates=["date"],
    index_col="date",
)

# Number of hourly steps to forecast.
forecast_len = 12

# Sort BEFORE slicing: label slicing on a DatetimeIndex is only reliable when
# the index is monotonic. Then keep everything from 2023-04-10 onwards and put
# both frames on a regular hourly grid (asfreq inserts NaN rows for any hour
# missing from the CSV).
X_train = X_train.sort_index().loc["2023-04-10":].asfreq("H")
y_train = y_train.sort_index().loc["2023-04-10":].asfreq("H")

# Feature columns with no negative value are the ones that get log-transformed.
# Built in column order (rather than via set arithmetic) so the list is the
# same on every run.
# NOTE(review): columns containing zeros still qualify and would map to -inf
# under a plain log — confirm the data never contains exact zeros.
cols_for_log_transform = [
    col for col in X_train.columns if not X_train[col].lt(0).any()
]

In [None]:
# Display the feature column names available after loading.
X_train.columns 

In [None]:
def _build_lgbm_pipeline(window_length=24, **lgbm_params):
    """Build the shared forecasting pipeline used by all three forecasters.

    The pipeline log-transforms the exogenous columns named in the module-level
    ``cols_for_log_transform`` (read when this function is called), then
    log-transforms the target and forecasts it with a direct-strategy
    reduction around a LightGBM regressor.

    Parameters
    ----------
    window_length : int, default 24
        Number of lagged observations handed to the reduction forecaster.
    **lgbm_params
        Extra keyword arguments forwarded to ``LGBMRegressor`` on top of the
        fixed ``device="gpu"``, ``num_threads=6`` and ``n_estimators=1``.

    Returns
    -------
    ForecastingPipeline
        An unfitted pipeline.
    """
    # NOTE(review): n_estimators=1 trains a single tree per horizon step and
    # device="gpu" requires a GPU-enabled LightGBM build — confirm both are
    # intended outside of quick experiments.
    regressor = LGBMRegressor(
        device="gpu", num_threads=6, n_estimators=1, **lgbm_params
    )

    target_forecaster = TransformedTargetForecaster(
        [
            ("LogTransformer", LogTransformer()),
            (
                "forecast",
                make_reduction(
                    regressor,
                    window_length=window_length,
                    strategy="direct",
                ),
            ),
        ]
    )

    return ForecastingPipeline(
        steps=[
            (
                "price_column_transformer",
                ColumnwiseTransformer(LogTransformer(), columns=cols_for_log_transform),
            ),
            ("forecaster", target_forecaster),
        ]
    )


def initialize_lgbm_forecaster():
    """Return the unfitted point-forecast pipeline (LightGBM default objective)."""
    return _build_lgbm_pipeline()


def initialize_lgbm_forecaster_low():
    """Return the unfitted lower-bound pipeline (quantile objective, alpha=0.025)."""
    return _build_lgbm_pipeline(objective="quantile", alpha=0.025)


def initialize_lgbm_forecaster_high():
    """Return the unfitted upper-bound pipeline (quantile objective, alpha=0.975)."""
    return _build_lgbm_pipeline(objective="quantile", alpha=0.975)

In [None]:
# Point forecaster plus the two quantile forecasters (alpha 0.025 / 0.975)
# that supply the lower and upper bounds.
lgbm_pipeline, lgbm_pipeline_low, lgbm_pipeline_high = (
    initialize_lgbm_forecaster(),
    initialize_lgbm_forecaster_low(),
    initialize_lgbm_forecaster_high(),
)

# Forecast steps 1..forecast_len.
fh = ForecastingHorizon(np.arange(forecast_len) + 1)

In [None]:
# Fit all three pipelines on the same data and horizon, in the original order
# (point forecast, lower bound, upper bound).
for pipeline in (lgbm_pipeline, lgbm_pipeline_low, lgbm_pipeline_high):
    pipeline.fit(y=y_train, X=X_train, fh=fh)

# Keep the fitted upper-bound pipeline as the cell's displayed value, as before.
lgbm_pipeline_high

In [None]:
# Every predict call receives the last row of the exogenous features.
X_latest = X_train.tail(1)

# Point forecast.
y_pred = lgbm_pipeline.predict(fh, X=X_latest).set_axis(["predictions"], axis=1)

# Lower quantile forecast.
y_pred_lower = lgbm_pipeline_low.predict(fh, X=X_latest).set_axis(
    ["lower_bound"], axis=1
)

# Upper quantile forecast.
y_pred_higher = lgbm_pipeline_high.predict(fh, X=X_latest).set_axis(
    ["upper_bound"], axis=1
)

In [None]:
# Combine point forecast and bounds into one frame. Selecting only the
# "predictions" column keeps this cell idempotent: re-running it no longer
# appends the bound columns to an already-combined y_pred.
y_pred = pd.concat([y_pred[["predictions"]], y_pred_lower, y_pred_higher], axis=1)
y_pred

In [None]:
# Display the feature column names again; they are combined with "price" to
# label the lagged inputs in the following cell.
X_train.columns

In [None]:
# Names used to label the lagged model inputs: the target ("price") first,
# followed by every exogenous feature column.
features = ["price", *X_train.columns]

In [None]:
# Lag-window size; must match the window_length given to make_reduction.
window_length = 24
n_cols = len(features)

# Fetch the most recent window once (previously fetched twice, with the first
# result unused) and unpack it into target and exogenous parts.
# NOTE(review): _get_last_window is a private sktime method and presumably
# returns values on the transformed (log) scale — confirm on sktime upgrades.
last_window = lgbm_pipeline.forecaster_.forecaster_._get_last_window()
y_last, X_last = last_window

# Lay the window out as (1, n_cols, window_length): row 0 is the target,
# rows 1.. are the exogenous features.
X_pred = np.zeros((1, n_cols, window_length))
X_pred[:, 0, :] = y_last
X_pred[:, 1:, :] = X_last.T

# Flatten to a 1-D vector: all lags of "price", then all lags of each feature.
X_pred = X_pred.flatten()

In [None]:
# One column name per (feature, lag) pair, counting lags down from 24 to 1 so
# the names line up with the flattened X_pred vector.
reduced_feature_list = [
    f'{feature}_lag{lag}'
    for feature in features
    for lag in range(24, 0, -1)
]

# Single-row frame of the model inputs, rounded to three decimals.
X_reduced_df = pd.DataFrame([X_pred], columns=reduced_feature_list).round(3)

X_reduced_df.iloc[[0]]

In [None]:
# Fitted regressors held by the reduction forecaster inside the point-forecast
# pipeline. Presumably one per horizon step (direct strategy): the SHAP cell
# below pairs them one-to-one with the rows of y_pred — verify if fh changes.
lgbm_models = lgbm_pipeline.forecaster_.forecaster_.estimators_

In [None]:
import shap
shap.initjs()

# Collected per model: the SHAP row, the explainer's expected value mapped back
# through the pipeline's target transformers, and the raw expected value on the
# model's (transformed) scale.
shap_rows = []
base_list = []
base_transformed = []

for model in lgbm_models:
    explainerModel = shap.TreeExplainer(model)

    # SHAP contributions of every lagged input for the single input row.
    shap_values = explainerModel.shap_values(X_reduced_df).flatten()
    shap_rows.append(shap_values)

    expected_value = explainerModel.expected_value
    base_transformed.append(expected_value)
    # NOTE(review): _get_inverse_transform is a private sktime method.
    base_list.append(
        lgbm_pipeline.forecaster_._get_inverse_transform(
            lgbm_pipeline.forecaster_.transformers_pre_,
            np.array([expected_value]),
        )[0]
    )

# Build the frame once instead of enlarging it one row at a time.
shap_df = pd.DataFrame(shap_rows, columns=reduced_feature_list)

shap_df['date'] = y_pred.index
shap_df['base_val'] = base_list
shap_df['base_val_transformed'] = base_transformed

In [None]:
# Number of features reported individually per forecast row; everything else
# is pooled into a single "other" entry.
n_top_features = 4
shap_meta_columns = ['date', 'base_val', 'base_val_transformed']

feature_records = []
for _, shap_row in shap_df.iterrows():
    row_date = shap_row['date']
    row_base = shap_row['base_val']
    row_base_transformed = shap_row['base_val_transformed']

    # Rank by absolute SHAP value so strong negative contributors are kept.
    # Ranking by signed value pushed them into "other", even though the
    # sentence generator ranks by magnitude and reports decreases.
    contributions = shap_row.drop(shap_meta_columns).astype(float)
    ranked = contributions.sort_values(key=abs, ascending=False)

    for feature_name, shap_value in ranked.iloc[:n_top_features].items():
        feature_records.append({
            'Date': row_date,
            'Feature': feature_name,
            'Value': shap_value,
            'Base': row_base,
            'Base_transformed': row_base_transformed,
        })

    # Sum of all remaining contributions.
    feature_records.append({
        'Date': row_date,
        'Feature': 'other',
        'Value': ranked.iloc[n_top_features:].sum(),
        'Base': row_base,
        'Base_transformed': row_base_transformed,
    })

# Build the frame once rather than concatenating inside the loop.
top_features_df = pd.DataFrame(
    feature_records,
    columns=['Date', 'Feature', 'Value', 'Base', 'Base_transformed'],
)

# Express each contribution relative to the base value on the model's scale.
top_features_df['Value'] = top_features_df['Value']/top_features_df['Base_transformed']
top_features_df = top_features_df.drop('Base_transformed', axis=1)

In [None]:
# Timestamp of the first forecast row.
original_date = top_features_df['Date'].iloc[0]

# The 12 hourly timestamps immediately before that first forecast.
start_date = original_date - pd.DateOffset(hours=12)
dates = pd.date_range(start=start_date, periods=12, freq='H')

# Copy the first forecast hour's five rows (top 4 features plus "other") onto
# each of those earlier timestamps, with a fresh 0..n-1 index.
first_hour_rows = top_features_df.iloc[:5]
duplicated_rows = pd.concat(
    [first_hour_rows.assign(Date=date) for date in dates],
    ignore_index=True,
)

In [None]:
# Forecast-hour explanations plus the back-filled copies, renumbered and then
# ordered by date.
final_feature_df = (
    pd.concat([top_features_df, duplicated_rows])
    .reset_index(drop=True)
    .sort_values(by=['Date'])
)
final_feature_df.head()

In [None]:
# Overwrite the `shap` Delta table in the Hive warehouse (requires a Spark session; left disabled)
# from pyspark.sql.types import DoubleType
# df = spark.createDataFrame(final_feature_df)
# df = df.withColumn("Base", df["Base"].cast('long'))
# df.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/shap")

In [None]:
def generate_sentence_dataframe(df):
    """Turn per-date feature contributions into one explanatory sentence per date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Feature', 'Value' and 'Base'.
        'Value' is treated as a fraction (0.1 is reported as 10.000%).
        Rows whose 'Feature' is 'other' are never named in a sentence.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' and 'Sentence', one row per distinct date, sorted by
        date ascending. Each sentence quotes the first 'Base' value of the
        date and the (up to) two features with the largest absolute 'Value'.
    """
    # Parse dates on a copy so the caller's frame is left untouched.
    frame = df.assign(Date=pd.to_datetime(df['Date']))

    records = []
    for date, group in frame.groupby('Date'):
        # Base value quoted for this date.
        base_value = group['Base'].iloc[0]

        sentence = f"The average power price of the past month is ${base_value:.1f}. The top two variables that impact the prediction are:"

        # The two named features with the largest absolute contribution.
        named_features = group[group['Feature'] != 'other']
        top_two = named_features.sort_values('Value', key=abs, ascending=False).head(2)

        for feature, value in zip(top_two['Feature'], top_two['Value']):
            change_type = "decreases" if value < 0 else "increases"

            # Magnitude as a percentage, printed with three decimal places.
            change_percentage = abs(value) * 100

            sentence += f" {feature} {change_type} the prediction by {change_percentage:.3f}%,"

        # Swap the trailing comma for a full stop.
        sentence = sentence.rstrip(",") + "."

        records.append({'Date': date, 'Sentence': sentence})

    # Build the result once instead of concatenating inside the loop.
    result_df = pd.DataFrame(records, columns=['Date', 'Sentence'])

    return result_df.sort_values('Date', ascending=True)

# Preview the first few generated explanation sentences.
generate_sentence_dataframe(final_feature_df).head()

In [None]:
# explain_feature_importance = generate_sentence_dataframe(final_feature_df)
# df_shap_explain = spark.createDataFrame(explain_feature_importance)
# df_shap_explain.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/shap_explain")

In [None]:
# Forecast table with the first column renamed to 'price' and the index
# exposed as a column (renamed to 'date' if it comes out as 'index').
price_prediction = (
    y_pred.rename(columns={y_pred.columns[0]: 'price'})
    .reset_index()
    .rename(columns={'index': 'date'})
)

# Copy 1: keeps the point forecast, tagged 'past_predicted', bounds blanked.
price_prediction_upper = price_prediction.assign(
    indicator='past_predicted',
    upper_bound=np.nan,
    lower_bound=np.nan,
)

# Copy 2: keeps the bounds, with indicator and price blanked.
price_prediction_lower = price_prediction.assign(
    indicator=np.nan,
    price=np.nan,
)

# Stack both copies (original row labels are kept, so they repeat).
price_prediction = pd.concat([price_prediction_upper, price_prediction_lower])

In [None]:
price_prediction['date'] = pd.to_datetime(price_prediction['date'])

# First forecast timestamp and the start of the 12-hour look-back window.
first_row_date = price_prediction['date'].iloc[0]
start_time = first_row_date - pd.Timedelta(hours=12)

# Placeholder rows for the 12 hours before the forecast; 'price' is filled in
# later from the AESO pool-price report. The dict previously listed 'price'
# twice; the duplicate key is removed (column order and values are unchanged).
# NOTE(review): these rows carry the same 'past_predicted' indicator as the
# forecast rows — confirm that is the label the dashboard expects.
price_past = pd.DataFrame({
    'date': pd.date_range(start=start_time, periods=12, freq='H'),
    'price': [None] * 12,
    'upper_bound': [None] * 12,
    'lower_bound': [None] * 12,
    'indicator': ['past_predicted'] * 12,
})

In [None]:

import os

# Request window: one day before to one day after the first forecast timestamp.
start_date = (first_row_date - timedelta(days=1)).strftime('%Y-%m-%d')
end_date = (first_row_date + timedelta(days=1)).strftime('%Y-%m-%d')

url = 'https://api.aeso.ca/report/v1.1/price/poolPrice'

# The API key is read from the environment instead of being committed in the
# notebook. The key that used to be hard-coded here should be treated as
# leaked and rotated.
aeso_api_key = os.environ.get('AESO_API_KEY')
if not aeso_api_key:
    raise RuntimeError(
        "Set the AESO_API_KEY environment variable to your AESO API key "
        "before running this cell."
    )

headers = {
    'accept': 'application/json',
    'X-API-Key': aeso_api_key,
}

params = {
    'startDate': start_date,
    'endDate': end_date,
}

# A timeout stops the notebook hanging on a stalled connection, and
# raise_for_status surfaces HTTP errors here rather than as a confusing
# KeyError when the payload is parsed later.
response = requests.get(url, headers=headers, params=params, timeout=30)
response.raise_for_status()
api_data = response.json()

In [None]:
# Build the pool-price frame from `response` rather than from `api_data`
# itself, so this cell can be re-run (it used to fail the second time because
# `api_data` had already been replaced by a DataFrame).
api_data = (
    pd.DataFrame(response.json()['return']['Pool Price Report'])
    .drop(['begin_datetime_utc', 'forecast_pool_price', 'rolling_30day_avg'], axis=1)
    .assign(begin_datetime_mpt=lambda report: pd.to_datetime(report['begin_datetime_mpt']))
)
price_past['date'] = pd.to_datetime(price_past['date'])

# Attach the pool price to each past hour (inner join on the timestamp).
joined_df = pd.merge(price_past, api_data, left_on='date', right_on='begin_datetime_mpt')

# Fill the placeholder prices from the report and make the column numeric.
# NOTE(review): the API presumably returns prices as strings; anything that
# cannot be parsed becomes NaN — confirm that is acceptable downstream.
joined_df['price'] = pd.to_numeric(
    joined_df['price'].fillna(joined_df['pool_price']), errors='coerce'
)

joined_df = joined_df.drop(['begin_datetime_mpt', 'pool_price'], axis=1)

# Add the four input features for the same past hours.
input_features = pd.read_csv('https://raw.githubusercontent.com/slalom-ubc-mds/Power-Price-Prediction/main/data/processed/filtered_features_medium.csv')
input_features = input_features[['date','wind_supply_mix','wind_reserve_margin','gas_supply_mix','load_on_gas_reserve']]
input_features['date'] = pd.to_datetime(input_features['date'])
joined_df = pd.merge(joined_df, input_features, on='date')

# Past actuals followed by the forecast rows.
predicted_price = pd.concat([joined_df, price_prediction], axis=0, ignore_index=True)
predicted_price

In [None]:
# df_predicted_price = spark.createDataFrame(predicted_price)
# df_predicted_price = df_predicted_price.withColumn("price", df_predicted_price["price"].cast('Double'))
# df_predicted_price.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/predicted_price")