# XGBoost Model

In [1]:
# Load the necessary packages
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from darts import TimeSeries
from darts.models import XGBModel


In [2]:
# Read the pre-split train and test sets from disk
train_df = pd.read_csv('../../data/Final_data/train_df.csv')
test_df = pd.read_csv('../../data/Final_data/test_df.csv')

# Parse the 'Date' column of both frames into datetimes
for frame in (train_df, test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

# Target series: the day-ahead price indexed by 'Date', stored as float32
series_train = TimeSeries.from_dataframe(
    train_df, time_col='Date', value_cols='Day_ahead_price (€/MWh)'
).astype('float32')
series_test = TimeSeries.from_dataframe(
    test_df, time_col='Date', value_cols='Day_ahead_price (€/MWh)'
).astype('float32')

# Dataframe columns that are handed to the model as future covariates.
# The order is significant: it fixes the component order of the covariate series.
future_covariates_columns = [
    'Solar_radiation (W/m2)',
    'Wind_speed (m/s)',
    'Temperature (°C)',
    'Biomass (GWh)',
    'Hard_coal (GWh)',
    'Hydro (GWh)',
    'Lignite (GWh)',
    'Natural_gas (GWh)',
    'Other (GWh)',
    'Pumped_storage_generation (GWh)',
    'Solar_energy (GWh)',
    'Wind_offshore (GWh)',
    'Wind_onshore (GWh)',
    'Net_total_export_import (GWh)',
    'BEV_vehicles',
    'Oil_price (EUR)',
    'TTF_gas_price (€/MWh)',
    'Nuclear_energy (GWh)',
    'Lag_1_day',
    'Lag_2_days',
    'Lag_3_days',
    'Lag_4_days',
    'Lag_5_days',
    'Lag_6_days',
    'Lag_7_days',
    'Day_of_week',
    'Month',
    'Rolling_mean_7',
]

# Covariate series for the train and test periods, also as float32
future_covariates_train = TimeSeries.from_dataframe(
    train_df, time_col='Date', value_cols=future_covariates_columns
).astype('float32')
future_covariates_test = TimeSeries.from_dataframe(
    test_df, time_col='Date', value_cols=future_covariates_columns
).astype('float32')

# Stack train on top of test (original row labels are kept) and make sure
# 'Date' is a datetime column in the combined frame
df = pd.concat([train_df, test_df]).assign(Date=lambda combined: pd.to_datetime(combined['Date']))

df

Unnamed: 0,Date,Day_ahead_price (€/MWh),Solar_radiation (W/m2),Wind_speed (m/s),Temperature (°C),Biomass (GWh),Hard_coal (GWh),Hydro (GWh),Lignite (GWh),Natural_gas (GWh),...,Lag_1_day,Lag_2_days,Lag_3_days,Lag_4_days,Lag_5_days,Lag_6_days,Lag_7_days,Day_of_week,Month,Rolling_mean_7
0,2012-01-08,26.83,17.54,5.21,3.74,98.605,189.718,48.467,354.178,256.892,...,32.58,36.26,20.35,32.16,35.03,33.82,18.19,6,1,31.00
1,2012-01-09,47.91,13.04,4.24,3.80,98.605,344.154,49.054,382.756,282.438,...,26.83,32.58,36.26,20.35,32.16,35.03,33.82,0,1,33.02
2,2012-01-10,45.77,28.71,4.30,4.81,98.605,360.126,51.143,334.267,267.311,...,47.91,26.83,32.58,36.26,20.35,32.16,35.03,1,1,34.55
3,2012-01-11,47.83,21.58,4.08,5.14,98.605,360.330,50.693,385.000,277.343,...,45.77,47.91,26.83,32.58,36.26,20.35,32.16,2,1,36.79
4,2012-01-12,43.10,25.12,6.77,4.98,98.605,306.521,50.732,332.985,266.820,...,47.83,45.77,47.91,26.83,32.58,36.26,20.35,3,1,40.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,2024-07-24,66.61,225.04,3.47,17.54,110.007,43.469,85.857,199.246,194.291,...,79.62,88.75,58.45,59.32,86.47,90.75,76.79,2,7,75.71
755,2024-07-25,78.34,272.71,2.12,17.85,110.410,50.676,82.632,195.983,209.610,...,66.61,79.62,88.75,58.45,59.32,86.47,90.75,3,7,73.94
756,2024-07-26,93.04,172.33,2.60,19.09,110.852,42.333,79.531,205.273,205.773,...,78.34,66.61,79.62,88.75,58.45,59.32,86.47,4,7,74.88
757,2024-07-27,80.74,176.67,2.05,19.63,110.479,33.307,74.958,184.012,216.412,...,93.04,78.34,66.61,79.62,88.75,58.45,59.32,5,7,77.94


### Hyperparameter Tuning

##### Tuning with only the train/test split (each trial is scored directly on the test set)

In [3]:
# Optuna search budget: number of trials, and the n_jobs value handed to study.optimize
n_trials, n_jobs = 50, 8

: 

In [4]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import XGBModel
from darts.dataprocessing.transformers import Scaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler, StandardScaler
import plotly.graph_objs as go
from darts.metrics import mape, rmse, mse, mae
import optuna

# Future covariates as TimeSeries: one for the training period, and one for the
# combined train+test frame built on a daily frequency with missing dates filled in
future_covariates_train = TimeSeries.from_dataframe(
    train_df, time_col='Date', value_cols=future_covariates_columns
).astype('float32')
future_covariates_full = TimeSeries.from_dataframe(
    df,
    time_col='Date',
    value_cols=future_covariates_columns,
    fill_missing_dates=True,
    freq="D",
).astype('float32')

# Window the test covariates have to span: `input_chunk_length` days before the
# first test date up to the last day that is forecast
input_chunk_length = 300  # look-back in days; equals the upper bound of the tuned lag range
test_start_date = pd.Timestamp(test_df['Date'].iloc[0])
required_start_date = test_start_date - pd.DateOffset(days=input_chunk_length)
required_end_date = test_start_date + pd.DateOffset(days=len(series_test) - 1)

# Only warn (execution continues) when the full covariate series is too short
covers_required_range = (
    future_covariates_full.start_time() <= required_start_date
    and future_covariates_full.end_time() >= required_end_date
)
if not covers_required_range:
    print("Warning: The future_covariates_full is not long enough to cover the required input chunk length and prediction range.")
    # Extend the future_covariates_full or adjust your dataset

# Cut the covariates down to that window; it reaches back into the training period
future_covariates_test = future_covariates_full.slice(required_start_date, required_end_date)

# Target: fit a max-abs scaler on the training series, then reuse it for the test series
scaler_series = Scaler(MaxAbsScaler())
series_train_scaled = scaler_series.fit_transform(series_train)
series_test_scaled = scaler_series.transform(series_test)

# Covariates: same procedure with a separate scaler fitted on the training covariates
scaler_covariates = Scaler(MaxAbsScaler())
future_covariates_train_scaled = scaler_covariates.fit_transform(future_covariates_train)
future_covariates_test_scaled = scaler_covariates.transform(future_covariates_test)

# Define the Optuna objective function
def objective(trial):
    """Optuna objective: train one XGBModel candidate and score its test forecast.

    Hyperparameters are drawn from `trial`, the model is fitted on the scaled
    training series and covariates, and a forecast covering the whole test
    series is produced and mapped back to the original price scale.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error between `series_test` and the forecast.
    """
    # Draw the candidate hyperparameters; the dict literal evaluates top to
    # bottom, so the suggest_* calls happen in this exact order
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'input_chunk_length': trial.suggest_int('input_chunk_length', 10, 300),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }
    # The look-back length is passed as `lags`; the remaining entries are
    # forwarded to XGBModel as keyword arguments
    target_lags = sampled.pop('input_chunk_length')

    candidate = XGBModel(
        lags=target_lags,
        output_chunk_length=1,
        lags_future_covariates=[0],
        random_state=42,
        **sampled,
    )

    # Fit on the scaled training data
    candidate.fit(series_train_scaled, future_covariates=future_covariates_train_scaled, verbose=False)

    # Forecast as many steps as the test series has, then undo the target scaling
    horizon = len(series_test_scaled)
    forecast_scaled = candidate.predict(n=horizon, future_covariates=future_covariates_test_scaled)
    forecast = scaler_series.inverse_transform(forecast_scaled)

    # Score with mean squared error against the unscaled test series
    return mse(series_test, forecast)

# Create an Optuna study that minimises the objective's return value and run
# the search with the configured number of trials and jobs
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)

# Extract the best hyperparameters from the Optuna study
best_params = study.best_params

# Print the best hyperparameters. '\n' starts the message on a fresh line;
# the previous '/n' was not an escape sequence and was printed literally.
print('\n Best hyperparameters: ', best_params)

# Collect the tuned values that XGBModel takes under the same keyword names
tuned_xgb_kwargs = {
    name: best_params[name]
    for name in (
        'max_depth',
        'learning_rate',
        'n_estimators',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'gamma',
    )
}

# Final model: tuned look-back and booster settings, one-step output chunk,
# only the current value of each future covariate, fixed seed
best_model = XGBModel(
    lags=best_params['input_chunk_length'],
    output_chunk_length=1,
    lags_future_covariates=[0],
    random_state=42,
    **tuned_xgb_kwargs,
)

# Fit on the scaled training series and covariates
best_model.fit(series_train_scaled, future_covariates=future_covariates_train_scaled)

# Forecast one step per test observation
horizon = len(series_test_scaled)
forecast_scaled = best_model.predict(n=horizon, future_covariates=future_covariates_test_scaled)

# Bring the forecast back to the original price scale
forecast = scaler_series.inverse_transform(forecast_scaled)

# DataFrame versions of the actual and forecast series, as input for Plotly
test_df_plotly = series_test.pd_dataframe()
forecast_df_plotly = forecast.pd_dataframe()

# One line per series: actual prices in dark blue, forecast in red
actual_trace = go.Scatter(
    x=test_df_plotly.index,
    y=test_df_plotly['Day_ahead_price (€/MWh)'],
    mode='lines',
    name='Actual Test Data',
    line=dict(color='darkblue'),
)
forecast_trace = go.Scatter(
    x=forecast_df_plotly.index,
    y=forecast_df_plotly['Day_ahead_price (€/MWh)'],
    mode='lines',
    name='XGBoost Forecast on Test Data',
    line=dict(color='red'),
)

# Legend pinned to the top-right corner of the plot with a thin black border
legend_style = dict(
    x=1,
    y=1,
    xanchor='right',
    yanchor='top',
    bordercolor='black',
    borderwidth=1,
)

# Assemble the figure (actual trace first, forecast second) and set the layout
fig = go.Figure(data=[actual_trace, forecast_trace])
fig.update_layout(
    title='XGBoost Model - Test Performance Only',
    xaxis_title='Date',
    yaxis_title='Day Ahead Price (€/MWh)',
    legend=legend_style,
    template='plotly_white',
)

# Render the figure
fig.show()


# Report the test-set errors using Darts' metric functions
# (each is called with the actual series first and the forecast second)
for label, metric in (
    ('Mean Absolute Error', mae),
    ('Mean Absolute Percentage Error', mape),
    ('Mean Squared Error', mse),
    ('Root Mean Squared Error', rmse),
):
    print(f'{label} on Test Set: {metric(series_test, forecast)}')



[I 2024-10-08 14:18:39,601] A new study created in memory with name: no-name-de246c2f-1ec5-4298-8fe9-541e4f2a42c3


In [27]:
# Save the figure, the error metrics and the tuned hyperparameters.
# Every file name carries `n_trials`, so results from different search budgets
# are written to separate files.
output_dir = Path("../../predictions/XGBoost")
# Create the target folder up front; the writes below fail if it does not exist
output_dir.mkdir(parents=True, exist_ok=True)

# Static PNG export of the forecast figure
fig.write_image(str(output_dir / f"XGBoost_forecast_{n_trials}.png"))

# One-row table holding the four test-set error metrics
error_metrics = pd.DataFrame({
    'MAE': [mae(series_test, forecast)],
    'MAPE': [mape(series_test, forecast)],
    'MSE': [mse(series_test, forecast)],
    'RMSE': [rmse(series_test, forecast)],
})
error_metrics.to_csv(output_dir / f"XGBoost_error_metrics_{n_trials}.csv", index=False)

# Also save hyperparameters of the model in a csv file (one row, one column per parameter)
best_params_df = pd.DataFrame(best_params, index=[0])
best_params_df.to_csv(output_dir / f"XGBoost_hyperparameters_{n_trials}.csv", index=False)