some snippets borrowed from https://github.com/KasperGroesLudvigsen/xgboost_time_series

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from typing import Tuple
from sklearn.multioutput import MultiOutputRegressor

In [2]:
from ESRNN.m4_data import *
from ESRNN.utils_evaluation import evaluate_prediction_owa
from ESRNN.utils_visualization import plot_grid_prediction

In [3]:
X_train_df, y_train_df, X_test_df, y_test_df = prepare_m4_data(dataset_name="Hourly",
                                                               directory="../data/M4",
                                                               num_obs=414)





In [4]:
target_variable = 'y'
timestemp_col = 'ds'

hyperparameters = {
    "in_length": 7*24,             # Using 7 days (168 hours) of past observations
    "step_size": 24,               # Sliding the window by 24 steps each time
    "target_sequence_length": 2*24, # Forecasting 48 hours ahead
    "n_estimators": 20,            # Number of gradient boosted trees
    "max_depth": 6,                # Maximum depth of a tree
    "subsample": 0.5,              # Fraction of samples to be used for fitting each tree
    "min_child_weight": 1,         # Minimum sum of instance weight (hessian) needed in a child
    "selected_features": [target_variable]  # Features selected for training the model
}

In [5]:
y_train_df

Unnamed: 0,unique_id,ds,y
0,H1,1970-01-01 00:00:00,605.0
1,H1,1970-01-01 01:00:00,586.0
2,H1,1970-01-01 02:00:00,586.0
3,H1,1970-01-01 03:00:00,559.0
4,H1,1970-01-01 04:00:00,511.0
...,...,...,...
353495,H99,1970-01-29 23:00:00,27926.0
353496,H99,1970-01-30 00:00:00,26744.0
353497,H99,1970-01-30 01:00:00,25829.0
353498,H99,1970-01-30 02:00:00,25421.0


In [6]:
def get_indices_entire_sequence(data: pd.DataFrame, hyperparameters: dict) -> list:
    """
    Produce all the start and end index positions that are needed to produce
    the sub-sequences for the dataset.

    Args:
        data (pd.DataFrame): Partitioned data set, e.g., training data
        hyperparameters (dict): A dictionary containing the hyperparameters
        
    Return:
        indices: a list of tuples
    """

    window_size = hyperparameters['in_length'] + hyperparameters['target_sequence_length']
    step_size = hyperparameters['step_size']
    stop_position = len(data) - 1

    subseq_first_idx = 0
    subseq_last_idx = window_size

    indices = []

    while subseq_last_idx < stop_position:
        indices.append((subseq_first_idx, subseq_last_idx))
        subseq_first_idx += step_size
        subseq_last_idx += step_size

    return indices

In [7]:
def get_x_y(
        indices: list,
        data: pd.DataFrame,
        target_variable: str,
        target_sequence_length: int,
        input_seq_len: int
) -> Tuple[np.array, np.array]:
    
    print ("Preparing data...")
    """
    Obtaining the model inputs and targets (X,Y)
    """
    
    y_data = data[target_variable].values

    for i, idx in enumerate(indices):

        data_instance = y_data[idx[0]:idx[1]]

        x = data_instance[0: input_seq_len]
        y = data_instance[input_seq_len:input_seq_len + target_sequence_length]

        assert len(x) == input_seq_len
        assert len(y) == target_sequence_length

        if i == 0:
            X = x.reshape(1, -1)
            Y = y.reshape(1, -1)
        else:
            X = np.concatenate((X, x.reshape(1, -1)), axis=0)
            Y = np.concatenate((Y, y.reshape(1, -1)), axis=0)

    print ("Finished preparing data!")

    return X, Y

In [8]:
y_hat_df_xgb = pd.DataFrame()

for series_id in y_train_df['unique_id'].unique():

    print(f"Processing {series_id}...")

    series_train = y_train_df[y_train_df['unique_id'] == series_id]
    series_test = y_test_df[y_test_df['unique_id'] == series_id]

    training_indices = get_indices_entire_sequence(
        data=series_train,
        hyperparameters=hyperparameters)

    x_train, y_train = get_x_y(
        indices=training_indices, 
        data=series_train[hyperparameters["selected_features"]],
        target_variable=target_variable,
        target_sequence_length=hyperparameters["target_sequence_length"],
        input_seq_len=hyperparameters["in_length"]
    )

    x_test = series_train[hyperparameters["selected_features"]].iloc[-hyperparameters["in_length"]:].to_numpy().reshape(1, -1)

    model = xgb.XGBRegressor(
        n_estimators=hyperparameters["n_estimators"],
        max_depth=hyperparameters["max_depth"],
        subsample=hyperparameters["subsample"],
        min_child_weight=hyperparameters["min_child_weight"],
        objective="reg:squarederror",
        tree_method="hist"
    )

    trained_model = MultiOutputRegressor(model).fit(x_train, y_train)

    y_hat = trained_model.predict(x_test).reshape(-1, 1)

    series_predictions = pd.DataFrame({
    'unique_id': series_id,
    'ds': series_test['ds'].values,  
    'y': y_hat.squeeze()
    })

    y_hat_df_xgb = pd.concat([y_hat_df_xgb, series_predictions], axis=0)
    # print(y_hat_df_xgb)

    print(f"Finished {series_id}!") 



Processing H1...
Preparing data...
Finished preparing data!
Finished H1!
Processing H10...
Preparing data...
Finished preparing data!
Finished H10!
Processing H100...
Preparing data...
Finished preparing data!
Finished H100!
Processing H101...
Preparing data...
Finished preparing data!
Finished H101!
Processing H102...
Preparing data...
Finished preparing data!
Finished H102!
Processing H103...
Preparing data...
Finished preparing data!
Finished H103!
Processing H104...
Preparing data...
Finished preparing data!
Finished H104!
Processing H105...
Preparing data...
Finished preparing data!
Finished H105!
Processing H106...
Preparing data...
Finished preparing data!
Finished H106!
Processing H107...
Preparing data...
Finished preparing data!
Finished H107!
Processing H108...
Preparing data...
Finished preparing data!
Finished H108!
Processing H109...
Preparing data...
Finished preparing data!
Finished H109!
Processing H11...
Preparing data...
Finished preparing data!
Finished H11!
Process

In [9]:
y_hat_df_xgb.reset_index(inplace=True, drop=True)
y_hat_df_xgb = y_hat_df_xgb.rename(columns={'y' : 'y_hat'})

In [10]:
evaluate_prediction_owa(y_hat_df_xgb, y_train_df, X_test_df, y_test_df, naive2_seasonality=24)

OWA: 1.147 
SMAPE: 22.128 
MASE: 2.611 


(1.1469583150597575, 2.6110341848765204, 22.128071070124957)

In [11]:
y_hat_df_xgb.to_csv('../results/m4/y_hat_df_xgb.csv', index=False)
