In [3]:
import math
import numpy as np
import pandas as pd
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA
import pmdarima as pm
import time
import xgboost as xgb


from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm


In [4]:
# Load the raw table from a path relative to the notebook's working directory.
# get_ts() below treats each row as one series and skips the first column.
full_df = pd.read_csv('../data/m4/daily-train.csv')

def get_ts(full_df, index):
    """Extract one series from the wide table as a single-column frame.

    Row ``index`` of ``full_df`` is taken without its first column and turned
    into a column named ``'y'`` whose index is the original column labels.
    Entries with a missing value are removed.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Wide table with one series per row; the first column is skipped.
    index : int
        Positional row number of the series to extract.

    Returns
    -------
    pandas.DataFrame
        Frame with a single ``'y'`` column containing the non-missing values.
    """
    # A one-row slice (rather than a scalar row lookup) keeps a 2-D frame,
    # so the transpose yields exactly one column.
    single_row = full_df.iloc[index:index + 1, 1:]
    series = single_row.T
    series.columns = ['y']

    has_value = series['y'].notna()
    return series[has_value]

# Work with the series stored in row 1 of the raw table
# (another candidate to try: get_ts(full_df, 6)).
df = get_ts(full_df, 1)
# Show the last few observations of the selected series.
df.tail()

Unnamed: 0,y
V1003,2978.0
V1004,2991.9
V1005,2995.3
V1006,3000.5
V1007,2968.5


In [5]:
def get_features_from_lags(df, feature_count=7, keep_y=False):
    """Build a supervised-learning table of lagged differences from ``df['y']``.

    For every row the features are computed from the *previous* observation,
    because the current value is unknown at forecasting time:

    * ``lag_k`` (k = 1..feature_count): previous value minus the value
      ``k`` steps before the previous one.
    * ``day``: the row's position in ``df`` modulo 7 (a positional counter,
      not a calendar weekday).
    * ``lag_to_predict``: current value minus previous value (the target).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'y'`` column.
    feature_count : int, default 7
        Number of ``lag_k`` columns to create.
    keep_y : bool, default False
        If True the original ``'y'`` column is kept in the result.

    Returns
    -------
    pandas.DataFrame
        The feature table without its first row, which has no previous value
        and therefore cannot be predicted. Early rows still contain NaN in
        the lag columns that reach before the start of the series.
    """
    features = df[['y']].copy()

    # Everything is expressed relative to the previous observation.
    features['previous'] = features['y'].shift(1)
    previous = features['previous']

    for step in range(1, feature_count + 1):
        features['lag_{}'.format(step)] = previous - previous.shift(step)

    features['day'] = np.arange(len(features)) % 7
    features['lag_to_predict'] = features['y'] - previous

    # 'previous' is only a helper column; 'y' is dropped unless requested.
    helper_columns = ['previous'] if keep_y else ['previous', 'y']
    return features[1:].drop(helper_columns, axis=1)

def get_future_lags(df, feature_count=7):
    """Compute lagged differences relative to the *current* value of ``df['y']``.

    Unlike ``get_features_from_lags``, no row is dropped, the ``'y'`` column
    is kept, and each ``lag_k`` is the current value minus the value ``k``
    steps earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'y'`` column.
    feature_count : int, default 7
        Number of ``lag_k`` columns to create.

    Returns
    -------
    pandas.DataFrame
        Columns ``'y'``, ``lag_1`` .. ``lag_<feature_count>`` and ``'day'``
        (row position modulo 7), with the same index as ``df``.

    NOTE(review): ``day`` is the position of the row itself, not of the step
    that would follow it -- confirm the caller accounts for that when these
    rows are used as inputs for a forecast.
    """
    future = df[['y']].copy()
    current = future['y']

    for step in range(1, feature_count + 1):
        future['lag_{}'.format(step)] = current - current.shift(step)

    future['day'] = np.arange(len(future)) % 7
    return future

# Build the lag-difference feature table for the selected series and display it.
f = get_features_from_lags(df)
f
# To inspect the forecasting-time features instead, display get_future_lags(df).


Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,day,lag_to_predict
V3,,,,,,,,1,0.1
V4,0.1,,,,,,,2,9.9
V5,9.9,10.0,,,,,,3,2.1
V6,2.1,12.0,12.1,,,,,4,-3.5
V7,-3.5,-1.4,8.5,8.6,,,,5,-7.3
...,...,...,...,...,...,...,...,...,...
V1003,-41.3,-71.4,-76.3,-69.8,-55.7,-59.9,-70.1,0,14.1
V1004,14.1,-27.2,-57.3,-62.2,-55.7,-41.6,-45.8,1,13.9
V1005,13.9,28.0,-13.3,-43.4,-48.3,-41.8,-27.7,2,3.4
V1006,3.4,17.3,31.4,-9.9,-40.0,-44.9,-38.4,3,5.2


In [7]:
def train_xgb(df, in_sample_validation=False, validation_size=50):
    """Fit an XGBoost regressor predicting ``lag_to_predict`` from the other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``'lag_to_predict'`` target column; every
        other column is used as a feature.
    in_sample_validation : bool, default False
        If True, train on all of ``df`` and evaluate early stopping on the
        same rows. If False, hold out the trailing ``validation_size`` rows
        for evaluation and train on the rest.
    validation_size : int, default 50
        Number of trailing rows held out when ``in_sample_validation`` is
        False. Ignored otherwise.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted regressor.

    Raises
    ------
    ValueError
        If out-of-sample validation is requested and ``validation_size`` is
        not strictly between 0 and ``len(df)``, i.e. either the training or
        the validation split would be empty.
    """
    # NOTE(review): early_stopping_rounds equals n_estimators, so early
    # stopping can hardly end training before the last round -- confirm that
    # this is intended.
    xgb_reg_params = {
        'learning_rate': 0.01,
        'max_depth': 40,
        'early_stopping_rounds': 100,
        'eval_metric': 'rmse',
        'n_estimators': 100,
    }

    if in_sample_validation:
        # Validate against in-sample data.
        train = df
        validation = df
    else:
        # Validate against out-of-sample data: the trailing rows are held out.
        # Guard explicitly: a negative-stop slice on a short frame (or a size
        # of 0, since df[:-0] is empty) would silently yield an empty split.
        if not 0 < validation_size < len(df):
            raise ValueError(
                "validation_size must be between 1 and len(df) - 1 "
                "(got validation_size={}, len(df)={})".format(validation_size, len(df))
            )
        train = df.iloc[:-validation_size]
        validation = df.iloc[-validation_size:]

    X_train = train.drop('lag_to_predict', axis=1)
    y_train = train[['lag_to_predict']]

    reg = xgb.XGBRegressor(**xgb_reg_params)
    eval_set = [(validation.drop('lag_to_predict', axis=1), validation[['lag_to_predict']])]
    reg.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    return reg

def predict_xgb(df, model):
    """Return the predictions ``model`` produces for the feature rows in ``df``.

    ``df`` is passed unchanged to ``model.predict``; it is the caller's job to
    supply only the feature columns the model was trained on.
    """
    predictions = model.predict(df)
    return predictions

# Build the supervised table once, then hold out its last 10 rows as the test set.
lgs = get_features_from_lags(df)
train_df = lgs[:-10]
test_df = lgs[-10:].copy()  # copy so the 'yhat' column can be added without touching lgs

# By default train_xgb additionally holds out the last 50 rows of train_df
# as the evaluation set used for early stopping.
model = train_xgb(train_df)
# Predict from the feature columns only; the target column is dropped first.
test_df['yhat'] = predict_xgb(test_df.drop(['lag_to_predict'], axis=1), model)
# The error is measured on the one-step differences (lag_to_predict), not on the raw levels.
print("MSE:", mean_squared_error(test_df['lag_to_predict'], test_df['yhat']))

MSE: 465.10932008111195
