In [None]:
import sys
import os
sys.path.append("../")

In [None]:
import warnings
import time
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import random

from ngboost.scores import CRPScore, LogScore
from ngboost.learners import default_tree_learner
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn import model_selection
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_log_error, mean_squared_error



In [None]:
from uncertainty_estimation.uncertainty_estimation_models import Model, XGBoost, CQR, LightGBM, LSF, NGBoost, TFTPytorchFC, PGBM, LightGBMQuantileRegressor
from uncertainty_estimation.constants import DistEnum, PredEnum

Using /root/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py39_cu117/split_decision...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py39_cu117/split_decision/build.ninja...
Building extension module split_decision...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module split_decision...
  warn(f"Failed to load image Python extension: {e}")
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## 1 Data preparation

In [None]:
import os

# Locate the directory two levels above the current working directory; the
# dataset is expected in its `datasets/` sub-directory.
ue_dir_path = os.path.dirname(os.path.dirname(os.getcwd()))
full_df_path = os.path.join(ue_dir_path, 'datasets', 'rossmann_full_df.pickle')

# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
full_df = pd.read_pickle(full_df_path)

## REMOVE BEFORE PUBLISHING

In [None]:
# Optional debug-only truncation for quick smoke runs. Keep this at None for the
# real analysis: the outputs recorded in this notebook (e.g. 844338 rows) were
# produced on the untruncated data.
# NOTE(review): judging by the head() preview, the unsorted frame starts at the
# most recent date, so a small prefix likely leaves the training splits empty — confirm.
DEBUG_MAX_ROWS = None  # e.g. 10000 for a quick smoke test
if DEBUG_MAX_ROWS is not None:
    full_df = full_df.iloc[:DEBUG_MAX_ROWS, :]

In [None]:
# Preview the first rows, transposed so that every column name becomes a row.
full_df.head().T

Unnamed: 0,0,1,2,3,4
Store,1,2,3,4,5
DayOfWeek,5,5,5,5,5
Date,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00
Sales,5263.0,6064.0,8314.0,13995.0,4822.0
Customers,555.0,625.0,821.0,1498.0,559.0
Open,1.0,1.0,1.0,1.0,1.0
Promo,True,True,True,True,True
StateHoliday,False,False,False,False,False
SchoolHoliday,True,True,True,True,True
StoreType,c,a,a,c,a


After the data is loaded, it is sorted by entity (store) and date.

In [None]:
# Order the rows by store and, within each store, by date; then replace the
# index with a fresh 0..n-1 range (the old index is dropped).
full_df = full_df.sort_values(by=['Store', 'Date']).reset_index(drop=True)

In [None]:
# Show the latest and then the earliest date contained in the data.
display(full_df['Date'].max(), full_df['Date'].min())

Timestamp('2015-07-31 00:00:00')Timestamp('2013-01-01 00:00:00')

In [None]:
# Show the total number of rows in the dataset.
display(len(full_df))

844338

In [None]:
# Columns handled as categorical features; the vectorizers defined further down
# ordinal-encode columns from this list.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'StoreType', 'Assortment', 
    'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'Week', 'Promo_fw', 
    'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw', 'CompetitionDistance_na']

# Columns handled as continuous features (passed through the vectorizers without encoding).
# NOTE(review): `cont_vars` is not referenced again in the visible part of the notebook.
cont_vars = ['CompetitionDistance', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
# Cast the categorical columns to `object` dtype; this overwrites those columns of `full_df`.
full_df[cat_vars] = full_df[cat_vars].astype('object')

## 2 Vectorizer

In [None]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline

import category_encoders as ce
import feature_engine.imputation as fe

class CategoricalSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one column and keeps it as a DataFrame.

    The one-column DataFrame (rather than a Series) is what the ordinal encoder
    in the following pipeline step receives.

    NOTE(review): a class with this name is defined again in the next notebook
    cell; that later definition is the one in effect when the vectorizers are built.

    Parameters
    ----------
    field : hashable
        Name of the column to select.
    """

    def __init__(self, field):
        self.field = field

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, dataframe):
        """Return ``dataframe[[self.field]]``, i.e. a DataFrame with exactly one column.

        Raises ``KeyError`` (from pandas) when the column is absent.
        """
        return dataframe[[self.field]]


class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column for use in a FeatureUnion.

    NOTE(review): a class with this name is defined again in the next notebook
    cell; that later definition is the one in effect when the vectorizers are built.

    Parameters
    ----------
    field : hashable
        Name of the column to extract.
    """

    def __init__(self, field):
        self.field = field

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, dataframe):
        """Return the selected column.

        * categorical dtype -> integer category codes as an ``(n, 1)`` ndarray
        * numeric dtype     -> the values as an ``(n, 1)`` ndarray
        * anything else     -> the pandas Series, unchanged
        """
        column = dataframe[self.field]
        dt = column.dtype
        # Multi-dimensional indexing directly on a Series (``series[:, None]``) is
        # deprecated and rejected by recent pandas versions, so convert to a NumPy
        # array first and add the trailing axis there.
        if is_categorical_dtype(dt):
            return column.cat.codes.to_numpy()[:, None]
        elif is_numeric_dtype(dt):
            return column.to_numpy()[:, None]
        else:
            return column

        
def create_feature_vectorizer_without_nan():
    """Build the FeatureUnion whose encoders and imputer replace missing values.

    Every categorical column is ordinal-encoded with ``handle_missing="value"``
    and ``handle_unknown="value"`` (missing/unknown categories get ordinary
    codes instead of NaN), and missing ``CompetitionDistance`` entries are
    imputed with ``-1``. The remaining continuous columns are passed through
    without imputation.

    Returns
    -------
    FeatureUnion
        Unfitted union with one single-column pipeline per feature, in the
        order listed below (categoricals first, then continuous features).
    """
    # The tuple order defines the column order of the resulting feature matrix.
    # Generating the steps from this list guarantees that each step selects the
    # column it is named after; the hand-written version wired the
    # 'CompetitionDistance_na' step to 'SchoolHoliday_bw' by mistake.
    categorical_fields = (
        'Store', 'DayOfWeek', 'Year', 'Week', 'Month', 'Day', 'StateHoliday',
        'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear',
        'Promo2SinceYear', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw',
        'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw',
        'CompetitionDistance_na',
    )
    # Continuous columns that are passed through without any imputation.
    passthrough_fields = ('AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday')

    # Categoricals
    transformers = [
        (field,
         Pipeline([('select', CategoricalSelector(field)),
                   ('oe', ce.OrdinalEncoder(handle_missing="value", handle_unknown='value'))]))
        for field in categorical_fields
    ]

    # Continuous
    transformers.append(
        ('CompetitionDistance',
         Pipeline([('select', ItemSelector('CompetitionDistance')),
                   ('fe', fe.ArbitraryNumberImputer(arbitrary_number=-1))])))
    transformers.extend(
        (field, Pipeline([('select', ItemSelector(field))]))
        for field in passthrough_fields
    )

    vectorizer_tree = FeatureUnion(transformers, n_jobs=1)

    return vectorizer_tree

In [None]:
class CategoricalSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one column and keeps it as a DataFrame.

    NOTE(review): this duplicates the class of the same name defined in the
    previous cell; consider keeping a single definition.

    Parameters
    ----------
    field : hashable
        Name of the column to select.
    """

    def __init__(self, field):
        self.field = field

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, dataframe):
        """Return ``dataframe[[self.field]]``, i.e. a DataFrame with exactly one column.

        Raises ``KeyError`` (from pandas) when the column is absent.
        """
        return dataframe[[self.field]]


class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column for use in a FeatureUnion.

    NOTE(review): this duplicates the class of the same name defined in the
    previous cell; consider keeping a single definition.

    Parameters
    ----------
    field : hashable
        Name of the column to extract.
    """

    def __init__(self, field):
        self.field = field

    def fit(self, x, y=None):
        """Nothing to learn; return ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, dataframe):
        """Return the selected column.

        * categorical dtype -> integer category codes as an ``(n, 1)`` ndarray
        * numeric dtype     -> the values as an ``(n, 1)`` ndarray
        * anything else     -> the pandas Series, unchanged
        """
        column = dataframe[self.field]
        dt = column.dtype
        # Multi-dimensional indexing directly on a Series (``series[:, None]``) is
        # deprecated and rejected by recent pandas versions, so convert to a NumPy
        # array first and add the trailing axis there.
        if is_categorical_dtype(dt):
            return column.cat.codes.to_numpy()[:, None]
        elif is_numeric_dtype(dt):
            return column.to_numpy()[:, None]
        else:
            return column

def create_feature_vectorizer_with_nan():
    """Build the FeatureUnion variant that keeps missing values as NaN.

    Each categorical column is ordinal-encoded with ``handle_missing`` and
    ``handle_unknown`` set to ``'return_nan'``; the continuous columns are
    passed through unchanged (no imputation).

    NOTE(review): unlike ``create_feature_vectorizer_without_nan`` this union
    has no 'CompetitionDistance_na' step — confirm that this is intentional.

    Returns
    -------
    FeatureUnion
        Unfitted union with one single-column pipeline per feature.
    """
    # The order of both tuples fixes the column order of the feature matrix.
    encoded_columns = (
        'Store', 'DayOfWeek', 'Year', 'Week', 'Month', 'Day', 'StateHoliday',
        'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear',
        'Promo2SinceYear', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw',
        'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw',
    )
    raw_columns = (
        'CompetitionDistance', 'AfterStateHoliday', 'BeforeStateHoliday',
        'Promo', 'SchoolHoliday',
    )

    union_steps = []

    # Categoricals
    for column_name in encoded_columns:
        encoder = ce.OrdinalEncoder(handle_missing='return_nan', handle_unknown='return_nan')
        union_steps.append(
            (column_name,
             Pipeline([('select', CategoricalSelector(column_name)), ('oe', encoder)])))

    # Continuous
    for column_name in raw_columns:
        union_steps.append(
            (column_name, Pipeline([('select', ItemSelector(column_name))])))

    return FeatureUnion(union_steps, n_jobs=1)

In [None]:
# Create one (still unfitted) vectorizer of each flavour; these objects are
# passed to the model wrappers in the sections below.
vectorizer_without_nan = create_feature_vectorizer_without_nan()
vectorizer_with_nan = create_feature_vectorizer_with_nan()

In [None]:
# Sanity check: fit each vectorizer on the complete frame and print the shape of
# the resulting feature matrix.
# NOTE(review): this fits the shared vectorizer objects on all rows, including
# the later validation/test period — confirm the model wrappers refit them on
# their own training data.
print(vectorizer_without_nan.fit_transform(full_df).shape)
print(vectorizer_with_nan.fit_transform(full_df).shape)

(844338, 24)
(844338, 23)


## 3 Model Application

The models below consume the data in tabular form, so we create train, validation, and test splits according to the forecast horizon.
Each split only needs to contain the rows whose target we want to train on or predict. This means that the features in the same row are the only information taken into account when making a prediction.

In [None]:
# inputs required for networks taking in tabular data 
train_val_df = full_df[full_df_copy['Date'] < "20150427"].sort_values(['Store', 'Date'])
valid_df = full_df[(full_df_copy['Date'] < "20150614") & (full_df['Date'] >= "20150427")].sort_values(['Store', 'Date'])
train_df = full_df[full_df_copy['Date'] < "20150614"].sort_values(['Store', 'Date'])
test_df = full_df[(full_df_copy['Date'] >= "20150614")].sort_values(['Store', 'Date'])

display(train_df.tail())
display(valid_df.head())
display(valid_df.tail())
display(test_df.head())

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Id,AfterPromo,AfterStateHoliday,AfterSchoolHoliday,BeforePromo,BeforeStateHoliday,BeforeSchoolHoliday,Promo_bw,StateHoliday_bw,SchoolHoliday_bw,Promo_fw,StateHoliday_fw,SchoolHoliday_fw,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,CompetitionDistance_na
1115,2,2015-06-09T00:00:00.000+0000,5119.0,363.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,4,5,60,-6,0,-48,3.0,1.0,0.0,1.0,0.0,0.0,2015,6,24,9,1,160,False,False,False,False,False,False,1433808000.0,0
1115,3,2015-06-10T00:00:00.000+0000,4676.0,357.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,5,6,61,-5,0,-47,2.0,1.0,0.0,2.0,0.0,0.0,2015,6,24,10,2,161,False,False,False,False,False,False,1433894400.0,0
1115,4,2015-06-11T00:00:00.000+0000,5216.0,380.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,6,7,62,-4,0,-46,1.0,0.0,0.0,3.0,0.0,0.0,2015,6,24,11,3,162,False,False,False,False,False,False,1433980800.0,0
1115,5,2015-06-12T00:00:00.000+0000,5315.0,378.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,7,8,63,-3,0,-45,0.0,0.0,0.0,4.0,0.0,0.0,2015,6,24,12,4,163,False,False,False,False,False,False,1434067200.0,0
1115,6,2015-06-13T00:00:00.000+0000,7736.0,503.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,8,9,64,-2,0,-44,0.0,0.0,0.0,5.0,0.0,0.0,2015,6,24,13,5,164,False,False,False,False,False,False,1434153600.0,0


Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Id,AfterPromo,AfterStateHoliday,AfterSchoolHoliday,BeforePromo,BeforeStateHoliday,BeforeSchoolHoliday,Promo_bw,StateHoliday_bw,SchoolHoliday_bw,Promo_fw,StateHoliday_fw,SchoolHoliday_fw,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,CompetitionDistance_na
1,1,2015-04-27T00:00:00.000+0000,5575.0,574.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,21,17,0,-4,-91,1.0,0.0,0.0,5.0,1.0,0.0,2015,4,18,27,0,117,False,False,False,False,False,False,1430092800.0,0
1,2,2015-04-28T00:00:00.000+0000,5199.0,552.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,22,18,0,-3,-90,2.0,0.0,0.0,5.0,1.0,0.0,2015,4,18,28,1,118,False,False,False,False,False,False,1430179200.0,0
1,3,2015-04-29T00:00:00.000+0000,5775.0,579.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,23,19,0,-2,-89,3.0,0.0,0.0,5.0,1.0,0.0,2015,4,18,29,2,119,False,False,False,False,False,False,1430265600.0,0
1,4,2015-04-30T00:00:00.000+0000,6228.0,650.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,24,20,0,-1,-88,4.0,0.0,0.0,5.0,1.0,0.0,2015,4,18,30,3,120,True,False,False,False,False,False,1430352000.0,0
1,6,2015-05-02T00:00:00.000+0000,5850.0,653.0,1.0,False,False,False,c,a,1270.0,9.0,2008.0,0,,,,,1,1,22,-2,-12,-86,5.0,1.0,0.0,5.0,0.0,0.0,2015,5,18,2,5,122,False,False,False,False,False,False,1430524800.0,0


Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Id,AfterPromo,AfterStateHoliday,AfterSchoolHoliday,BeforePromo,BeforeStateHoliday,BeforeSchoolHoliday,Promo_bw,StateHoliday_bw,SchoolHoliday_bw,Promo_fw,StateHoliday_fw,SchoolHoliday_fw,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,CompetitionDistance_na
1115,2,2015-06-09T00:00:00.000+0000,5119.0,363.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,4,5,60,-6,0,-48,3.0,1.0,0.0,1.0,0.0,0.0,2015,6,24,9,1,160,False,False,False,False,False,False,1433808000.0,0
1115,3,2015-06-10T00:00:00.000+0000,4676.0,357.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,5,6,61,-5,0,-47,2.0,1.0,0.0,2.0,0.0,0.0,2015,6,24,10,2,161,False,False,False,False,False,False,1433894400.0,0
1115,4,2015-06-11T00:00:00.000+0000,5216.0,380.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,6,7,62,-4,0,-46,1.0,0.0,0.0,3.0,0.0,0.0,2015,6,24,11,3,162,False,False,False,False,False,False,1433980800.0,0
1115,5,2015-06-12T00:00:00.000+0000,5315.0,378.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,7,8,63,-3,0,-45,0.0,0.0,0.0,4.0,0.0,0.0,2015,6,24,12,4,163,False,False,False,False,False,False,1434067200.0,0
1115,6,2015-06-13T00:00:00.000+0000,7736.0,503.0,1.0,False,False,False,d,c,5350.0,,,1,22.0,2012.0,"Mar,Jun,Sept,Dec",,8,9,64,-2,0,-44,0.0,0.0,0.0,5.0,0.0,0.0,2015,6,24,13,5,164,False,False,False,False,False,False,1434153600.0,0


Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Id,AfterPromo,AfterStateHoliday,AfterSchoolHoliday,BeforePromo,BeforeStateHoliday,BeforeSchoolHoliday,Promo_bw,StateHoliday_bw,SchoolHoliday_bw,Promo_fw,StateHoliday_fw,SchoolHoliday_fw,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,CompetitionDistance_na
1,1,2015-06-15T00:00:00.000+0000,5518.0,586.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,11,66,0,0,-42,1.0,0.0,0.0,5.0,0.0,0.0,2015,6,25,15,0,166,False,False,False,False,False,False,1434326400.0,0
1,2,2015-06-16T00:00:00.000+0000,4852.0,503.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,12,67,0,0,-41,2.0,0.0,0.0,4.0,0.0,0.0,2015,6,25,16,1,167,False,False,False,False,False,False,1434412800.0,0
1,3,2015-06-17T00:00:00.000+0000,4000.0,476.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,13,68,0,0,-40,3.0,0.0,0.0,3.0,0.0,0.0,2015,6,25,17,2,168,False,False,False,False,False,False,1434499200.0,0
1,4,2015-06-18T00:00:00.000+0000,4645.0,498.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,14,69,0,0,-39,4.0,0.0,0.0,2.0,0.0,0.0,2015,6,25,18,3,169,False,False,False,False,False,False,1434585600.0,0
1,5,2015-06-19T00:00:00.000+0000,4202.0,487.0,1.0,True,False,False,c,a,1270.0,9.0,2008.0,0,,,,,0,15,70,0,0,-38,5.0,0.0,0.0,1.0,0.0,0.0,2015,6,25,19,4,170,False,False,False,False,False,False,1434672000.0,0


In [None]:
# Name of the column that every model below is trained to predict.
TARGET = 'Sales'
# Identifier handed to the model wrappers as `target_transformer`
# (presumably the target is modelled on a log1p scale — verify in the wrapper classes).
target_transformer = 'log1p'
# Length of the forecast window in days; the validation and test windows of the
# split above each span 48 calendar days.
forecast_horizon = 48

## 3.1 LightGBM

In [None]:
# Base hyper-parameters for the LightGBM point-prediction model. They are reused
# by the bootstrapping section further down.
lightgbm_params = {
    "boosting_type": 'gbdt',
    "objective": 'regression',
    "n_jobs": -1, 
    "min_split_gain": 0.0,
    "min_data_in_leaf": 1,
    "max_bin": 1024,
    "num_leaves": 64, 
    "max_depth": -1,
    "learning_rate": 0.1,
    # Upper bound on boosting rounds; the effective number is chosen by early stopping below.
    "n_estimators": 1000,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 1, 
    "seed": 1,
    "lambda": 1,
}

# Passed to the first fit as the "early_stopping_round" parameter.
early_stopping_round = 20

# The timer covers both fits, the prediction and the metric computation.
start_time = time.perf_counter()
    
# Step 1: fit on the pre-validation data (`train_val_df`) with early stopping on `valid_df`
lightgbm_reg = LightGBM(vectorizer_with_nan, target_transformer=target_transformer)
lightgbm_fit_params = {**lightgbm_params, "early_stopping_round": early_stopping_round}
lightgbm_reg.fit(train_val_df, TARGET, X_val=valid_df, y_val=valid_df[TARGET], params=lightgbm_fit_params, verbose=True)
lightgbm_best_iteration = lightgbm_reg.best_iteration
print("Early stopping performed. Best iteration:", lightgbm_best_iteration)

# Step 2: refit on train+validation data (`train_df`) with n_estimators fixed to the best iteration
lightgbm_full_train_reg = LightGBM(vectorizer_with_nan, target_transformer=target_transformer)
lightgbm_full_train_params = {**lightgbm_params, "n_estimators": lightgbm_best_iteration}
lightgbm_full_train_reg.fit(train_df, TARGET, params=lightgbm_full_train_params, verbose=True)

# Step 3: predict on the test set with the fully trained model and compute its metrics
lightgbm_pred = lightgbm_full_train_reg.predict(test_df)
lightgbm_metrics = lightgbm_full_train_reg.metrics(test_df[TARGET], lightgbm_pred)

# Store the elapsed wall-clock time (seconds, rounded to 2 decimals) next to the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
lightgbm_metrics['time'] = full_time

e4bdf26cf59b40ddb18f8ff20915cf5a
[1]	valid_0's l2: 0.160617
[2]	valid_0's l2: 0.153625
[3]	valid_0's l2: 0.14661
[4]	valid_0's l2: 0.141007
[5]	valid_0's l2: 0.135964
[6]	valid_0's l2: 0.130492
[7]	valid_0's l2: 0.127126
[8]	valid_0's l2: 0.123155
[9]	valid_0's l2: 0.119451
[10]	valid_0's l2: 0.115901
[11]	valid_0's l2: 0.114463
[12]	valid_0's l2: 0.111633
[13]	valid_0's l2: 0.108981
[14]	valid_0's l2: 0.107544
[15]	valid_0's l2: 0.105766
[16]	valid_0's l2: 0.104299
[17]	valid_0's l2: 0.102308
[18]	valid_0's l2: 0.0991005
[19]	valid_0's l2: 0.0977477
[20]	valid_0's l2: 0.0955909
[21]	valid_0's l2: 0.0926906
[22]	valid_0's l2: 0.0918178
[23]	valid_0's l2: 0.0909734
[24]	valid_0's l2: 0.0886536
[25]	valid_0's l2: 0.0873099
[26]	valid_0's l2: 0.0863762
[27]	valid_0's l2: 0.0858924
[28]	valid_0's l2: 0.0836161
[29]	valid_0's l2: 0.0815558
[30]	valid_0's l2: 0.0810937
[31]	valid_0's l2: 0.0805799
[32]	valid_0's l2: 0.0785332
[33]	valid_0's l2: 0.0779857
[34]	valid_0's l2: 0.0763855
[35]	val

In [None]:
# Show the LightGBM test-set metrics, including the 'time' entry added in the previous cell.
lightgbm_metrics

## 3.2 Bootstrapping (Data Sampling)

In [None]:
# fitting model with parameters obtained from early stopping with train_val set
n_resamples = 100
case_resampling_results = np.zeros(shape = (len(test_df), n_resamples))

start_time = time.perf_counter()

train_temp = train_df.reset_index(drop=True).copy()
for i in range(n_resamples):
    print(f"resampling i: {i+1}")

    if i == 0:
        train_resampled = train_temp
    else:
        train_resampled = train_temp.iloc[random.choices(list(range(len(train_temp))), k = len(train_temp))].reset_index(drop=True)

    lightgbm_full_train_params = {**lightgbm_params, "n_estimators": lightgbm_best_iteration}
    lgb_model = LightGBM(vectorizer_with_nan, target_transformer=target_transformer)
    lgb_model.fit(train_resampled, TARGET, params=lightgbm_full_train_params, verbose=True)

    # predicting on test set with our fully trained model
    predictions = lgb_model.predict(test_df)
    case_resampling_results[:, i] = predictions[PredEnum.POINT_ESTIMATES]

samples = case_resampling_results
quantiles = np.concatenate((np.quantile(case_resampling_results, q = 0.1, axis = 1)[:,np.newaxis], np.quantile(case_resampling_results, q = 0.9, axis = 1)[:,np.newaxis]), axis = 1)
point_pred = case_resampling_results.mean(axis = 1)
    
bootstrap_metrics = {}
bootstrap_metrics['rmse'] = Model.rmse(test_df[TARGET], point_pred)
bootstrap_metrics['rmspe'] = Model.rmspe(test_df[TARGET], point_pred)
bootstrap_metrics['avg_interval_length'] = Model.avg_interval_length(quantiles)
bootstrap_metrics['sharpness'] = Model.avg_interval_length(quantiles)
bootstrap_metrics['coverage'] = Model.coverage(test_df[TARGET], quantiles)
bootstrap_metrics['crps'] = Model.crps(test_df[TARGET], case_resampling_results)
bootstrap_metrics['nll_from_samples'] = Model.neg_log_likelihood_with_kde(np.array(test_df[TARGET]), case_resampling_results)

end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
bootstrap_metrics['time'] = full_time

d84b31d6869042bead2e53169b7939b1
resampling i: 1
Elapsed time for fitting LightGBM model: 20.53 s
resampling i: 2
Elapsed time for fitting LightGBM model: 27.93 s
resampling i: 3
Elapsed time for fitting LightGBM model: 28.12 s
resampling i: 4
Elapsed time for fitting LightGBM model: 27.48 s
resampling i: 5
Elapsed time for fitting LightGBM model: 27.2 s
resampling i: 6
Elapsed time for fitting LightGBM model: 27.5 s
resampling i: 7
Elapsed time for fitting LightGBM model: 27.46 s
resampling i: 8
Elapsed time for fitting LightGBM model: 30.21 s
resampling i: 9
Elapsed time for fitting LightGBM model: 28.22 s
resampling i: 10
Elapsed time for fitting LightGBM model: 30.66 s
resampling i: 11
Elapsed time for fitting LightGBM model: 27.7 s
resampling i: 12
Elapsed time for fitting LightGBM model: 27.12 s
resampling i: 13
Elapsed time for fitting LightGBM model: 26.81 s
resampling i: 14
Elapsed time for fitting LightGBM model: 27.18 s
resampling i: 15
Elapsed time for fitting LightGBM mode

In [None]:
# Show the bootstrap metrics, including the 'time' entry added in the previous cell.
bootstrap_metrics

Out[26]: {'rmse': 962.3463783666828,
 'rmspe': 0.12462515780224556,
 'avg_interval_length': 1258.5053691365442,
 'sharpness': 1258.5053691365442,
 'coverage': 0.5626580071484614,
 'crps': 492.8727313829994,
 'nll_from_samples': 10.453515299090732,
 'time': 21.25}

## 3.3 Quantile Regression

In [None]:
# Hyper-parameters for the LightGBM quantile-regression models; compared with the
# point-prediction setup only the objective differs ('quantile').
lightgbm_quant_params = {
    "boosting_type": 'gbdt',
    "objective": 'quantile',
    "n_jobs": -1, 
    "min_split_gain": 0.0,
    "min_data_in_leaf": 1,
    "max_bin": 1024,
    "num_leaves": 64, 
    "max_depth": -1,
    "learning_rate": 0.1,
    # Upper bound on boosting rounds; the effective number is chosen by early stopping below.
    "n_estimators": 1000,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "seed": 1,
    "lambda": 1,
    "bagging_freq": 1,  
}

# Passed to the first fit as the "early_stopping_round" parameter.
early_stopping_round = 20
# Quantile levels to fit. Note: this rebinds the notebook-global name `quantiles`.
quantiles = [0.1, 0.5, 0.9]

# The timer covers both fits, the prediction and the metric computation.
start_time = time.perf_counter()
    
# Step 1: fit on the pre-validation data (`train_val_df`) with early stopping on `valid_df`
lightgbm_quant_reg = LightGBMQuantileRegressor(vectorizer_with_nan, target_transformer=target_transformer)
lightgbm_quant_fit_params = {**lightgbm_quant_params, "early_stopping_round": early_stopping_round}
lightgbm_quant_reg.fit(train_val_df, TARGET, X_val=valid_df, y_val=valid_df[TARGET], params=lightgbm_quant_fit_params, quantiles=quantiles, verbose=True)
# `best_iterations` is a mapping; its values are averaged and truncated to an int
# to obtain one shared number of trees.
lightgbm_quant_best_iteration = int(np.mean(list(lightgbm_quant_reg.best_iterations.values())))
print("Early stopping performed. Best iteration:", lightgbm_quant_best_iteration)

# Step 2: refit on train+validation data (`train_df`) with n_estimators fixed to the averaged best iteration
lightgbm_full_train_quant_reg = LightGBMQuantileRegressor(vectorizer_with_nan, target_transformer=target_transformer)
lightgbm_quant_full_train_params = {**lightgbm_quant_params, "n_estimators": lightgbm_quant_best_iteration}
lightgbm_full_train_quant_reg.fit(train_df, TARGET, params=lightgbm_quant_full_train_params, quantiles=quantiles, verbose=True)
# NOTE(review): this copies the early-stopping `best_iterations` onto the refitted
# model although it was trained with the averaged n_estimators — confirm how
# `predict` uses this attribute.
lightgbm_full_train_quant_reg.best_iterations = lightgbm_quant_reg.best_iterations

# Step 3: predict on the test set with the fully trained model and compute its metrics
lightgbm_quant_pred = lightgbm_full_train_quant_reg.predict(test_df)
lightgbm_quant_metrics = lightgbm_full_train_quant_reg.metrics(test_df[TARGET], lightgbm_quant_pred, confidence_interval_quantiles=[0.1, 0.9])

# Store the elapsed wall-clock time (seconds, rounded to 2 decimals) next to the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
lightgbm_quant_metrics['time'] = full_time

eaf1526970d94c5c9134213a2337fa94
[1]	valid_0's quantile: 0.073891
[2]	valid_0's quantile: 0.0720418
[3]	valid_0's quantile: 0.070186
[4]	valid_0's quantile: 0.0687681
[5]	valid_0's quantile: 0.0676141
[6]	valid_0's quantile: 0.0663061
[7]	valid_0's quantile: 0.0651943
[8]	valid_0's quantile: 0.0641024
[9]	valid_0's quantile: 0.0631393
[10]	valid_0's quantile: 0.0622466
[11]	valid_0's quantile: 0.0614045
[12]	valid_0's quantile: 0.0606769
[13]	valid_0's quantile: 0.0600169
[14]	valid_0's quantile: 0.0595511
[15]	valid_0's quantile: 0.0589853
[16]	valid_0's quantile: 0.0583849
[17]	valid_0's quantile: 0.0578707
[18]	valid_0's quantile: 0.0570253
[19]	valid_0's quantile: 0.0565005
[20]	valid_0's quantile: 0.0561017
[21]	valid_0's quantile: 0.0556577
[22]	valid_0's quantile: 0.0552979
[23]	valid_0's quantile: 0.0548717
[24]	valid_0's quantile: 0.0542067
[25]	valid_0's quantile: 0.0538729
[26]	valid_0's quantile: 0.0534963
[27]	valid_0's quantile: 0.0530041
[28]	valid_0's quantile: 0.052837

## 3.4 NGBoost

### 3.4.1 NGBoost with NLL

In [None]:
# Parameters of the base learner handed to NGBoost: a LightGBM regressor with
# `boosting='rf'` and a single estimator.
# NOTE(review): bagging_fraction/bagging_freq are presumably set because LightGBM's
# random-forest mode requires bagging to be enabled — confirm against the LightGBM docs.
ngboost_base_params = {
    'boosting': 'rf',
    'n_estimators': 1,
    'bagging_fraction': 0.99,
    'bagging_freq': 1 
}

learner = LGBMRegressor(**ngboost_base_params)

# NGBoost settings: scored with LogScore, using the learner above as `Base`.
ngboost_nll_params = {'Score':LogScore, 
            'Base':learner, 
            'natural_gradient':True,
            "learning_rate": 0.1,
            # Upper bound on boosting rounds; the effective number is chosen by early stopping below.
            "n_estimators": 1000,
            "col_sample": 0.7, 
            "minibatch_frac": 0.7, 
            "random_state": 1, 
                 } 

# Passed to the first model as "early_stopping_rounds".
early_stopping_round = 20
# Quantile levels requested at prediction time. Note: this rebinds the notebook-global name `quantiles`.
quantiles = [0.05, 0.1, 0.5, 0.9, 0.95]

# The timer covers both fits, the prediction and the metric computation.
start_time = time.perf_counter()
    
# Step 1: fit on the pre-validation data (`train_val_df`) with early stopping on `valid_df`
ngboost_nll_early_stopping_params = {**ngboost_nll_params, "early_stopping_rounds": early_stopping_round}
ngboost_nll_reg = NGBoost(vectorizer_without_nan, target_transformer=target_transformer, distribution=DistEnum.NORMAL, **ngboost_nll_early_stopping_params)
ngboost_nll_reg.fit(train_val_df, TARGET, X_val=valid_df, y_val=np.array(valid_df[TARGET]), verbose=True)
ngboost_nll_best_iteration = ngboost_nll_reg.best_iteration
print("Early stopping performed. Best iteration:", ngboost_nll_best_iteration)

# Step 2: refit on train+validation data (`train_df`) with n_estimators fixed to the best iteration
ngboost_nll_full_train_params = {**ngboost_nll_params, "n_estimators": ngboost_nll_best_iteration}
ngboost_nll_full_train_reg = NGBoost(vectorizer_without_nan, target_transformer=target_transformer, distribution=DistEnum.NORMAL, **ngboost_nll_full_train_params)
ngboost_nll_full_train_reg.fit(train_df, TARGET, verbose=True)
    
# Step 3: predict on the test set, requesting point estimates, quantiles, samples
# (sample_size=400) and distribution parameters
ngboost_nll_pred = ngboost_nll_full_train_reg.predict(test_df, quantiles=quantiles, prediction_types=[PredEnum.POINT_ESTIMATES, PredEnum.QUANTILES, PredEnum.SAMPLES, PredEnum.DISTRIBUTION_PARAMS], sample_size=400)

# Metrics are computed with the 0.1 and 0.9 quantiles as confidence-interval bounds.
ngboost_nll_metrics = ngboost_nll_full_train_reg.metrics(np.array(test_df[TARGET]), ngboost_nll_pred, confidence_interval_quantiles=[0.1,0.9])

# Store the elapsed wall-clock time (seconds, rounded to 2 decimals) next to the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
ngboost_nll_metrics['time'] = full_time

8682d5d5845b4100a66206c9ee356542
[iter 0] loss=0.5667 val_loss=0.5267 scale=1.0000 norm=0.6523


alue: boosting=rf
Elapsed time for fitting NGBoost model: 1185.85 s
  nll_temp = torch.tensor([-dist[i].log_prob(torch.tensor(y_test[i])) for i in range(len(dist))])
path: predictions/pointpredictions_ngboost.csv
dirname: predictions
filename: pointpredictions_ngboost.csv
artifact_path: predictions
path: /tmp/tmp9whtxew8/pointpredictions_ngboost.csv
tmp_path: /tmp/tmp9whtxew8/pointpredictions_ngboost.csv
path: predictions/quantiles_ngboost0.05.csv
dirname: predictions
filename: quantiles_ngboost0.05.csv
artifact_path: predictions
path: /tmp/tmp8gkt1z81/quantiles_ngboost0.05.csv
tmp_path: /tmp/tmp8gkt1z81/quantiles_ngboost0.05.csv
path: predictions/quantiles_ngboost0.1.csv
dirname: predictions
filename: quantiles_ngboost0.1.csv
artifact_path: predictions
path: /tmp/tmp1n6cgds9/quantiles_ngboost0.1.csv
tmp_path: /tmp/tmp1n6cgds9/quantiles_ngboost0.1.csv
path: predictions/quantiles_ngboost0.5.

In [None]:
# Display the metrics (including the 'time' entry) computed for the NGBoost NLL model.
ngboost_nll_metrics

### 3.4.2 NGBoost with CRPS

In [1]:
# NGBoost with the CRPS (CRPScore) scoring rule.
# Base learner: a LightGBM regressor configured with boosting='rf' and a single estimator.
ngboost_base_params = dict(
    boosting="rf",
    n_estimators=1,
    bagging_fraction=0.99,
    bagging_freq=1,
)
learner = LGBMRegressor(**ngboost_base_params)

ngboost_crps_params = dict(
    Score=CRPScore,
    Base=learner,
    natural_gradient=True,
    learning_rate=0.1,
    n_estimators=1000,
    col_sample=0.7,
    minibatch_frac=0.7,
    random_state=1,
)

quantiles = [0.05, 0.1, 0.5, 0.9, 0.95]
early_stopping_round = 20

# The timer covers both fits, prediction and metric computation.
start_time = time.perf_counter()

# Stage 1: fit on `train_val_df`, early stopping monitored on `valid_df`.
ngboost_crps_early_stopping_params = dict(
    ngboost_crps_params,
    early_stopping_rounds=early_stopping_round,
)
ngboost_crps_reg = NGBoost(
    vectorizer_without_nan,
    target_transformer=target_transformer,
    distribution=DistEnum.NORMAL,
    **ngboost_crps_early_stopping_params,
)
ngboost_crps_reg.fit(
    train_val_df,
    TARGET,
    X_val=valid_df,
    y_val=np.array(valid_df[TARGET]),
    verbose=True,
)
ngboost_crps_best_iteration = ngboost_crps_reg.best_iteration
print("Early stopping performed. Best iteration:", ngboost_crps_best_iteration)

# Stage 2: refit on `train_df` with the number of estimators fixed to the best iteration.
# NOTE(review): `train_df` is presumably train+validation combined - confirm against the split cell.
ngboost_crps_full_train_params = dict(
    ngboost_crps_params,
    n_estimators=ngboost_crps_best_iteration,
)
ngboost_crps_full_train_reg = NGBoost(
    vectorizer_without_nan,
    target_transformer=target_transformer,
    distribution=DistEnum.NORMAL,
    **ngboost_crps_full_train_params,
)
ngboost_crps_full_train_reg.fit(train_df, TARGET, verbose=True)

# Stage 3: predict on the test set with the refitted model and score the predictions.
ngboost_crps_pred = ngboost_crps_full_train_reg.predict(
    test_df,
    quantiles=quantiles,
    prediction_types=[
        PredEnum.POINT_ESTIMATES,
        PredEnum.QUANTILES,
        PredEnum.SAMPLES,
        PredEnum.DISTRIBUTION_PARAMS,
    ],
    sample_size=400,
)
ngboost_crps_metrics = ngboost_crps_full_train_reg.metrics(
    np.array(test_df[TARGET]),
    ngboost_crps_pred,
    confidence_interval_quantiles=[0.1, 0.9],
)

# Store the total wall-clock time (seconds, rounded to 2 decimals) alongside the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
ngboost_crps_metrics["time"] = full_time

NameError: name 'LGBMRegressor' is not defined

## 3.5 PGBM

### 3.5.1 PGBM normal

In [None]:
# PGBM with a fixed normal output distribution (no distribution optimisation).
# Kept as a dict literal because the key "lambda" is a Python keyword.
pgbm_normal_params = {
    "derivatives": "exact",
    "distribution": "normal",
    "device": "gpu",
    "gpu_device_id": 0,
    "n_jobs": -1,
    "min_split_gain": 0.0,
    "min_data_in_leaf": 1,
    "max_bin": 1024,
    "max_leaves": 64,
    "max_depth": -1,
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "seed": 1,
    "lambda": 1,
}

quantiles = [0.05, 0.1, 0.5, 0.9, 0.95]
early_stopping_round = 20

# The timer covers both fits, prediction and metric computation.
start_time = time.perf_counter()

# Stage 1: fit on `train_val_df`, early stopping monitored on `valid_df`.
pgbm_normal_fit_params = dict(
    pgbm_normal_params,
    early_stopping_round=early_stopping_round,
)
pgbm_normal_reg = PGBM(vectorizer_without_nan, target_transformer=target_transformer)
pgbm_normal_reg.fit(
    train_val_df,
    TARGET,
    X_val=valid_df,
    y_val=np.array(valid_df[TARGET]),
    params=pgbm_normal_fit_params,
    apply_optimize_distribution=False,
    verbose=True,
)
pgbm_normal_best_iteration = pgbm_normal_reg.best_iteration
print("Early stopping performed. Best iteration:", pgbm_normal_best_iteration)

# Stage 2: refit on `train_df` with the number of estimators fixed to the best iteration.
# NOTE(review): `train_df` is presumably train+validation combined - confirm against the split cell.
pgbm_normal_full_fit_params = dict(
    pgbm_normal_params,
    n_estimators=pgbm_normal_best_iteration,
)
pgbm_normal_full_train_reg = PGBM(vectorizer_without_nan, target_transformer=target_transformer)
pgbm_normal_full_train_reg.fit(
    train_df,
    TARGET,
    params=pgbm_normal_full_fit_params,
    apply_optimize_distribution=False,
    verbose=True,
)

# Stage 3: predict on the test set with the refitted model and score the predictions.
pgbm_normal_pred = pgbm_normal_full_train_reg.predict(
    test_df,
    quantiles=quantiles,
    prediction_types=[
        PredEnum.POINT_ESTIMATES,
        PredEnum.QUANTILES,
        PredEnum.SAMPLES,
        PredEnum.DISTRIBUTION_PARAMS,
    ],
    sample_size=300,
)
pgbm_normal_metrics = pgbm_normal_full_train_reg.metrics(
    np.array(test_df[TARGET]),
    pgbm_normal_pred,
    confidence_interval_quantiles=[0.1, 0.9],
)

# Store the total wall-clock time (seconds, rounded to 2 decimals) alongside the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
pgbm_normal_metrics["time"] = full_time

6be1ae84f6cf442db89f8b3ab5eb5287
Training on GPU
Estimator 0/1000, Train metric: 0.4159, Validation metric: 0.4036
Estimator 1/1000, Train metric: 0.4064, Validation metric: 0.3944
Estimator 2/1000, Train metric: 0.3979, Validation metric: 0.3861
Estimator 3/1000, Train metric: 0.3906, Validation metric: 0.3795
Estimator 4/1000, Train metric: 0.3868, Validation metric: 0.3764
Estimator 5/1000, Train metric: 0.3807, Validation metric: 0.3705
Estimator 6/1000, Train metric: 0.3765, Validation metric: 0.3670
Estimator 7/1000, Train metric: 0.3710, Validation metric: 0.3618
Estimator 8/1000, Train metric: 0.3656, Validation metric: 0.3570
Estimator 9/1000, Train metric: 0.3606, Validation metric: 0.3524
Estimator 10/1000, Train metric: 0.3566, Validation metric: 0.3489
Estimator 11/1000, Train metric: 0.3530, Validation metric: 0.3461
Estimator 12/1000, Train metric: 0.3502, Validation metric: 0.3444
Estimator 13/1000, Train metric: 0.3486, Validation metric: 0.3436
Estimator 14/1000, Trai

In [None]:
# Display the metrics (including the 'time' entry) computed for the PGBM model with a normal distribution.
pgbm_normal_metrics

### 3.5.2 PGBM best dist

In [None]:
# PGBM with distribution optimisation: stage 1 searches for the best output
# distribution / tree correlation on the validation set, stage 2 refits, and
# the values found in stage 1 are applied to the refitted model before predicting.
pgbm_dist_params = {
    'derivatives': 'exact',
    'distribution': 'normal',
    'device': 'gpu',
    'gpu_device_id': 0,
    "n_jobs": -1,
    "min_split_gain": 0.0,
    "min_data_in_leaf": 1,
    "max_bin": 1024,
    "max_leaves": 64,
    "max_depth": -1,
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "seed": 1,
    "lambda": 1,
}

early_stopping_round = 20
quantiles = [0.05, 0.1, 0.5, 0.9, 0.95]

# The timer covers both fits, prediction and metric computation.
start_time = time.perf_counter()

# Stage 1: fit on `train_val_df` with early stopping on `valid_df`;
# apply_optimize_distribution=True enables the distribution search.
pgbm_dist_fit_params = {**pgbm_dist_params, "early_stopping_round": early_stopping_round}
pgbm_dist_reg = PGBM(vectorizer_without_nan, target_transformer=target_transformer)
pgbm_dist_reg.fit(train_val_df, TARGET, X_val=valid_df, y_val=np.array(valid_df[TARGET]), params=pgbm_dist_fit_params, apply_optimize_distribution=True, verbose=True)
best_distribution = pgbm_dist_reg.model.distribution
best_tree_correlation = pgbm_dist_reg.model.tree_correlation
best_iteration = pgbm_dist_reg.best_iteration
print("Early stopping performed. Best iteration:", best_iteration)
print("Best distribution found:", best_distribution)
print("Best tree correlation found:", best_tree_correlation)

# Stage 2: refit on `train_df` with the number of estimators fixed to the best iteration.
# The distribution search is not repeated here (apply_optimize_distribution=False).
pgbm_dist_full_fit_params = {**pgbm_dist_params, "n_estimators": best_iteration}
pgbm_dist_full_train_reg = PGBM(vectorizer_without_nan, target_transformer=target_transformer)
pgbm_dist_full_train_reg.fit(train_df, TARGET, params=pgbm_dist_full_fit_params, apply_optimize_distribution=False, verbose=True)

# Stage 3: carry over the distribution and tree correlation found in stage 1,
# then predict on the test set with the refitted model.
pgbm_dist_full_train_reg.model.distribution = best_distribution
pgbm_dist_full_train_reg.model.tree_correlation = best_tree_correlation
pgbm_dist_pred = pgbm_dist_full_train_reg.predict(test_df, quantiles=quantiles, prediction_types=[PredEnum.POINT_ESTIMATES, PredEnum.QUANTILES, PredEnum.SAMPLES, PredEnum.DISTRIBUTION_PARAMS], sample_size=300)

pgbm_dist_metrics = pgbm_dist_full_train_reg.metrics(np.array(test_df[TARGET]), pgbm_dist_pred, confidence_interval_quantiles=[0.1,0.9])

# Store the total wall-clock time (seconds, rounded to 2 decimals) alongside the metrics.
# (The earlier assignment from `fit_time` was removed: that name is not defined in
# this cell, so it either raised NameError on a fresh kernel or recorded a stale value.)
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
pgbm_dist_metrics['time'] = full_time

f49c673699f74b959224c30627c22a7e
Training on GPU
Estimator 0/1000, Train metric: 0.4159, Validation metric: 0.4036
Estimator 1/1000, Train metric: 0.4064, Validation metric: 0.3944
Estimator 2/1000, Train metric: 0.3979, Validation metric: 0.3861
Estimator 3/1000, Train metric: 0.3906, Validation metric: 0.3795
Estimator 4/1000, Train metric: 0.3868, Validation metric: 0.3764
Estimator 5/1000, Train metric: 0.3807, Validation metric: 0.3705
Estimator 6/1000, Train metric: 0.3765, Validation metric: 0.3670
Estimator 7/1000, Train metric: 0.3710, Validation metric: 0.3618
Estimator 8/1000, Train metric: 0.3656, Validation metric: 0.3570
Estimator 9/1000, Train metric: 0.3606, Validation metric: 0.3524
Estimator 10/1000, Train metric: 0.3566, Validation metric: 0.3489
Estimator 11/1000, Train metric: 0.3530, Validation metric: 0.3461
Estimator 12/1000, Train metric: 0.3502, Validation metric: 0.3444
Estimator 13/1000, Train metric: 0.3486, Validation metric: 0.3436
Estimator 14/1000, Trai

In [None]:
# Display the metrics (including the 'time' entry) computed for the PGBM model with distribution optimisation.
pgbm_dist_metrics

## 3.6 LSF

In [None]:
# LSF on top of the already trained LightGBM model from the earlier cell.
# Minimum bin size is set to (ln of the number of training rows) squared.
lsf_params = {'min_bin_size': np.log(len(train_df))**2}

base_estimator = lightgbm_full_train_reg.model
quantiles = [0.05, 0.1, 0.5, 0.9, 0.95]

start_time = time.perf_counter()

# No validation-based early stopping is applied for LSF, so the model is fitted
# only once, on the full training frame. model_trained=True is passed together
# with the pre-trained base estimator.
lsf_reg = LSF(vectorizer_with_nan, target_transformer=target_transformer, base_model=base_estimator, model_trained=True, **lsf_params)

lsf_reg.fit(train_df, TARGET, verbose=True)
# Elapsed seconds since start_time, measured right after fitting. The clock is read
# here instead of reusing `end_time`, which at this point still holds a value left
# over from a previously executed cell.
fit_time = np.round(time.perf_counter() - start_time, 2)

# predicting on test set with our fully trained model
lsf_pred = lsf_reg.predict(test_df, quantiles=quantiles, prediction_types=[PredEnum.POINT_ESTIMATES, PredEnum.QUANTILES, PredEnum.SAMPLES])
# Elapsed seconds since start_time, measured right after prediction (cumulative, includes fitting).
predict_time = np.round(time.perf_counter() - start_time, 2)
lsf_metrics = lsf_reg.metrics(np.array(test_df[TARGET]), lsf_pred, confidence_interval_quantiles=[0.1,0.9])

# Store the total wall-clock time (seconds, rounded to 2 decimals) alongside the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
lsf_metrics['time'] = full_time

2f045a3fab1a4950aa135c8030fa7a75
Elapsed time for fitting LSF model: 29.05 s
  samples = np.array([np.array(samples_for_one_input) for samples_for_one_input in samples])
path: predictions/pointpredictions_lsf_w_lightgbm.csv
dirname: predictions
filename: pointpredictions_lsf_w_lightgbm.csv
artifact_path: predictions
path: /tmp/tmp1rk_axs4/pointpredictions_lsf_w_lightgbm.csv
tmp_path: /tmp/tmp1rk_axs4/pointpredictions_lsf_w_lightgbm.csv
path: predictions/quantiles_lsf_w_lightgbm0.05.csv
dirname: predictions
filename: quantiles_lsf_w_lightgbm0.05.csv
artifact_path: predictions
path: /tmp/tmp4fuc8jbc/quantiles_lsf_w_lightgbm0.05.csv
tmp_path: /tmp/tmp4fuc8jbc/quantiles_lsf_w_lightgbm0.05.csv
path: predictions/quantiles_lsf_w_lightgbm0.1.csv
dirname: predictions
filename: quantiles_lsf_w_lightgbm0.1.csv
artifact_path: predictions
path: /tmp/tmpfd_yurcw/quantiles_lsf_w_lightgbm0.1.csv
tmp_path: /tmp/tmpfd_yurcw/quantiles_lsf_w_lightgbm0.1.csv
path: predictions/quantiles_lsf_w_lightgbm0.5.cs

## 3.7 TFT in PytorchFC

TFT requires a dataset without any gaps. Missing dates for some stores do not work with the existing implementation, and missing features in part of a sequence (i.e. some days exist for some time series but are missing in others) are also a problem for the encoder-decoder structure across different time series.

In [None]:
# Load the Rossmann frame that also contains zero-sales rows, ordered by store and
# date with a fresh 0..n-1 index.
# NOTE(review): absolute DBFS path, unlike the first data cell which builds its path
# from `ue_dir_path` - confirm this file is available in the target environment.
full_df_with_zero_sales = (
    pd.read_pickle("/dbfs/mnt/tum/data/kaggle/rossmann/rossmann_full_df_with_zero_sales.pickle")
    .sort_values(['Store', 'Date'])
    .reset_index(drop=True)
)

In [None]:
# Row count of the frame that includes zero-sales rows.
len(full_df_with_zero_sales)

Out[9]: 1050330

For this neural network model, the target is not only inferred from the features in the same row, but also from the features of "previous" rows, as these are encoded to better predict the upcoming values.
We therefore need to provide not only the rows corresponding to the target in our forecast horizon, but also the previous features and target values within the lookback length in order to predict. This means that we do not split our dataframe as we do for tabular data.
We create a `full_train_df` that contains information from the start until the holdout date (the train/validation split is done internally in the fit method) and a `full_test_df` that contains information from the start until the end of the holdout period.

In [None]:
# inputs required for neural network
full_train_df = full_df_with_zero_sales[full_df_with_zero_sales['Date'] < "20150614"].sort_values(['Store', 'Date'])
full_test_df = full_df_with_zero_sales.copy()
display(full_train_df.tail())

index,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Missing,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,State,file,week,trend,file_DE,week_DE,trend_DE,Date_DE,State_DE,Month_DE,Day_DE,Dayofweek_DE,Dayofyear_DE,Is_month_end_DE,Is_month_start_DE,Is_quarter_end_DE,Is_quarter_start_DE,Is_year_end_DE,Is_year_start_DE,Elapsed_DE,Max_TemperatureC,Mean_TemperatureC,Min_TemperatureC,Dew_PointC,MeanDew_PointC,Min_DewpointC,Max_Humidity,Mean_Humidity,Min_Humidity,Max_Sea_Level_PressurehPa,Mean_Sea_Level_PressurehPa,Min_Sea_Level_PressurehPa,Max_VisibilityKm,Mean_VisibilityKm,Min_VisibilitykM,Max_Wind_SpeedKm_h,Mean_Wind_SpeedKm_h,Max_Gust_SpeedKm_h,Precipitationmm,CloudCover,Events,WindDirDegrees,StateName,CompetitionOpenSince,CompetitionDaysOpen,CompetitionMonthsOpen,Promo2Since,Promo2Days,Promo2Weeks,AfterSchoolHoliday,BeforeSchoolHoliday,AfterStateHoliday,BeforeStateHoliday,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_fw,Promo_fw
1050277,1115,2,2015-06-09T00:00:00.000+0000,5119.0,363.0,1.0,0.0,False,0.0,0,2015,6,24,9,1,160,False,False,False,False,False,False,1433808000,d,c,5350.0,1,1900,1,22,2012,"Mar,Jun,Sept,Dec",HE,Rossmann_DE_HE,2015-06-14 - 2015-06-20,85,Rossmann_DE,2015-06-14 - 2015-06-20,82,2015-06-14T00:00:00.000+0000,,6,14,6,165,False,False,False,False,False,False,1434240000,20,16,12,7,6,4,67,48,29,1026,1025,1024,10.0,10.0,10.0,32,24,47.0,0.0,6.0,,22,Hessen,1900-01-15T00:00:00.000+0000,0,0,2012-05-28T00:00:00.000+0000,1107,25,60,-48,5,0,4,-6,0.0,1.0,3.0,0.0,0.0,1.0
1050278,1115,3,2015-06-10T00:00:00.000+0000,4676.0,357.0,1.0,0.0,False,0.0,0,2015,6,24,10,2,161,False,False,False,False,False,False,1433894400,d,c,5350.0,1,1900,1,22,2012,"Mar,Jun,Sept,Dec",HE,Rossmann_DE_HE,2015-06-14 - 2015-06-20,85,Rossmann_DE,2015-06-14 - 2015-06-20,82,2015-06-14T00:00:00.000+0000,,6,14,6,165,False,False,False,False,False,False,1434240000,21,17,12,11,8,6,67,53,39,1026,1024,1022,10.0,10.0,10.0,26,16,,0.0,6.0,,59,Hessen,1900-01-15T00:00:00.000+0000,0,0,2012-05-28T00:00:00.000+0000,1108,25,61,-47,6,0,5,-5,0.0,1.0,2.0,0.0,0.0,2.0
1050279,1115,4,2015-06-11T00:00:00.000+0000,5216.0,380.0,1.0,0.0,False,0.0,0,2015,6,24,11,3,162,False,False,False,False,False,False,1433980800,d,c,5350.0,1,1900,1,22,2012,"Mar,Jun,Sept,Dec",HE,Rossmann_DE_HE,2015-06-14 - 2015-06-20,85,Rossmann_DE,2015-06-14 - 2015-06-20,82,2015-06-14T00:00:00.000+0000,,6,14,6,165,False,False,False,False,False,False,1434240000,24,21,17,12,9,8,64,47,28,1022,1019,1015,10.0,10.0,10.0,23,14,,0.0,5.0,Rain,51,Hessen,1900-01-15T00:00:00.000+0000,0,0,2012-05-28T00:00:00.000+0000,1109,25,62,-46,7,0,6,-4,0.0,0.0,1.0,0.0,0.0,3.0
1050280,1115,5,2015-06-12T00:00:00.000+0000,5315.0,378.0,1.0,0.0,False,0.0,0,2015,6,24,12,4,163,False,False,False,False,False,False,1434067200,d,c,5350.0,1,1900,1,22,2012,"Mar,Jun,Sept,Dec",HE,Rossmann_DE_HE,2015-06-14 - 2015-06-20,85,Rossmann_DE,2015-06-14 - 2015-06-20,82,2015-06-14T00:00:00.000+0000,,6,14,6,165,False,False,False,False,False,False,1434240000,31,22,14,16,12,9,78,54,25,1015,1012,1009,31.0,15.0,10.0,40,11,58.0,0.0,5.0,Rain,42,Hessen,1900-01-15T00:00:00.000+0000,0,0,2012-05-28T00:00:00.000+0000,1110,25,63,-45,8,0,7,-3,0.0,0.0,0.0,0.0,0.0,4.0
1050281,1115,6,2015-06-13T00:00:00.000+0000,7736.0,503.0,1.0,0.0,False,0.0,0,2015,6,24,13,5,164,False,False,False,False,False,False,1434153600,d,c,5350.0,1,1900,1,22,2012,"Mar,Jun,Sept,Dec",HE,Rossmann_DE_HE,2015-06-14 - 2015-06-20,85,Rossmann_DE,2015-06-14 - 2015-06-20,82,2015-06-14T00:00:00.000+0000,,6,14,6,165,False,False,False,False,False,False,1434240000,26,21,17,17,14,9,94,65,27,1011,1010,1008,31.0,13.0,10.0,29,11,39.0,0.0,6.0,Rain,240,Hessen,1900-01-15T00:00:00.000+0000,0,0,2012-05-28T00:00:00.000+0000,1111,25,64,-44,9,0,8,-2,0.0,0.0,0.0,0.0,0.0,5.0


In [None]:
# Feature groups handed to the TFT model.
static_cat_vars = [
    'Store',
    'StoreType',
    'CompetitionOpenSinceYear',
    'Promo2SinceYear',
    'Assortment',
]
dynamic_cat_vars = [
    'DayOfWeek',
    'Year',
    'Month',
    'Day',
    'StateHoliday',
    'PromoInterval',
    'Week',
    'Promo_fw',
    'Promo_bw',
    'StateHoliday_fw',
    'StateHoliday_bw',
    'SchoolHoliday_fw',
    'SchoolHoliday_bw',
]
cont_vars = [
    'AfterStateHoliday',
    'BeforeStateHoliday',
    'Promo',
    'SchoolHoliday',
]

# Cast every categorical feature (static and dynamic) to str in both frames.
full_train_df = full_train_df.astype({col: str for col in static_cat_vars + dynamic_cat_vars})
full_test_df = full_test_df.astype({col: str for col in static_cat_vars + dynamic_cat_vars})

# Add the per-store time index (the captured output names the new column "time_index_tft").
full_train_df = TFTPytorchFC.add_time_idx_to_df(X=full_train_df, group_ids="Store")
full_test_df = TFTPytorchFC.add_time_idx_to_df(X=full_test_df, group_ids="Store")

# Extract the ground-truth targets for the forecast horizon from the test frame.
tft_y_test = TFTPytorchFC.obtain_y_test_out_of_X_test(
    X_test=full_test_df,
    forecast_horizon=forecast_horizon,
    time_idx="time_index_tft",
    target=TARGET,
    group_ids="Store",
)

Time index called "time_index_tft" added to provided dataframe
Time index called "time_index_tft" added to provided dataframe


In [None]:
from pytorch_lightning import Trainer as Lightning_Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_forecasting.data.encoders import TorchNormalizer, NaNLabelEncoder

# Hyper-parameters forwarded to the TFT network via `params_tft`.
tft_pytorch_params = {
    'hidden_size': 240, 
    'lstm_layers': 2, 
    'dropout': 0.1, 
    'attention_head_size': 4,
    'learning_rate': 0.001, 
    'log_interval': -1,
    'log_val_interval': -1,
    'reduce_on_plateau_patience': 1000,  
}

# Arguments for the PyTorch Lightning trainer.
trainer_params = {'max_epochs': 100,
                  'accelerator': 'gpu',
                  'devices': 1,
                  'limit_train_batches': 100, 
                  'gradient_clip_algorithm': 'norm', 
                  'gradient_clip_val': 100 
                 }

params_dataloader = {
    'num_workers': 8,
    'batch_size': 128
}

# The lookback window is three forecast horizons long.
lookback = forecast_horizon*3

# Stop training once `val_loss` has not improved for 10 consecutive validation checks.
trainer_params['callbacks'] = EarlyStopping(monitor="val_loss", patience=10, mode="min")
lightning_trainer = Lightning_Trainer(**trainer_params)

# The timer covers fitting, prediction and metric computation.
start_time = time.perf_counter()
    
tft_reg = TFTPytorchFC(lookback=lookback, forecast_horizon=forecast_horizon, time_idx="time_index_tft", group_ids=["Store"], static_categoricals=static_cat_vars, time_varying_known_categoricals=dynamic_cat_vars, time_varying_known_reals=cont_vars,  time_varying_unknown_reals = [TARGET], quantiles = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])
    
tft_model_trained = tft_reg.fit(full_train_df, TARGET, lightning_trainer=lightning_trainer, params_tft=tft_pytorch_params, params_dataloader=params_dataloader, params_dataset_creation={}, verbose = True)
# Elapsed seconds since start_time, measured right after fitting. The clock is read
# here instead of reusing `end_time`, which at this point still holds a value left
# over from a previously executed cell.
tft_time = np.round(time.perf_counter() - start_time, 2)

tft_pred = tft_reg.predict(full_test_df, prediction_types=[PredEnum.POINT_ESTIMATES, PredEnum.QUANTILES])
tft_metrics = tft_reg.metrics(tft_y_test, tft_pred, confidence_interval_quantiles=[0.1,0.9])


# Evaluate MAPE & RMSPE without zero values as in Kaggle competition.
# Both arrays are flattened to one column of (number of stores * forecast_horizon) rows.
# The shape is passed positionally because the `newshape=` keyword is deprecated in
# recent NumPy releases; the positional form works in all versions.
tft_flat_shape = (full_train_df['Store'].nunique()*forecast_horizon, 1)
y_test = np.reshape(tft_y_test, tft_flat_shape)
predictions = np.reshape(tft_pred[PredEnum.POINT_ESTIMATES], tft_flat_shape)
indices_nonzero = np.where(y_test!=0)
# Take only entries which have no zeros in ground truth
y_test = y_test[indices_nonzero]
predictions = predictions[indices_nonzero]
tft_metrics['rmspe_only_nonzero'] = np.sqrt(np.mean(np.square((y_test - predictions) / (y_test))))
tft_metrics['mape_only_nonzero'] = mean_absolute_percentage_error(y_test, predictions)

# Store the total wall-clock time (seconds, rounded to 2 decimals) alongside the metrics.
end_time = time.perf_counter()
full_time = np.round(end_time - start_time, 2)
tft_metrics['time'] = full_time

04f0f472ed284b2684f2f297c8e6e395
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(
Missing logger folder: /databricks/driver/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 92.0 K
3  | prescalers                         | ModuleDict                      | 96    
4  | static_variable_selection          | VariableSelectionNetwork        | 4.2 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 51.5 K
6  | deco

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Elapsed time for fitting TFTPytorchFC model: 28962.81 s
path: models/model_tft.pkl
dirname: models
filename: model_tft.pkl
artifact_path: models
path: /tmp/tmp7_qtmds1/model_tft.pkl
tmp_path: /tmp/tmp7_qtmds1/model_tft.pkl
path: models/trainer_tft.pkl
dirname: models
filename: trainer_tft.pkl
artifact_path: models
path: /tmp/tmpkxyqz8h6/trainer_tft.pkl
tmp_path: /tmp/tmpkxyqz8h6/trainer_tft.pkl
path: predictions/pointpredictions_tft.csv
dirname: predictions
filename: pointpredictions_tft.csv
artifact_path: predictions
path: /tmp/tmp5o_nzck5/pointpredictions_tft.csv
tmp_path: /tmp/tmp5o_nzck5/pointpredictions_tft.csv
path: predictions/groundtruth_tft.csv
dirname: predictions
filename: groundtruth_tft.csv
artifact_path: predictions
path: /tmp/tmpu0xjeenx/groundtruth_tft.csv
tmp_path: /tmp/tmpu0xjeenx/groundtruth_tft.csv
path: predictions/quantiles_tft0.05.csv
dirname: predictions
filename: quantiles_tft0.05.csv
artifact_path: predictions
path: /tmp/tmppnmju156/quantiles_tft0.05.csv
tmp_p

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-4489943609334959>[0m in [0;36m<cell line: 6>[0;34m()[0m
[1;32m     52[0m     [0mmlflow[0m[0;34m.[0m[0mlog_metrics[0m[0;34m([0m[0mtft_metrics[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     53[0m [0;34m[0m[0m
[0;32m---> 54[0;31m     [0mmlflow[0m[0;34m.[0m[0mset_tag[0m[0;34m([0m[0;34m"number_of_rows_train"[0m[0;34m,[0m [0mlen[0m[0;34m([0m[0msmaller_train_df[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     55[0m     [0mmlflow[0m[0;34m.[0m[0mset_tag[0m[0;34m([0m[0;34m"number_of_entities"[0m[0;34m,[0m [0msmaller_train_df[0m[0;34m[[0m[0;34m'Store'[0m[0;34m][0m[0;34m.[0m[0mnunique[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     56[0m     [0mmlflow[0m[0;34m.[0m[0mset_tag[0m[0;34m([

In [None]:
# Display the `tft_metrics` object produced by the TFT training cell.
tft_metrics

Out[38]: {<PredEnum.POINT_ESTIMATES: 'point_estimates'>: array([[3.3465129e-01, 7.0914424e+03, 5.5127896e+03, ..., 5.7591460e+03,
         6.3972549e+03, 6.3521089e+03],
        [3.5204864e-01, 8.2911348e+03, 6.7564336e+03, ..., 6.8420571e+03,
         7.2872529e+03, 7.5049224e+03],
        [3.5193133e-01, 1.2279685e+04, 9.4032070e+03, ..., 1.0585309e+04,
         1.1704678e+04, 1.2021682e+04],
        ...,
        [3.4777778e-01, 8.1329141e+03, 6.1861294e+03, ..., 6.8212578e+03,
         7.5689517e+03, 7.3007705e+03],
        [3.2237825e-01, 6.8060566e+03, 5.4586694e+03, ..., 5.6836724e+03,
         6.0889873e+03, 6.1645010e+03],
        [3.7114045e-01, 1.4601336e+04, 1.1182717e+04, ..., 1.2082696e+04,
         1.3536629e+04, 1.3765246e+04]], dtype=float32),
 <PredEnum.QUANTILES: 'quantiles'>: {0.05: array([[1.6902703e-01, 5.8649287e+03, 4.6701143e+03, ..., 4.7916885e+03,
          5.3095605e+03, 5.2118535e+03],
         [2.1492589e-01, 6.8672153e+03, 5.7575532e+03, ..., 5.8242168e+03