# NN: Multi Time series
- Although LGBM is easy to implement, it
    - cannot leverage exogenous categorical features (holiday, weekday)
    - cannot learn the relationship between exogenous features over the whole time series x[1,w] + y[i]
- This notebook demonstrates how to handle multi-time-series (multi-horizon) forecasting with neural networks (NN) and checks whether they can overcome the above shortcomings of LGBM
    - DeepAR
    - N-BEATS

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import statsmodels.api as sm

from evaluation import Metric, Evaluation
from plotting import Plot, ModelEvaluation
from time_series import TsHandler
from preprocess import Preprocessor, Dataset, Scaler
from multi_horizon import Prediction
from config.get_config import ConfigHandler

import warnings
warnings.filterwarnings('ignore')

import random

# Fix the seed of every RNG visible in this notebook so that a
# "Restart Kernel -> Run All" reproduces the same numbers.
# Defining `seed` alone has no effect; it must be handed to each generator.
# NOTE(review): if a deep-learning framework is imported further down
# (e.g. torch), it needs its own seeding call with this same value.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [2]:
# Load the shared hyper-parameters (notebooks/config/model_params.yml) via the
# "model_params_path" config key.
model_params = ConfigHandler.read_yml("model_params_path")

# Settings common to every model: input window length and forecast horizon.
WINDOWSIZE, HORIZON = (
    model_params['common'][key] for key in ('windowsize', 'horizon')
)

# LightGBM baseline settings: booster parameters and trainer arguments.
LGBM_PARAMS, LGBM_TRAINER_ARGS = (
    model_params['lgbm'][key] for key in ('lgbm_params', 'lgbm_trainer_args')
)

# Echo the loaded values, one per line, so the run is self-describing.
print(
    f"windowsize:{WINDOWSIZE}",
    f"horizon:{HORIZON}",
    f"lgbm_params:{LGBM_PARAMS}",
    f"lgbm_trainer_args:{LGBM_TRAINER_ARGS}",
    sep="\n",
)

windowsize:12
horizon:28
lgbm_params:{'objective': 'l1', 'metric': 'rmse', 'learning_rate': 0.01}
lgbm_trainer_args:{'num_boost_round': 1000, 'early_stopping_rounds': 100, 'verbose_eval': 100}


In [3]:
# Load the raw sales table. Per the displayed output, each row carries the
# identifier columns (id, item_id, dept_id, cat_id, store_id, state_id)
# followed by one daily sales column per day (d_1 ... d_1941).
df_sales = pd.read_csv('./data/sales_train_evaluation.csv')
# Bare last expression -> rich (truncated) display of the frame.
df_sales

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [4]:
# Aggregate to item level: summing the daily sales of all rows that share an
# item_id yields multiple time series with more obvious patterns.
# 'item_id' is deliberately NOT dropped because it is the grouping key below.
df = df_sales.drop(
    ['id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    axis='columns',
)
df_item_id = df.groupby('item_id').sum()
df_item_id

Unnamed: 0_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FOODS_1_001,6,6,4,6,7,18,10,4,11,10,...,7,8,8,4,7,7,5,7,5,9
FOODS_1_002,4,5,7,4,3,4,1,7,2,4,...,5,8,7,2,6,5,0,6,6,4
FOODS_1_003,14,8,3,6,3,8,13,10,11,6,...,7,8,6,11,9,7,7,10,6,5
FOODS_1_004,0,0,0,0,0,0,0,0,0,0,...,115,104,107,81,73,62,71,75,83,93
FOODS_1_005,34,32,13,20,10,21,18,20,25,41,...,15,16,25,9,7,25,19,25,17,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_512,5,4,1,3,2,4,2,8,7,5,...,5,7,11,4,8,3,4,5,4,7
HOUSEHOLD_2_513,0,0,0,0,0,0,0,0,0,0,...,4,0,5,3,6,3,0,9,3,4
HOUSEHOLD_2_514,4,8,2,1,1,2,3,8,2,1,...,2,2,2,6,1,0,1,4,3,3
HOUSEHOLD_2_515,0,0,0,0,0,0,0,0,0,0,...,1,1,5,5,1,2,3,0,3,1


In [5]:
# Rank items by their total sales over the whole period, largest first
# (a high total ~= a more obvious pattern in the series).
df_item_id.sum(axis='columns').sort_values(ascending=False)

item_id
FOODS_3_090        1017916
FOODS_3_586         932236
FOODS_3_252         573723
FOODS_3_555         497881
FOODS_3_587         402159
                    ...   
HOBBIES_2_084          786
HOUSEHOLD_2_005        782
HOUSEHOLD_2_175        759
HOBBIES_2_119          673
HOUSEHOLD_2_101        593
Length: 3049, dtype: int64

In [6]:
# Keep the ten items with the largest total sales, ordered from best seller down.
desc_index = (
    df_item_id
    .sum(axis='columns')
    .sort_values(ascending=False)
    .head(10)
    .index
)
# Every label in desc_index comes from df_item_id itself, so a label lookup
# returns exactly those rows in ranking order.
df_top10 = df_item_id.loc[desc_index]
df_top10

Unnamed: 0_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FOODS_3_090,1046,1036,673,642,531,877,1117,1311,1306,517,...,638,736,675,374,369,380,400,626,682,679
FOODS_3_586,516,479,328,376,319,430,405,624,537,432,...,458,499,556,364,415,375,334,451,498,583
FOODS_3_252,289,273,157,172,132,218,227,363,255,204,...,326,430,378,272,318,330,225,309,345,416
FOODS_3_555,321,315,214,204,159,265,240,425,355,224,...,255,256,269,209,188,215,212,279,295,320
FOODS_3_587,298,353,234,310,205,214,190,572,508,428,...,194,233,352,233,160,190,170,216,240,294
FOODS_3_714,238,215,141,144,96,139,169,277,219,166,...,196,282,238,177,226,219,156,228,268,259
FOODS_3_694,291,230,187,200,136,234,254,325,251,200,...,242,246,255,200,207,178,187,233,248,255
FOODS_3_226,210,168,119,165,116,199,191,235,171,173,...,187,229,251,147,166,171,183,218,246,255
FOODS_3_202,106,113,65,80,75,90,98,133,168,91,...,180,197,233,187,119,126,137,170,182,251
FOODS_3_120,0,0,0,0,0,0,0,0,0,0,...,180,297,278,208,281,183,207,178,276,356


In [7]:
# Visualise the ten best-selling item-level series held in `df_top10`.
# NOTE(review): the chart layout is defined by the project-local
# plotting.Plot.plot_multi_ts helper — confirm details there.
Plot.plot_multi_ts(df_top10)

# Exo features
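- exogenous categorical features: holiday, weekday
- exogenous numerical features: sell-price percentage difference (`sell_price_pdiff`), i.e. the relative deviation of the weekly sell price from its 4-week moving average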

In [17]:
# Weekly sell price per item: average every row that shares an
# (item_id, wm_yr_wk) pair, then flatten the keys back into columns.
df_prices = (
    pd.read_csv('./data/sell_prices.csv')
    .groupby(['item_id', 'wm_yr_wk'])['sell_price']
    .mean()
    .reset_index()
)

# 4-week moving average computed separately for each item; the first three
# weeks of every item have no full window and therefore stay NaN.
# Dropping only the group level keeps the original row labels, so the
# assignment lines up row for row.
df_prices['ma'] = (
    df_prices
    .groupby('item_id')['sell_price']
    .rolling(window=4)
    .mean()
    .reset_index(level=0, drop=True)
)

# Relative deviation of the current price from its moving average.
df_prices['sell_price_pdiff'] = (
    df_prices['sell_price'].sub(df_prices['ma']).div(df_prices['ma'])
)

df_prices

Unnamed: 0,item_id,wm_yr_wk,sell_price,ma,sell_price_pdiff
0,FOODS_1_001,11101,2.00,,
1,FOODS_1_001,11102,2.00,,
2,FOODS_1_001,11103,2.00,,
3,FOODS_1_001,11104,2.00,2.00000,0.000000
4,FOODS_1_001,11105,2.00,2.00000,0.000000
...,...,...,...,...,...
712585,HOUSEHOLD_2_516,11617,5.94,5.81625,0.021277
712586,HOUSEHOLD_2_516,11618,5.94,5.81625,0.021277
712587,HOUSEHOLD_2_516,11619,5.94,5.94000,0.000000
712588,HOUSEHOLD_2_516,11620,5.94,5.94000,0.000000


In [18]:
df_calendar = pd.read_csv('./data/calendar.csv', parse_dates=['date'])


def _join_event_names(event_names):
    """Return a row's non-missing event names, sorted and comma-joined ('' when none)."""
    return ','.join(sorted(event_names.dropna()))


# Collapse the two event columns into a single 'holiday' label per day.
df_calendar['holiday'] = df_calendar[['event_name_1', 'event_name_2']].apply(
    _join_event_names,
    axis=1,
)

# Keep only the day key, the week key (for joining prices) and the two categorical features.
df_calendar = df_calendar[['d', 'wm_yr_wk', 'weekday', 'holiday']]
df_calendar

Unnamed: 0,d,wm_yr_wk,weekday,holiday
0,d_1,11101,Saturday,
1,d_2,11101,Sunday,
2,d_3,11101,Monday,
3,d_4,11101,Tuesday,
4,d_5,11101,Wednesday,
...,...,...,...,...
1964,d_1965,11620,Wednesday,
1965,d_1966,11620,Thursday,
1966,d_1967,11620,Friday,
1967,d_1968,11621,Saturday,


In [19]:
# Categorical calendar features to be turned into integer codes.
cat_cols = ['weekday', 'holiday']
# Per the displayed output, the helper returns the frame with an extra
# '<col>_encoded' column for each entry of cat_cols (e.g. Monday -> 1,
# empty holiday -> 0).
# NOTE(review): `label_encode_mapping` presumably holds the category -> code
# lookup for each column — confirm against preprocess.Preprocessor.
# NOTE(review): this cell overwrites `df_calendar`, so re-running it feeds the
# already-encoded frame back into the helper — confirm that is harmless.
df_calendar, label_encode_mapping = Preprocessor.label_encoding(df_calendar, cat_cols)

df_calendar

Unnamed: 0,d,wm_yr_wk,weekday,holiday,weekday_encoded,holiday_encoded
0,d_1,11101,Saturday,,6,0
1,d_2,11101,Sunday,,7,0
2,d_3,11101,Monday,,1,0
3,d_4,11101,Tuesday,,2,0
4,d_5,11101,Wednesday,,3,0
...,...,...,...,...,...,...
1964,d_1965,11620,Wednesday,,3,0
1965,d_1966,11620,Thursday,,4,0
1966,d_1967,11620,Friday,,5,0
1967,d_1968,11621,Saturday,,6,0


# DeepAR