## Demand forecasting with the Temporal Fusion Transformer
- https://pytorch-forecasting.readthedocs.io/en/stable/tutorials/stallion.html

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023.09.01</div>
<div style="text-align: right"> Last update: 2023.09.01</div>

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
import warnings; warnings.filterwarnings('ignore')
# Plot style. matplotlib 3.6 renamed its bundled seaborn styles to
# "seaborn-v0_8-*" and the old names were removed in later releases, so
# prefer the new name and fall back to the legacy one on older versions.
# (Alternative look: plt.style.use('ggplot'))
_whitegrid_style = "seaborn-v0_8-whitegrid"
plt.style.use(_whitegrid_style if _whitegrid_style in plt.style.available else "seaborn-whitegrid")
%matplotlib inline

In [2]:
import os
import warnings

# Move the working directory three levels up so that later relative paths
# resolve from there.
# NOTE(review): presumably this is meant to land on the repository root, as in
# the upstream tutorial -- confirm for wherever this notebook actually lives.
# The path is relative, so re-running this cell climbs another three levels.
os.chdir("../../..")

In [3]:
import copy
from pathlib import Path
import warnings

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [4]:
from pytorch_forecasting.data.examples import get_stallion_data

# Load the Stallion example data shipped in pytorch_forecasting's examples
# module; the result is used as a pandas DataFrame in every cell below.
data = get_stallion_data()

In [5]:
# Preview the first rows of the freshly loaded data.
data.head()

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,labor_day,independence_day,revolution_day_memorial,regional_games,fifa_u_17_world_cup,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries
0,Agency_22,SKU_01,52.272,2013-01-01,492612703,718394219,25.845238,1168.903668,1069.166193,99.737475,...,0,0,0,0,0,0,0,0,8.532566,0
238,Agency_37,SKU_04,0.0,2013-01-01,492612703,718394219,26.505,1852.273642,1611.466298,240.807344,...,0,0,0,0,0,0,0,0,13.000635,5
237,Agency_59,SKU_03,812.9214,2013-01-01,492612703,718394219,22.219737,1270.795012,1197.18426,73.610752,...,0,0,0,0,0,0,0,0,5.792496,9
236,Agency_11,SKU_01,316.44,2013-01-01,492612703,718394219,25.36,1176.155397,1082.757488,93.397909,...,0,0,0,0,0,0,0,0,7.94095,14
235,Agency_05,SKU_05,420.9093,2013-01-01,492612703,718394219,24.079012,1327.003396,1207.822992,119.180404,...,0,0,0,0,0,0,0,0,8.981168,22


In [6]:
# Inspect column dtypes (agency/sku are categorical, date is a datetime).
data.dtypes

agency                                    category
sku                                       category
volume                                     float64
date                                datetime64[ms]
industry_volume                              int64
soda_volume                                  int64
avg_max_temp                               float64
price_regular                              float64
price_actual                               float64
discount                                   float64
avg_population_2017                          int64
avg_yearly_household_income_2017             int64
easter_day                                   int64
good_friday                                  int64
new_year                                     int64
christmas                                    int64
labor_day                                    int64
independence_day                             int64
revolution_day_memorial                      int64
regional_games                 

In [7]:
# (number of rows, number of columns)
data.shape

(21000, 26)

먼저, 시계열 데이터를 각 행이 하나의 시간 단계(time step)와 하나의 시계열로 식별되는 판다스 데이터프레임으로 변환해야 합니다.  
다행히도 대부분의 데이터 집합은 이미 이 형식으로 되어 있습니다.  
이 자습서에서는 다양한 음료의 판매량을 설명하는 Kaggle의 Stallion 데이터 집합을 사용하겠습니다.  
우리의 과제는 대리점(매장)별로 판매되는 SKU(재고 관리 단위, 즉 제품)의 판매량을 향후 6개월에 대해 예측하는 것입니다.  
월 단위로 집계된 과거 판매 기록이 약 21,000건 있습니다.  
과거 판매량 외에도 판매 가격, 대리점 위치, 공휴일과 같은 특별한 날, 전체 업계에서 판매된 수량에 대한 정보가 있습니다.

데이터 세트는 이미 올바른 형식이지만 몇 가지 중요한 피처(feature)가 누락되어 있습니다.  
가장 중요한 것은 각 시간 단계마다 1씩 증가하는 시간 인덱스를 추가해야 한다는 것입니다.  
또한 날짜 피처를 추가하는 것이 유용하며, 이 경우 날짜 레코드에서 월을 추출하는 것을 의미합니다.

In [8]:
# Monthly step counter: consecutive calendar months differ by exactly 1.
# The value is not zero-based (2013-01 maps to 2013 * 12 + 1 = 24157).
data["time_idx"] = 12 * data["date"].dt.year + data["date"].dt.month

In [9]:
# Confirm that the new time_idx column was appended.
data.head()

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,independence_day,revolution_day_memorial,regional_games,fifa_u_17_world_cup,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries,time_idx
0,Agency_22,SKU_01,52.272,2013-01-01,492612703,718394219,25.845238,1168.903668,1069.166193,99.737475,...,0,0,0,0,0,0,0,8.532566,0,24157
238,Agency_37,SKU_04,0.0,2013-01-01,492612703,718394219,26.505,1852.273642,1611.466298,240.807344,...,0,0,0,0,0,0,0,13.000635,5,24157
237,Agency_59,SKU_03,812.9214,2013-01-01,492612703,718394219,22.219737,1270.795012,1197.18426,73.610752,...,0,0,0,0,0,0,0,5.792496,9,24157
236,Agency_11,SKU_01,316.44,2013-01-01,492612703,718394219,25.36,1176.155397,1082.757488,93.397909,...,0,0,0,0,0,0,0,7.94095,14,24157
235,Agency_05,SKU_05,420.9093,2013-01-01,492612703,718394219,24.079012,1327.003396,1207.822992,119.180404,...,0,0,0,0,0,0,0,8.981168,22,24157


In [10]:
# Derived features.

# Calendar month as a categorical; category values must be strings.
data["month"] = data["date"].dt.month.astype(str).astype("category")

# Log of the target; the 1e-8 offset keeps zero volumes finite
# (log(1e-8) is about -18.42).
data["log_volume"] = np.log(data["volume"] + 1e-8)

# Mean volume per time step, taken across each SKU and across each agency.
for _column, _key in (("avg_volume_by_sku", "sku"), ("avg_volume_by_agency", "agency")):
    data[_column] = data.groupby(["time_idx", _key], observed=True)["volume"].transform("mean")

In [11]:
# Re-check the dtypes after adding the derived columns.
data.dtypes

agency                                    category
sku                                       category
volume                                     float64
date                                datetime64[ms]
industry_volume                              int64
soda_volume                                  int64
avg_max_temp                               float64
price_regular                              float64
price_actual                               float64
discount                                   float64
avg_population_2017                          int64
avg_yearly_household_income_2017             int64
easter_day                                   int64
good_friday                                  int64
new_year                                     int64
christmas                                    int64
labor_day                                    int64
independence_day                             int64
revolution_day_memorial                      int64
regional_games                 

In [12]:
# The special days arrive one-hot encoded (one 0/1 column each). To treat them
# as a single grouped categorical later, first turn every flag column into a
# label column: "-" when the flag is 0, the column's own name when it is 1.
special_days = [
    "easter_day",
    "good_friday",
    "new_year",
    "christmas",
    "labor_day",
    "independence_day",
    "revolution_day_memorial",
    "regional_games",
    "fifa_u_17_world_cup",
    "football_gold_cup",
    "beer_capital",
    "music_fest",
]


def _flag_to_label(flag_column):
    """Map a 0/1 flag column to "-" / the column's own name."""
    return flag_column.map({0: "-", 1: flag_column.name})


# NOTE: this cell is not re-runnable -- once converted, the columns hold
# strings, which the {0, 1} mapping would turn into NaN on a second pass.
data[special_days] = data[special_days].apply(_flag_to_label).astype("category")
data.sample(10, random_state=521)

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries,time_idx,month,log_volume,avg_volume_by_sku,avg_volume_by_agency
291,Agency_25,SKU_03,0.5076,2013-01-01,492612703,718394219,25.845238,1264.162234,1152.473405,111.688829,...,-,-,-,8.835008,228,24157,1,-0.678062,1225.306376,99.6504
871,Agency_29,SKU_02,8.748,2015-01-01,498567142,762225057,27.584615,1316.098485,1296.804924,19.293561,...,-,-,-,1.465966,177,24181,1,2.168825,1634.434615,11.397086
19532,Agency_47,SKU_01,4.968,2013-09-01,454252482,789624076,30.665957,1269.25,1266.49049,2.75951,...,-,-,-,0.217413,322,24165,9,1.603017,2625.472644,48.29565
2089,Agency_53,SKU_07,21.6825,2013-10-01,480693900,791658684,29.197727,1193.842373,1128.124395,65.717978,...,-,beer_capital,-,5.504745,240,24166,10,3.076505,38.529107,2511.035175
9755,Agency_17,SKU_02,960.552,2015-03-01,515468092,871204688,23.60812,1338.334248,1232.128069,106.206179,...,-,-,music_fest,7.935699,259,24183,3,6.867508,2143.677462,396.02214
7561,Agency_05,SKU_03,1184.6535,2014-02-01,425528909,734443953,28.668254,1369.556376,1161.135214,208.421162,...,-,-,-,15.218151,21,24170,2,7.077206,1566.643589,1881.866367
19204,Agency_11,SKU_05,5.5593,2017-08-01,623319783,1049868815,31.915385,1922.486644,1651.307674,271.17897,...,-,-,-,14.105636,17,24212,8,1.715472,1385.225478,109.6992
8781,Agency_48,SKU_04,4275.1605,2013-03-01,509281531,892192092,26.767857,1761.258209,1546.05967,215.198539,...,-,-,music_fest,12.218455,151,24159,3,8.360577,1757.950603,1925.272108
2540,Agency_07,SKU_21,0.0,2015-10-01,544203593,761469815,28.987755,0.0,0.0,0.0,...,-,-,-,0.0,300,24190,10,-18.420681,0.0,2418.71955
12084,Agency_21,SKU_03,46.3608,2017-04-01,589969396,940912941,32.47891,1675.922116,1413.571789,262.350327,...,-,-,-,15.654088,181,24208,4,3.836454,2034.293024,109.3818


In [13]:
# Preview the frame after the special-day flags were converted to labels.
data.head()

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries,time_idx,month,log_volume,avg_volume_by_sku,avg_volume_by_agency
0,Agency_22,SKU_01,52.272,2013-01-01,492612703,718394219,25.845238,1168.903668,1069.166193,99.737475,...,-,-,-,8.532566,0,24157,1,3.956461,2613.377501,103.80546
238,Agency_37,SKU_04,0.0,2013-01-01,492612703,718394219,26.505,1852.273642,1611.466298,240.807344,...,-,-,-,13.000635,5,24157,1,-18.420681,1361.511918,0.5499
237,Agency_59,SKU_03,812.9214,2013-01-01,492612703,718394219,22.219737,1270.795012,1197.18426,73.610752,...,-,-,-,5.792496,9,24157,1,6.700634,1225.306376,2041.909586
236,Agency_11,SKU_01,316.44,2013-01-01,492612703,718394219,25.36,1176.155397,1082.757488,93.397909,...,-,-,-,7.94095,14,24157,1,5.757134,2613.377501,125.69022
235,Agency_05,SKU_05,420.9093,2013-01-01,492612703,718394219,24.079012,1327.003396,1207.822992,119.180404,...,-,-,-,8.981168,22,24157,1,6.042417,1179.728165,1638.4635


### Create dataset and dataloaders

다음 단계는 데이터프레임을 PyTorch Forecasting의 `TimeSeriesDataSet`으로 변환하는 것입니다.  
데이터 세트에 어떤 특징이 범주형인지 연속형인지, 어떤 특징이 정적인지 시간에 따라 변화하는지 알려주는 것 외에도 데이터를 정규화하는 방법도 결정해야 합니다.  
여기서는 각 시계열을 개별적으로 표준 스케일링하고, 값이 항상 양수임을 지정합니다(아래 코드의 `GroupNormalizer`와 `softplus` 변환). 일반적으로는 정규화로 인해 생기는 룩어헤드 바이어스(look-ahead bias)를 피하기 위해, 학습 시 각 인코더 시퀀스마다 동적으로 스케일링하는 `EncoderNormalizer`가 더 바람직합니다. 다만 데이터에 0이 많아 충분히 안정적인 정규화를 찾기 어려운 경우에는 룩어헤드 바이어스를 감수할 수 있습니다. 추론 시점에 정규화가 더 안정적일 것으로 기대되는 경우에도 마찬가지입니다. 후자의 경우, 추론 시에는 나타나지 않을 "이상한" 점프를 모델이 학습하지 않게 되므로 더 현실적인 데이터로 학습하는 셈이 됩니다.

In [14]:
# Forecast horizon in time steps (time_idx is monthly, so 6 months).
max_prediction_length = 6
# Maximum history length, in time steps, fed to the encoder.
max_encoder_length = 24
# Last time_idx used for training; the final `max_prediction_length` steps
# after it are excluded from the training dataset built below.
training_cutoff = data["time_idx"].max() - max_prediction_length

In [15]:
# Show the computed cutoff value.
training_cutoff

24210

In [16]:
# Training dataset: only rows up to and including the cutoff are used, so the
# final `max_prediction_length` time steps stay out of training.
training = TimeSeriesDataSet(
    data.loc[data["time_idx"] <= training_cutoff],
    # -- what identifies a series, a step and the target --
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    # -- window sizes --
    # keep the encoder long (as it is in the validation set)
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # -- static features --
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    # -- features known ahead of time --
    time_varying_known_categoricals=["special_days", "month"],
    # the special-day columns form a group that is treated as one variable
    variable_groups={"special_days": special_days},
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    # -- features only known up to the present --
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    # -- target scaling: normalize per (agency, sku) group with softplus --
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"],
        transformation="softplus",
    ),
    # -- extra generated features --
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)