In [19]:
# Notebook setup: (re)load the IPython extensions used below.
%reload_ext watermark
%reload_ext autoreload
# Mode 2 of autoreload: re-import edited modules before running each cell.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off the jedi-based tab completer.
%config IPCompleter.use_jedi=False
# Print the Python/IPython versions (-v) and the versions of the listed packages (-p).
%watermark -v -p numpy,pandas,matplotlib,sklearn,torch,torchvision,pytorch_lightning,pytorch_forecasting

CPython 3.6.9
IPython 7.16.1

numpy 1.19.4
pandas 1.1.5
matplotlib 3.3.3
sklearn 0.24.0
torch 1.8.0.dev20210103+cu101
torchvision 0.9.0.dev20210103+cu101
pytorch_lightning 1.2.0
pytorch_forecasting 0.8.3


In [7]:
%%bash

# Host name of the internal package mirror used for the first install command.
mirror="mirrors.intra.didiyun.com"

# statsmodels and pyarrow are fetched from the mirror. The index URL is plain
# HTTP, so the host has to be whitelisted explicitly with --trusted-host.
# NOTE(review): no versions are pinned here, so a re-run may install different
# releases than the ones this notebook was executed with.
pip install statsmodels pyarrow --trusted-host $mirror --index-url http://$mirror/pip/simple
# pytorch-forecasting is installed without the mirror options (pip's configured
# index is used), with a longer socket timeout and more retries than pip's defaults.
pip install pytorch-forecasting --timeout 120 --retries 20 

Looking in indexes: http://mirrors.intra.didiyun.com/pip/simple
Collecting pyarrow
  Downloading http://mirrors.intra.didiyun.com/pip/packages/33/67/2f4fcce1b41bcc7e88a6bfdb42046597ae72e5bc95c2789b7c5ac893c433/pyarrow-3.0.0-cp36-cp36m-manylinux2014_x86_64.whl (20.7 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-3.0.0


You should consider upgrading via the '/usr/bin/python3.6 -m pip install --upgrade pip' command.
You should consider upgrading via the '/usr/bin/python3.6 -m pip install --upgrade pip' command.


In [4]:
import os
import warnings

# Suppress every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries used below; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

In [33]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters


from pytorch_forecasting.data.examples import get_stallion_data

In [9]:
# Load the Stallion example dataset through the helper shipped in
# pytorch_forecasting.data.examples.
data = get_stallion_data()

# Preview the first five rows (one row per agency / SKU / date).
data.head(n=5)

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,labor_day,independence_day,revolution_day_memorial,regional_games,fifa_u_17_world_cup,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries
0,Agency_22,SKU_01,52.272,2013-01-01,492612703,718394219,25.845238,1168.903668,1069.166193,99.737475,...,0,0,0,0,0,0,0,0,8.532566,0
238,Agency_37,SKU_04,0.0,2013-01-01,492612703,718394219,26.505,1852.273642,1611.466298,240.807344,...,0,0,0,0,0,0,0,0,13.000635,5
237,Agency_59,SKU_03,812.9214,2013-01-01,492612703,718394219,22.219737,1270.795012,1197.18426,73.610752,...,0,0,0,0,0,0,0,0,5.792496,9
236,Agency_11,SKU_01,316.44,2013-01-01,492612703,718394219,25.36,1176.155397,1082.757488,93.397909,...,0,0,0,0,0,0,0,0,7.94095,14
235,Agency_05,SKU_05,420.9093,2013-01-01,492612703,718394219,24.079012,1327.003396,1207.822992,119.180404,...,0,0,0,0,0,0,0,0,8.981168,22


In [34]:
# Integer time index: months since the earliest month in the data, so that
# consecutive calendar months map to consecutive integers starting at 0.
data["time_idx"] = data["date"].dt.year * 12 + data["date"].dt.month
data["time_idx"] -= data["time_idx"].min()

# Additional features.
data["month"] = data.date.dt.month.astype(str).astype("category")  # categories have to be strings
# The small offset keeps the logarithm finite for rows with zero volume.
data["log_volume"] = np.log(data.volume + 1e-8)
# Mean volume per SKU and per agency within each time step, broadcast back to every row.
data["avg_volume_by_sku"] = data.groupby(["time_idx", "sku"], observed=True).volume.transform("mean")
data["avg_volume_by_agency"] = data.groupby(["time_idx", "agency"], observed=True).volume.transform("mean")

# we want to encode special days as one variable and thus need to first reverse one-hot encoding
special_days = [
    "easter_day",
    "good_friday",
    "new_year",
    "christmas",
    "labor_day",
    "independence_day",
    "revolution_day_memorial",
    "regional_games",
    "fifa_u_17_world_cup",
    "football_gold_cup",
    "beer_capital",
    "music_fest",
]
# Replace each 0/1 flag by "-" / the column's own name. Only columns that are
# still numeric are decoded: on a re-run of this cell the columns already hold
# the decoded string categories, and mapping them through {0: ..., 1: ...}
# again would turn every value into NaN. Skipping them keeps the cell idempotent.
data[special_days] = (
    data[special_days]
    .apply(lambda x: x.map({0: "-", 1: x.name}) if pd.api.types.is_numeric_dtype(x) else x)
    .astype("category")
)

# Show a reproducible random sample of the engineered frame.
data.sample(10, random_state=521)

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,football_gold_cup,beer_capital,music_fest,discount_in_percent,timeseries,time_idx,month,log_volume,avg_volume_by_sku,avg_volume_by_agency
291,Agency_25,SKU_03,0.5076,2013-01-01,492612703,718394219,25.845238,1264.162234,1152.473405,111.688829,...,-,-,-,8.835008,228,0,1,-0.678062,1225.306376,99.6504
871,Agency_29,SKU_02,8.748,2015-01-01,498567142,762225057,27.584615,1316.098485,1296.804924,19.293561,...,-,-,-,1.465966,177,24,1,2.168825,1634.434615,11.397086
19532,Agency_47,SKU_01,4.968,2013-09-01,454252482,789624076,30.665957,1269.25,1266.49049,2.75951,...,-,-,-,0.217413,322,8,9,1.603017,2625.472644,48.29565
2089,Agency_53,SKU_07,21.6825,2013-10-01,480693900,791658684,29.197727,1193.842373,1128.124395,65.717978,...,-,beer_capital,-,5.504745,240,9,10,3.076505,38.529107,2511.035175
9755,Agency_17,SKU_02,960.552,2015-03-01,515468092,871204688,23.60812,1338.334248,1232.128069,106.206179,...,-,-,music_fest,7.935699,259,26,3,6.867508,2143.677462,396.02214
7561,Agency_05,SKU_03,1184.6535,2014-02-01,425528909,734443953,28.668254,1369.556376,1161.135214,208.421162,...,-,-,-,15.218151,21,13,2,7.077206,1566.643589,1881.866367
19204,Agency_11,SKU_05,5.5593,2017-08-01,623319783,1049868815,31.915385,1922.486644,1651.307674,271.17897,...,-,-,-,14.105636,17,55,8,1.715472,1385.225478,109.6992
8781,Agency_48,SKU_04,4275.1605,2013-03-01,509281531,892192092,26.767857,1761.258209,1546.05967,215.198539,...,-,-,music_fest,12.218455,151,2,3,8.360577,1757.950603,1925.272108
2540,Agency_07,SKU_21,0.0,2015-10-01,544203593,761469815,28.987755,0.0,0.0,0.0,...,-,-,-,0.0,300,33,10,-18.420681,0.0,2418.71955
12084,Agency_21,SKU_03,46.3608,2017-04-01,589969396,940912941,32.47891,1675.922116,1413.571789,262.350327,...,-,-,-,15.654088,181,51,4,3.836454,2034.293024,109.3818


In [36]:
# Summary statistics of the numeric columns, transposed so that each feature
# is a row and each statistic a column.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
volume,21000.0,1492.404,2711.497,0.0,8.272388,158.436,1774.793,22526.61
industry_volume,21000.0,543921400.0,62880220.0,413051800.0,509055300.0,551200000.0,589371500.0,670015700.0
soda_volume,21000.0,851200000.0,78243400.0,696401500.0,789088000.0,864919600.0,900555100.0,1049869000.0
avg_max_temp,21000.0,28.6124,3.972833,16.73103,25.37482,28.47927,31.5684,45.29048
price_regular,21000.0,1451.536,683.3624,0.0,1311.547,1495.175,1725.652,19166.62
price_actual,21000.0,1267.347,587.7573,-3121.69,1178.366,1324.696,1517.311,4925.404
discount,21000.0,184.3741,257.47,0.0,54.93511,138.3072,272.2986,19166.62
avg_population_2017,21000.0,1045065.0,929192.6,12271.0,60189.0,1232242.0,1729177.0,3137874.0
avg_yearly_household_income_2017,21000.0,151073.5,50409.59,90240.0,110057.0,131411.0,206553.0,247220.0
discount_in_percent,21000.0,10.57488,9.590813,0.0,3.749628,8.94899,15.64706,226.7401


In [37]:
# Window sizes, expressed in `time_idx` steps.
max_prediction_length = 6  # decoder (forecast) length
max_encoder_length = 24  # encoder (history) length
# Rows after this index are excluded from the training dataset.
training_cutoff = data["time_idx"].max() - max_prediction_length

# Real-valued covariates that are only known up to the forecast start
# (the target itself is one of them).
observed_reals = [
    "volume",
    "log_volume",
    "industry_volume",
    "soda_volume",
    "avg_max_temp",
    "avg_volume_by_agency",
    "avg_volume_by_sku",
]

# Target scaling: softplus transformation, normalised separately per (agency, sku) group.
volume_normalizer = GroupNormalizer(groups=["agency", "sku"], transformation="softplus")

training = TimeSeriesDataSet(
    data.loc[data["time_idx"] <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    # window lengths
    # keep the encoder long (as it is in the validation set)
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # static features
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    # features known in advance for future time steps
    time_varying_known_categoricals=["special_days", "month"],
    # the individual special-day columns are treated as one grouped categorical variable
    variable_groups={"special_days": special_days},
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    # features only observed in the past
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=observed_reals,
    target_normalizer=volume_normalizer,
    # extra features the dataset derives by itself
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# The validation dataset reuses the training dataset's configuration but is built
# from the full frame, in prediction mode and with randomization switched off.
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)

batch_size = 128  # choose a value between 32 and 128
val_batch_size = batch_size * 10  # validation uses a 10x larger batch
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=val_batch_size, num_workers=0)