## How to use Transformer Networks to build a Forecasting model: training
- https://towardsdatascience.com/how-to-use-transformer-networks-to-build-a-forecasting-model-297f9270e630

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023.11.06</div>
<div style="text-align: right"> Last update: 2023.11.06</div>

In [1]:
import datetime
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline
# print(plt.stype.available)

# Options for pandas
pd.options.display.max_columns = 30

In [2]:
import json
from tqdm import tqdm
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torchinfo import summary
from model import TimeSeriesForcasting
from train_utils import split_df, pad_arr, df_to_np

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader

In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, groups, grp_by, split, features, target):
        self.groups = groups
        self.grp_by = grp_by
        self.split = split
        self.features = features
        self.target = target

    def __len__(self):
        return len(self.groups)

    def __getitem__(self, idx):
        group = self.groups[idx]

        df = self.grp_by.get_group(group)

        src, trg = split_df(df, split=self.split)

        src = src[self.features + [self.target]]

        src = df_to_np(src)

        trg_in = trg[self.features + [f"{self.target}_lag_1"]]

        trg_in = np.array(trg_in)
        trg_out = np.array(trg[self.target])

        src = torch.tensor(src, dtype=torch.float)
        trg_in = torch.tensor(trg_in, dtype=torch.float)
        trg_out = torch.tensor(trg_out, dtype=torch.float)

        return src, trg_in, trg_out

In [4]:
data_csv_path = "data/processed_data.csv"
feature_target_names_path = "data/config.json"

In [5]:
data = pd.read_csv(data_csv_path)

In [6]:
data.head()

Unnamed: 0,timestamp,index,article,amplitude,offset,views,day_of_month,day_of_year,month,week_of_year,year,views_lag_1
0,2015-01-01,0,e288d86c0c8641a7b25ba1cc435e28d7,3.203435,0.774873,2.014996,0.032258,0.00274,0.083333,0.018868,0.0,0.0
1,2015-01-02,1,e288d86c0c8641a7b25ba1cc435e28d7,3.197775,0.743587,3.224294,0.064516,0.005479,0.083333,0.018868,0.0,2.014996
2,2015-01-03,2,e288d86c0c8641a7b25ba1cc435e28d7,3.192114,0.716603,2.987859,0.096774,0.008219,0.083333,0.018868,0.0,3.224294
3,2015-01-04,3,e288d86c0c8641a7b25ba1cc435e28d7,3.186454,0.694619,3.996244,0.129032,0.010959,0.083333,0.018868,0.0,2.987859
4,2015-01-05,4,e288d86c0c8641a7b25ba1cc435e28d7,3.180794,0.678206,3.76322,0.16129,0.013699,0.083333,0.037736,0.0,3.996244


In [7]:
with open(feature_target_names_path) as f:
    feature_target_names = json.load(f)

In [8]:
feature_target_names

{'features': ['day_of_month', 'day_of_year', 'month', 'week_of_year', 'year'],
 'target': 'views',
 'group_by_key': 'article',
 'lag_features': ['views_lag_1']}

In [9]:
data_train = data[~data[feature_target_names["target"]].isna()]
data_train

Unnamed: 0,timestamp,index,article,amplitude,offset,views,day_of_month,day_of_year,month,week_of_year,year,views_lag_1
0,2015-01-01,0,e288d86c0c8641a7b25ba1cc435e28d7,3.203435,0.774873,2.014996,0.032258,0.002740,0.083333,0.018868,0.0,0.000000
1,2015-01-02,1,e288d86c0c8641a7b25ba1cc435e28d7,3.197775,0.743587,3.224294,0.064516,0.005479,0.083333,0.018868,0.0,2.014996
2,2015-01-03,2,e288d86c0c8641a7b25ba1cc435e28d7,3.192114,0.716603,2.987859,0.096774,0.008219,0.083333,0.018868,0.0,3.224294
3,2015-01-04,3,e288d86c0c8641a7b25ba1cc435e28d7,3.186454,0.694619,3.996244,0.129032,0.010959,0.083333,0.018868,0.0,2.987859
4,2015-01-05,4,e288d86c0c8641a7b25ba1cc435e28d7,3.180794,0.678206,3.763220,0.161290,0.013699,0.083333,0.037736,0.0,3.996244
...,...,...,...,...,...,...,...,...,...,...,...,...
36539995,2019-12-28,1822,28939ba7e81d47a8944d2029a7966c9e,-1.324961,-1.346280,-0.415053,0.903226,0.991781,1.000000,0.981132,0.8,-0.409966
36539996,2019-12-29,1823,28939ba7e81d47a8944d2029a7966c9e,-1.326153,-1.193783,-0.266651,0.935484,0.994521,1.000000,0.981132,0.8,-0.415053
36539997,2019-12-30,1824,28939ba7e81d47a8944d2029a7966c9e,-1.327345,-1.035194,-0.247692,0.967742,0.997260,1.000000,0.018868,0.8,-0.266651
36539998,2019-12-31,1825,28939ba7e81d47a8944d2029a7966c9e,-1.328537,-0.875333,-0.433264,1.000000,1.000000,1.000000,0.018868,0.8,-0.247692


In [10]:
log_dir = "ts_logs"
model_dir = "ts_models"
batch_size = 128
epochs = 10
horizon_size = 30

In [11]:
grp_by_train = data_train.groupby(by = feature_target_names["group_by_key"])
groups = list(grp_by_train.groups)

In [12]:
groups

['00029e3ab17145d4b67490c5ba410959',
 '0007dd782f1b49c5bc0097d3948368eb',
 '0009445ebeed4697afdafcaef3695b62',
 '000bae0555fd4e459454fc06d71f969e',
 '000d333b96674836afcc52d9216a2e09',
 '000dd95e176a4686b8554b97762424e0',
 '000ddc08df3a4748add8c193ce277482',
 '0012d650d9b64f1dab35498ca4b6943e',
 '0013b836a0ae4ca2b4f94e2b01cba2fe',
 '0013b90f8e01445eb38dffad5502fc41',
 '00141ad925ce495fb90f8bd1fa5295c2',
 '001880f9d86b4c19b583ae391bca044e',
 '001b4ff1b1bc4a56bc3cc914d3c302d2',
 '001cb5c5243b4d9d9a83e642521ac85f',
 '001d95052d5b448a8435fd0a79b169c5',
 '001ef8f6697e4ef8838cb699c7779344',
 '00213dc0ff8d4d549bcc7eb8818052f0',
 '0025b5f1843842ea9c000300f37262ba',
 '00268e35bc26482c9fe3fe674269cde6',
 '002c2a6426e24cd4b0f7c219850f4b57',
 '002f1c2bfcb246e6876617306c2dad7b',
 '0030b80d4aa147c69d87e3d0b59b593d',
 '003124caf83f47ba9043373ab6a20b89',
 '003e59f1b1554e308c47415269f87f73',
 '00416be1b6d349dabf2810c722a40da7',
 '00498e212aa948c1997ffd7489915040',
 '004a8751334c404183ce068b2021601f',
 

In [13]:
grp_by_train.get_group(groups[0])

Unnamed: 0,timestamp,index,article,amplitude,offset,views,day_of_month,day_of_year,month,week_of_year,year,views_lag_1
13622112,2015-01-01,0,00029e3ab17145d4b67490c5ba410959,-1.083332,0.280487,0.886673,0.032258,0.002740,0.083333,0.018868,0.0,0.000000
13622113,2015-01-02,1,00029e3ab17145d4b67490c5ba410959,-1.081474,0.137385,0.512138,0.064516,0.005479,0.083333,0.018868,0.0,0.886673
13622114,2015-01-03,2,00029e3ab17145d4b67490c5ba410959,-1.079616,-0.003654,0.384091,0.096774,0.008219,0.083333,0.018868,0.0,0.512138
13622115,2015-01-04,3,00029e3ab17145d4b67490c5ba410959,-1.077759,-0.138979,0.324364,0.129032,0.010959,0.083333,0.018868,0.0,0.384091
13622116,2015-01-05,4,00029e3ab17145d4b67490c5ba410959,-1.075901,-0.265083,0.158865,0.161290,0.013699,0.083333,0.037736,0.0,0.324364
...,...,...,...,...,...,...,...,...,...,...,...,...
13623934,2019-12-28,1822,00029e3ab17145d4b67490c5ba410959,-0.652312,1.073367,1.270303,0.903226,0.991781,1.000000,0.981132,0.8,1.238714
13623935,2019-12-29,1823,00029e3ab17145d4b67490c5ba410959,-0.650454,1.100713,1.356222,0.935484,0.994521,1.000000,0.981132,0.8,1.270303
13623936,2019-12-30,1824,00029e3ab17145d4b67490c5ba410959,-0.648596,1.105171,1.297386,0.967742,0.997260,1.000000,0.018868,0.8,1.356222
13623937,2019-12-31,1825,00029e3ab17145d4b67490c5ba410959,-0.646738,1.086627,1.358771,1.000000,1.000000,1.000000,0.018868,0.8,1.297386


In [14]:
full_groups = [
        grp for grp in groups if grp_by_train.get_group(grp).shape[0] > 2 * horizon_size
    ]

full_groups

['00029e3ab17145d4b67490c5ba410959',
 '0007dd782f1b49c5bc0097d3948368eb',
 '0009445ebeed4697afdafcaef3695b62',
 '000bae0555fd4e459454fc06d71f969e',
 '000d333b96674836afcc52d9216a2e09',
 '000dd95e176a4686b8554b97762424e0',
 '000ddc08df3a4748add8c193ce277482',
 '0012d650d9b64f1dab35498ca4b6943e',
 '0013b836a0ae4ca2b4f94e2b01cba2fe',
 '0013b90f8e01445eb38dffad5502fc41',
 '00141ad925ce495fb90f8bd1fa5295c2',
 '001880f9d86b4c19b583ae391bca044e',
 '001b4ff1b1bc4a56bc3cc914d3c302d2',
 '001cb5c5243b4d9d9a83e642521ac85f',
 '001d95052d5b448a8435fd0a79b169c5',
 '001ef8f6697e4ef8838cb699c7779344',
 '00213dc0ff8d4d549bcc7eb8818052f0',
 '0025b5f1843842ea9c000300f37262ba',
 '00268e35bc26482c9fe3fe674269cde6',
 '002c2a6426e24cd4b0f7c219850f4b57',
 '002f1c2bfcb246e6876617306c2dad7b',
 '0030b80d4aa147c69d87e3d0b59b593d',
 '003124caf83f47ba9043373ab6a20b89',
 '003e59f1b1554e308c47415269f87f73',
 '00416be1b6d349dabf2810c722a40da7',
 '00498e212aa948c1997ffd7489915040',
 '004a8751334c404183ce068b2021601f',
 

In [15]:
len(full_groups)

20000

In [16]:
train_data = Dataset(
        groups=full_groups,
        grp_by=grp_by_train,
        split="train",
        features=feature_target_names["features"],
        target=feature_target_names["target"],
    )

val_data = Dataset(
        groups=full_groups,
        grp_by=grp_by_train,
        split="val",
        features=feature_target_names["features"],
        target=feature_target_names["target"],
    )

In [17]:
print("len(train_data)", len(train_data))
print("len(val_data)", len(val_data))

len(train_data) 20000
len(val_data) 20000


In [18]:
# train_loader = DataLoader(
#         train_data,
#         batch_size=batch_size,
#         num_workers=10,
#         shuffle=True,
#     )
# val_loader = DataLoader(
#         val_data,
#         batch_size=batch_size,
#         num_workers=10,
#         shuffle=False,
#     )

In [19]:
train_loader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
    )
val_loader = DataLoader(
        val_data,
        batch_size=batch_size,
        shuffle=False,
    )

In [20]:
model = TimeSeriesForcasting(
        n_encoder_inputs=len(feature_target_names["features"]) + 1,
        n_decoder_inputs=len(feature_target_names["features"]) + 1,
        lr=1e-5,
        dropout=0.1,
    )

In [21]:
model

TimeSeriesForcasting(
  (input_pos_embedding): Embedding(1024, 512)
  (target_pos_embedding): Embedding(1024, 512)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamically

In [22]:
logger = TensorBoardLogger(
        save_dir=log_dir,
    )

checkpoint_callback = ModelCheckpoint(
        monitor="valid_loss",
        mode="min",
        dirpath=model_dir,
        filename="ts",
    )

In [23]:
trainer = pl.Trainer(
        max_epochs=epochs,
        logger=logger,
        callbacks=[checkpoint_callback],
    )

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(model, train_loader, val_loader)


  | Name                 | Type               | Params
------------------------------------------------------------
0 | input_pos_embedding  | Embedding          | 524 K 
1 | target_pos_embedding | Embedding          | 524 K 
2 | encoder              | TransformerEncoder | 25.2 M
3 | decoder              | TransformerDecoder | 33.6 M
4 | input_projection     | Linear             | 3.6 K 
5 | output_projection    | Linear             | 3.6 K 
6 | linear               | Linear             | 513   
7 | do                   | Dropout            | 0     
------------------------------------------------------------
59.9 M    Trainable params
0         Non-trainable params
59.9 M    Total params
239.630   Total estimated model params size (MB)


Epoch 9: 100%|██████████| 157/157 [03:32<00:00,  0.74it/s, v_num=5]        

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 157/157 [03:33<00:00,  0.74it/s, v_num=5]


In [25]:
output_json_path = "models/trained_config.json"

In [28]:
result_val = trainer.test(
    model=model,
    dataloaders=val_loader)

output_json = {
        "val_loss": result_val[0]["test_loss"],
        "best_model_path": checkpoint_callback.best_model_path,
        }

Testing DataLoader 0: 100%|██████████| 157/157 [00:47<00:00,  3.28it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.5206373333930969
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [29]:
if output_json_path is not None:
    with open(output_json_path, "w") as f:
        json.dump(output_json, f, indent=4)

In [30]:
result_val

[{'test_loss': 0.5206373333930969}]