In [1]:
import copy
from pathlib import Path
import warnings

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (
    optimize_hyperparameters,
)

In [32]:
# Define relative file paths for training and testing data
train_file = "..\\Data\\Train\\trains1990s.csv"

# Load Training Data
train_df = pd.read_csv(train_file, parse_dates=['observation_date'], date_format="%m/%y%")

train_df = train_df.iloc[:,:2]
train_df.set_index('observation_date', inplace=True)
# add time index
train_df["time_id"] = range(1, len(train_df) + 1)
train_df["group_id"] = 0
train_df


Unnamed: 0_level_0,fred_PCEPI,time_id,group_id
observation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/1990,58.553,1,0
02/1990,58.811,2,0
03/1990,59.033,3,0
04/1990,59.157,4,0
05/1990,59.290,5,0
...,...,...,...
08/2020,104.887,368,0
09/2020,105.046,369,0
10/2020,105.106,370,0
11/2020,105.225,371,0


In [None]:
max_prediction_length = 6
max_encoder_length = 24
training_cutoff = train_df["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    train_df[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="fred_PCEPI",
    group_ids=["group_id"],
    min_encoder_length=max_encoder_length
    // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus"
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)