# TFT multivariate experiments

This notebook contains the experiments for TFT on on the three multivariate datasets.
1. Contrary, to Informer, Autoformer and DLinear implementation, we manually need to scale and split the datasets.
2. Add time features such as month of the year, day of week etc..
3. Implement an experiment_main() method that runs the whole model training across datasets and prediction lengths
4. Wrap the data in the required format by pytorch-forecasting and set up the model
5. We calculate the results with a rolling test data set

In [None]:
import copy

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger

import numpy as np
import os
import pandas as pd
from pathlib import Path
import pickle

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import EncoderNormalizer
from pytorch_forecasting.data import MultiNormalizer
from pytorch_forecasting.data import TorchNormalizer
from pytorch_forecasting.metrics import MAE, RMSE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

from sklearn.preprocessing import StandardScaler
import torch
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')

## 1. Load, Scale, Split Data

In [2]:
file_path = '../../01_datasets/df_all_columns.csv'
df_all_columns = pd.read_csv(file_path)

file_path = '../../01_datasets/df_most_important_columns.csv'
df_most_important_columns = pd.read_csv(file_path)

file_path = '../../01_datasets/df_only_generation_columns.csv'
df_only_generation_columns = pd.read_csv(file_path)

In [3]:
def preprocess_data(df):
    
    # Extract the date column
    dates = df['date']
    
    # Exclude the date column for scaling
    df = df.drop(columns=['date'])
    
    # Define the sizes for training, validation, and test sets (70%,10%,20%)
    train_size = int(len(df) * 0.7)
    val_size = int((len(df) * 0.1)+1)
    test_size = len(df) - train_size - val_size
    
    # Split the data into training, validation, and test set
    train_data = df.iloc[:train_size]
    val_data = df.iloc[train_size:train_size + val_size]
    test_data = df.iloc[train_size + val_size:]
    
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit the scaler on the training data
    scaler.fit(train_data)
    
    # Transform the datasets using the same scaler
    train_standardized = scaler.transform(train_data)
    val_standardized = scaler.transform(val_data)
    test_standardized = scaler.transform(test_data)
    
    # Create new DataFrames with standardized values, including the date column
    train_data = pd.DataFrame(train_standardized, columns=train_data.columns)
    val_data = pd.DataFrame(val_standardized, columns=df.columns)
    test_data = pd.DataFrame(test_standardized, columns=test_data.columns)
    
    # Add the date column back to the data
    train_data['date'] = dates.iloc[:train_size].values
    val_data['date'] = dates.iloc[train_size:train_size + val_size].values
    test_data['date'] = dates.iloc[train_size + val_size:].values
     
    # Set 'date' column as index and convert it to datetime format
    train_data['date'] = pd.to_datetime(train_data['date'])
    train_data.set_index('date', inplace=True)

    test_data['date'] = pd.to_datetime(test_data['date'])
    test_data.set_index('date', inplace=True)
    
    val_data['date'] = pd.to_datetime(val_data['date'])
    val_data.set_index('date', inplace=True)
 
    return train_data, val_data, test_data

In [4]:
train_data_all_columns, val_data_all_columns, test_data_all_columns = preprocess_data(df_all_columns)
train_data_most_important_columns, val_data_most_important_columns, test_data_most_important_columns = preprocess_data(df_most_important_columns)
train_data_only_generation_columns, val_data_only_generation_columns, test_data_only_generation_columns = preprocess_data(df_only_generation_columns)

In [5]:
# This is needed because TFT requires one dataset as an input and additional information on where to split up between training and validation
data_all_columns = pd.concat([train_data_all_columns, val_data_all_columns], ignore_index=False)
data_most_important_columns = pd.concat([train_data_most_important_columns, val_data_most_important_columns], ignore_index=False)
data_only_generation_columns = pd.concat([train_data_only_generation_columns, val_data_only_generation_columns], ignore_index=False)

## 2. Time Features

In [6]:
def add_time_features(df):
    # Convert the index to a DateTimeIndex if it's not already
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)
    
    # Extract the desired date components
    hour_of_day = df.index.hour.astype(str).astype("category")
    day_of_month = df.index.day.astype(str).astype("category")
    day_of_year = df.index.dayofyear.astype(str).astype("category")
    month_of_year = df.index.month.astype(str).astype("category")
    week_of_year = df.index.isocalendar().week.astype(str).astype("category")
    day_of_week = df.index.dayofweek.astype(str).astype("category")  # Monday is 0, Sunday is 6
    # Append these Series as new columns in the DataFrame
    df = df.assign(
        hour_of_day=hour_of_day,
        day_of_month=day_of_month,
        day_of_year=day_of_year,
        month_of_year=month_of_year,
        week_of_year=week_of_year,
        day_of_week=day_of_week
    )
    return df

In [7]:
data_all_columns = add_time_features(data_all_columns)
data_most_important_columns = add_time_features(data_most_important_columns)
data_only_generation_columns = add_time_features(data_only_generation_columns)

In [8]:
# Reset the index for data_all_columns
# A time_idx is required by TFT
# TFT requires at least on category for prediction. Since we only have one category/one country we add 'DE' to every row
data_all_columns = data_all_columns.reset_index(drop=True)
data_all_columns['time_idx']=data_all_columns.index
data_all_columns['DE'] = 'DE'

In [9]:
data_most_important_columns = data_most_important_columns.reset_index(drop=True)
data_most_important_columns['time_idx']= data_most_important_columns.index
data_most_important_columns['DE'] = 'DE'

In [10]:
data_only_generation_columns = data_only_generation_columns.reset_index(drop=True)
data_only_generation_columns['time_idx'] = data_only_generation_columns.index
data_only_generation_columns['DE'] = 'DE'

## 3. Experiment Method

In [11]:
def experiment_main():
    # Define the save directory
    save_dir = 'TFT_Results' 
    
    # Create the directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    datasets = [data_all_columns, data_most_important_columns, data_only_generation_columns]
    prediction_lengths = [24, 48, 96, 192]
    
    for dataset in datasets:
        dataset_name = "data_all_columns" if dataset is data_all_columns else "data_most_important_columns" if dataset is data_most_important_columns else "data_only_generation_columns"
        data = dataset
        # List of columns
        columns = [col for col in data.columns if col not in ['DE', 'time_idx','hour_of_day','day_of_month','day_of_year','month_of_year','week_of_year','day_of_week']]
        print(dataset_name)
        for pred_len in prediction_lengths:
            dataset_builder(data, pred_len, columns)

            #network_configuration(dataset)
            #training()
            print(pred_len)

In [12]:
def dataset_builder(data, pred_len, columns):
    
    # Sets the maximum encoder length same across models
    max_encoder_length = 96
    # Ensures the validation data is not used for training
    training_cutoff = data["time_idx"].max() - 4379
    # Create a list of target normalizers for each column
    target_normalizers = [TorchNormalizer(method='identity', center=False, transformation=None, method_kwargs={}) for _ in range(len(columns))]
    
    # Define the training dataset in the TimeSeriesDataSet format
    training = TimeSeriesDataSet(
        data[lambda x: x.time_idx < training_cutoff],
        time_idx="time_idx",
        target=columns,
        group_ids=["DE"],
        min_encoder_length=max_encoder_length,  # keep encoder length long (as it is in the validation set)
        max_encoder_length=max_encoder_length,
        min_prediction_length=pred_len,
        max_prediction_length=pred_len,
        static_categoricals=["DE"],
        time_varying_known_categoricals=['hour_of_day','day_of_month','day_of_year','month_of_year','week_of_year','day_of_week'],
        time_varying_known_reals=["time_idx"],
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=columns,
        # Pytorch-forecasting requires a target_normalizer. However, the data is already normalized to use the same normalization across models. Therefore, this normalizer does not change the data.
        target_normalizer=MultiNormalizer(target_normalizers),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )
    print(training)
    # Define validation dataset with rolling validation approach
    validation = TimeSeriesDataSet.from_dataset(training, data, predict=False, min_prediction_idx=data["time_idx"].max() - 4379, predict_mode=False, stop_randomization=True)
    print(validation)
    
    # Create dataloaders for model
    batch_size = 128  # set this between 32 to 128
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)
    
    # Configure network and trainer
    pl.seed_everything(42)
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
    lr_logger = LearningRateMonitor()  # log the learning rate
    logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

    trainer = pl.Trainer(
        max_epochs=50,
        accelerator="gpu",
        enable_model_summary=True,
        gradient_clip_val=0.1,
        limit_train_batches=50,  # coment in for training, running valiation every 30 batches
        # fast_dev_run=True,  # comment in to check that networkor dataset has no serious bugs
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
    )

    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=0.03,
        hidden_size=16,
        attention_head_size=2,
        dropout=0.1,
        hidden_continuous_size=8,
        loss=MAE(),
        log_interval=10,  # uncomment for learning rate finder and otherwise, e.g. to 10 for logging every 10 batches
        optimizer="adam",
        reduce_on_plateau_patience=4,
    )
    print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")
    
     # Fit network
    trainer.fit(
        tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )
    
    # Define the folder where you want to save the information
    output_folder = "TFT_Results"

    # Define the file path within the folder
    output_file = os.path.join(output_folder, "model_info_multivariate.txt")

    # Create a dictionary to store data
    data = {"pred_len": pred_len, "num_columns": len(columns), "best_model_path": trainer.checkpoint_callback.best_model_path}
    
    # Save the data to the file
    with open(output_file, "a") as file:
        file.write(str(data) + "\n")
        
    # Print the saved information
    print("Model information saved to", output_file)

In [13]:
experiment_main()

data_all_columns


Seed set to 42


TimeSeriesDataSet[length=30368](
	time_idx='time_idx',
	target=['DE_load_actual_entsoe_transparency', 'DE_solar_capacity', 'DE_solar_generation_actual', 'DE_solar_profile', 'DE_wind_capacity', 'DE_wind_generation_actual', 'DE_wind_profile', 'DE_wind_offshore_capacity', 'DE_wind_offshore_generation_actual', 'DE_wind_offshore_profile', 'DE_wind_onshore_capacity', 'DE_wind_onshore_generation_actual', 'DE_wind_onshore_profile', 'DE_50hertz_load_actual_entsoe_transparency', 'DE_50hertz_solar_generation_actual', 'DE_50hertz_wind_generation_actual', 'DE_50hertz_wind_offshore_generation_actual', 'DE_50hertz_wind_onshore_generation_actual', 'DE_amprion_load_actual_entsoe_transparency', 'DE_amprion_solar_generation_actual', 'DE_amprion_wind_onshore_generation_actual', 'DE_tennet_load_actual_entsoe_transparency', 'DE_tennet_solar_generation_actual', 'DE_tennet_wind_generation_actual', 'DE_tennet_wind_offshore_generation_actual', 'DE_tennet_wind_onshore_generation_actual', 'DE_transnetbw_load_actu

Trainer will use only 1 of 3 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=3)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Number of parameters in network: 89.0k


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | MultiLoss                       | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 7.3 K 
3  | prescalers                         | ModuleDict                      | 1.4 K 
4  | static_variable_selection          | VariableSelectionNetwork        | 41.4 K
5  | encoder_variable_selection         | VariableSelectionNetwork        | 23.6 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 2.4 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.1 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.1 K 
9  | static_context_initial_cell_l

Epoch 0: 100%|██████████| 50/50 [01:31<00:00,  0.54it/s, v_num=56, train_loss_step=12.80]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/4 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/4 [00:00<?, ?it/s][A
Validation DataLoader 0:  25%|██▌       | 1/4 [02:07<06:21,  0.01it/s][A
Validation DataLoader 0:  50%|█████     | 2/4 [02:15<02:15,  0.01it/s][A
Validation DataLoader 0:  75%|███████▌  | 3/4 [02:25<00:48,  0.02it/s][A
Validation DataLoader 0: 100%|██████████| 4/4 [02:28<00:00,  0.03it/s][A
Epoch 1: 100%|██████████| 50/50 [01:31<00:00,  0.54it/s, v_num=56, train_loss_step=9.780, val_loss=20.30, train_loss_epoch=17.00]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/4 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/4 [00:00<?, ?it/s][A
Validation DataLoader 0:  25%|██▌       | 1/4 [02:00<06:02,  0.01it/s][A
Validation DataLoader 0:  50%|█████     | 2/4 [02:09<02:09,  0.02it/s][A


## 5. Calculate the Results

### Prepare test dataset

In [53]:
# First before comparing the predictions to the actuals we need to feed a modified version of our test dataset to the modelto get predictions
# In the following part some parameters should be manually adjusted as needed depending on the target
test_dataset = test_data_all_columns #adjust as needed

In [54]:
# Adding the same time features so we can use them as future known categoricals
test_dataset = add_time_features(test_dataset) 

In [55]:
# Needed because TFT needs a group column
test_dataset['DE'] = 'DE'

In [56]:
# This is pytorch-forecasting syntax specific, the time_idx index should be continous so we set it continuing the time_idx index from train+val
test_dataset['time_idx'] = range(30536, 30536 + len(test_dataset)) #this is pytorch-forecasting syntax specific, the time_idx index should be continous so we set it continuing the time_idx index from train+val

In [57]:
# Drop the data index and delete this column
test_dataset=test_dataset.reset_index()
test_dataset=test_dataset.drop(columns='date')  

### Using the test dataset to feed information to the encoder and decoder in a rolling way to get predictions

In [59]:
# Get the columns to know the targets
columns = [col for col in test_dataset.columns if col not in ['DE', 'time_idx','hour_of_day','day_of_month','day_of_year','month_of_year','week_of_year','day_of_week']]

In [None]:
# Load the best model
best_tft = TemporalFusionTransformer.load_from_checkpoint('lightning_logs/lightning_logs/version_53/checkpoints/epoch=12-step=650.ckpt')# adjust as needed
# Define an empty dataframe to append all predictions to
all_df = pd.DataFrame()

max_encoder_length = 96  
max_prediction_length = 48 # adjust as needed
target_name = columns

# We need to stop our predictions max_encoder_length + max_prediction_length before the end of the test dataset otherwise we get errors
addition_var = max_encoder_length + max_prediction_length

for i in range(0, len(test_dataset)-addition_var, max_prediction_length):
    new_pred_collection = []
    # Define the data for the encoder and decoder
    # The encoder contains data from i until the max_encoder_length
    test_dataset_short = test_dataset.iloc[i : i + max_encoder_length]
    encoder_data = test_dataset_short[lambda x: x.time_idx > x.time_idx.max() - max_encoder_length]

    # The decoder contains information (here: known future time features) from the last point of the encoder until the prediction length
    test_dataset_decoder = test_dataset.iloc[i+ max_encoder_length : i + max_encoder_length + max_prediction_length]

    # Set the targets to zero so the model is not spoiled
    for i in columns:
        test_dataset_decoder[i]=0.0
    decoder_data = test_dataset_decoder
    
    # Concatenate encoder and decoder information
    new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)
    
    # Make predictions
    new_raw_predictions = best_tft.predict(new_prediction_data, mode="raw", return_x=True, return_index = True)
  
    # Reformatting and writing the current predictions in the overall all_df
    for i in range (test_dataset.shape[1]-8):
        new_pred = (new_raw_predictions[0][0][i].cpu().detach().numpy())
        new_pred = new_pred.squeeze()
        new_pred_collection.append(new_pred)
        
    local_df = pd.DataFrame(new_pred_collection).transpose()
    all_df = pd.concat([all_df, local_df])

### Calculate MAE and MSE

In [62]:
# The original test_dataset and the all_df with all our predictions do not have the same size yet
# This is for two main reasons 1) The predictions only start after one full length of encoder input of size (96), therefore we cut off the first 96 rows
# Additionally since we stop max_encoder_length + max_prediction_length steps before the end of test dataset due to pytorch-forecasting TFT implementation,
# we need to stop early and cut-off in the end as well 

def calc_mse_mae(test_dataset, all_df):
    columns = [col for col in test_dataset.columns if col not in ['DE', 'time_idx','hour_of_day','day_of_month','day_of_year','month_of_year','week_of_year','day_of_week']]
    # Cut off at the start
    test_dataset_2=test_dataset[96:]

    # Cutting at the end
    length_all_df = len(all_df)
    length_test_dataset_2 = len(test_dataset_2)
    difference_to_substract = length_test_dataset_2-length_all_df
    test_dataset_2 = test_dataset_2.iloc[:-difference_to_substract] 
    # Only keep the targets in the test dataset
    test_dataset_2= test_dataset_2[columns]
  
    all_df_tensor = torch.tensor(all_df.values)
    torch_test_dataset = torch.tensor(test_dataset_2.values)

    print(torch_test_dataset.size())
    print(all_df_tensor.size())

    mae = F.l1_loss(all_df_tensor, torch_test_dataset)
    # Print the MAE
    print("MAE:", mae)

    # Calculate the MSE
    mse = F.mse_loss(all_df_tensor, torch_test_dataset)
    # Print the MSE
    print("MSE:", mse)

In [63]:
calc_mse_mae(test_dataset, all_df)

torch.Size([8640, 29])
torch.Size([8640, 29])
MAE: tensor(0.9574, dtype=torch.float64)
MSE: tensor(2.0916, dtype=torch.float64)
