# Model 3: Transformer Model

In [None]:
from pathlib import Path
import numpy as np
import torch 

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using {device} device")

In [None]:
import sys; sys.path.insert(0, '../')

In [None]:
from models.transformer import TransformerEncoderOnly, get_loss_function, get_optimizer_function

## Read Datasets from .csv

In [None]:
from utils.file_io import read_trajectory_datasets

In [None]:
# Directory the trajectory datasets are read from (relative to the notebook).
data_folder = Path("../../data/")

# Load all four dataset splits in a single call.
# NOTE(review): the four fractions sum to 1.0 and presumably are the split
# ratios in the same order as the returned sets; the meaning of 256 is not
# visible here — confirm against utils.file_io.read_trajectory_datasets.
train_set, test_set, validation_set, visualization_set = read_trajectory_datasets(
    data_folder,
    0.85,
    0.10,
    0.045,
    0.005,
    256,
    normalize_features=True,
)

In [None]:
# FIXME: Total loaded size correct?
# Input / output widths that are later handed to the model constructor.
input_shape = 8
output_shape = 3

# Add up the lengths of all four splits to report the overall row count.
total_rows = sum(len(split) for split in (train_set, test_set, validation_set, visualization_set))
print(f"Data shape {input_shape} / {output_shape} of total {total_rows} data rows!")

## Load parameters, functions and dataloaders

In [None]:
import os

from torch.utils.data import DataLoader
from dotenv import load_dotenv
from torch import nn

from utils.file_io import define_dataloader_from_subset
from utils.evaluation import compute_loss_on
from utils.optimizer import rate

In [None]:
# Absolute path of the transformer model directory, anchored at the current
# working directory (the ".." segments are kept, not resolved).
model_path = Path.cwd() / "../../models/transformer/"

In [None]:
# The default hyperparameters are kept in a .env file inside the model directory.
dotenv_path = model_path / ".env"
load_dotenv(dotenv_path=dotenv_path)

# Index os.environ directly: a missing variable then fails with a KeyError that
# names it, instead of the opaque TypeError raised by float(None) / int(None).
learning_rate = float(os.environ["LEARNING_RATE"])
batch_size = int(os.environ["BATCH_SIZE"])
num_epochs = int(os.environ["NUM_EPOCHS"])

In [None]:
# Build the train / validation / test dataloaders with the batch size that was
# read from the .env file; shuffle=True is forwarded to the helper.
train_dataloader, validation_dataloader, test_dataloader = define_dataloader_from_subset(
    train_set,
    validation_set,
    test_set,
    batch_size=batch_size,
    shuffle=True,
)

## Define train methods

In [None]:
from ray import train as ray_train

In [None]:
def train_epoch(train_dataloader: DataLoader, model, loss_function, optimizer, lr_scheduler,
                device: torch.device, report_interval: int = 1000):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        train_dataloader: Iterable yielding ``(inputs, true_values)`` batches of
            3-D tensors.
        model: Module invoked as ``model(inputs)``.
        loss_function: Callable ``loss_function(outputs, true_values)`` that
            returns a scalar tensor.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, right after the optimizer.
        device: Device every batch is moved to before the forward pass.
        report_interval: Number of batches per reporting window.

    Returns:
        The mean loss of the last complete reporting window as a plain float.
        When the loader yields fewer than ``report_interval`` batches, the mean
        over all batches seen is returned instead; ``0.0`` for an empty loader.
    """
    running_loss = 0.0
    last_loss = 0.0
    batches_in_window = 0
    window_reported = False

    for i, (inputs, true_values) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        true_values = true_values.to(device)

        # NOTE(review): view() keeps the underlying element order, so this is a
        # reshape and not a transpose of the first two axes. If a real axis swap
        # is intended, transpose(0, 1) would be needed — confirm together with
        # the evaluation helper (compute_loss_on) before changing it.
        inputs_shape, true_values_shape = inputs.size(), true_values.size()
        inputs = inputs.view(inputs_shape[1], inputs_shape[0], inputs_shape[2])
        true_values = true_values.view(true_values_shape[1], true_values_shape[0], true_values_shape[2])

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, true_values)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Accumulate a Python float: adding the loss tensor itself would keep
        # every batch's autograd graph alive and return a graph-attached tensor.
        running_loss += loss.item()
        batches_in_window += 1

        if i % report_interval == report_interval - 1:
            last_loss = running_loss / report_interval
            window_reported = True

            print(f"batch {i + 1}, Mean Squared Error: {last_loss}")

            running_loss = 0.0
            batches_in_window = 0

    # No complete window was reached: report the mean of what was seen rather
    # than a misleading zero.
    if not window_reported and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss

In [None]:
def train(epochs: int, train_dataloader: DataLoader, validation_dataloader: DataLoader, model: nn.Module, 
          loss_function, optimizer, lr_scheduler, checkpoint_path: Path, device: torch.device = 'cpu', report_interval: int = 128, tune: bool = False) -> nn.Module:
    """Train ``model`` until it has seen ``epochs`` epochs, checkpointing along the way.

    Training resumes from ``checkpoint_path / "checkpoint.pt"`` when that file
    exists and starts counting at ``model.total_epochs``.

    Args:
        epochs: Target value for ``model.total_epochs``.
        train_dataloader: Batches used for optimisation.
        validation_dataloader: Batches used for the validation loss.
        model: Module to train; must expose an integer ``total_epochs`` attribute.
        loss_function: Loss passed to the training and evaluation helpers.
        optimizer: Optimizer passed to ``train_epoch``.
        lr_scheduler: Scheduler passed to ``train_epoch``.
        checkpoint_path: Directory that holds ``checkpoint.pt``.
        device: Device the model is moved to and the checkpoint is mapped onto.
        report_interval: Reporting window forwarded to ``train_epoch``.
        tune: When true, console output is suppressed, a checkpoint is written
            every epoch and the validation loss is reported to Ray.

    Returns:
        The trained, unwrapped module (the same object that was passed in).
    """
    best_val_loss = float("inf")

    checkpoint_file = checkpoint_path / "checkpoint.pt"

    # Keep a handle on the unwrapped module: nn.DataParallel does not forward
    # custom attributes such as total_epochs, and saving through the wrapper
    # would prefix every state-dict key with "module.".
    base_model = model
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(base_model)

    model.to(device)

    if checkpoint_file.exists():
        # map_location lets a checkpoint written on a GPU be restored on a CPU-only host.
        model_state = torch.load(checkpoint_file, map_location=device)
        base_model.load_state_dict(model_state)

    for epoch in range(base_model.total_epochs, epochs):
        if not tune:
            print(f"Epoch: {epoch + 1}")

        model.train(True)
        avg_loss = train_epoch(train_dataloader, model, loss_function, optimizer, lr_scheduler, device, report_interval)
        model.eval()

        with torch.no_grad():
            avg_val_loss = compute_loss_on(validation_dataloader, model, loss_function, device=device)

        if not tune:
            print(f"Loss on train: {avg_loss}, loss on validation: {avg_val_loss}")

        base_model.total_epochs += 1

        # Keep only the best weights in normal runs; tuning runs save every epoch.
        if avg_val_loss < best_val_loss or tune:
            best_val_loss = avg_val_loss

            torch.save(base_model.state_dict(), checkpoint_file)

        if tune:
            ray_train.report(metrics={ "loss": float(avg_val_loss) })

    return base_model

## Train the model with Optuna hyperparameter tuning

In [None]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from typing import Dict
from utils.evaluation import compute_sliding_window_predictions, compute_losses_from
from utils.cluster import attach_ray, disconnect_ray

In [None]:
def parameter_train(parameter: Dict, train_epochs: int, train_data: DataLoader, validation_data: DataLoader, model_input_shape: int,
                    model_output_shape: int, checkpoint_path: Path) -> None:
    """Train one Ray Tune trial with the sampled hyperparameters.

    Args:
        parameter: Sampled configuration; the keys ``model_dim``,
            ``feedforward_dim``, ``encoder_layer``, ``transformer_dropout``,
            ``pos_encoder_dropout`` and ``lr`` are read.
        train_epochs: Number of epochs handed to ``train``.
        train_data: Training dataloader.
        validation_data: Validation dataloader.
        model_input_shape: Input width passed to the model constructor.
        model_output_shape: Output width passed to the model constructor.
        checkpoint_path: Parent directory; a sub-directory named after the
            trial id is created for this trial's checkpoint.

    Raises:
        FileExistsError: If the trial's checkpoint directory already exists.
    """
    # Determine the device on the actual worker used for the trial
    device = "cuda" if torch.cuda.is_available() else "cpu"

    run_id = ray_train.get_context().get_trial_id()
    run_checkpoint = checkpoint_path / run_id
    run_checkpoint.mkdir(parents=True)

    model = TransformerEncoderOnly(model_input_shape, parameter["model_dim"], parameter["feedforward_dim"], model_output_shape,
                                   parameter["encoder_layer"], parameter["transformer_dropout"], parameter["pos_encoder_dropout"])

    optimizer = get_optimizer_function(model, parameter["lr"])
    loss_function = get_loss_function()

    # train() steps a scheduler after every batch, so one has to be supplied.
    # A constant factor of 1.0 leaves the sampled learning rate untouched.
    # Assumes get_optimizer_function returns a torch.optim.Optimizer.
    # TODO(review): swap in the warm-up schedule from utils.optimizer.rate if
    # that is what the trials are meant to use.
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda _step: 1.0)

    # Keyword arguments keep scheduler, checkpoint directory and device from
    # silently shifting into each other's positional slots.
    train(train_epochs, train_data, validation_data, model, loss_function, optimizer,
          lr_scheduler=lr_scheduler, checkpoint_path=run_checkpoint, device=device,
          report_interval=50, tune=True)

In [None]:
# Half-width of the learning-rate window explored around the configured rate.
learning_rate_radius = 0.001
# Half-width of the batch-size window explored around the configured size.
batch_size_radius = 10
# Number of trials requested from the tuner.
num_samples = 100

In [None]:
# Search space sampled by Ray Tune for every trial.
#
# "lr" is drawn log-uniformly from a window of +/- learning_rate_radius around
# the configured learning rate. The lower bound used to be written with its
# operands swapped (radius - lr), which became non-positive as soon as
# lr >= radius; loguniform needs a strictly positive lower bound, so the window
# is clamped to a small positive floor.
# NOTE(review): the 1e-6 floor is an arbitrary choice — adjust if needed.
#
# "batch_size" candidates are clamped so the range never starts at a
# non-positive value.
# NOTE(review): parameter_train does not read "batch_size"; the dataloaders are
# built once outside the trial, so this entry currently has no effect.
parameter_space = {
    "lr": tune.loguniform(max(learning_rate - learning_rate_radius, 1e-6), learning_rate + learning_rate_radius),
    "batch_size": tune.choice(list(range(max(1, batch_size - batch_size_radius), batch_size + batch_size_radius, 4))),
    "model_dim": tune.choice([8]),
    "feedforward_dim": tune.choice([32]),
    "encoder_layer": tune.choice([1, 2, 3]),
    "transformer_dropout": tune.uniform(0.1, 0.5),
    "pos_encoder_dropout": tune.uniform(0.1, 0.5),
}

In [None]:
# Trial scheduler: minimises the reported "loss" metric, with max_t set to the
# configured number of epochs.
scheduler = ASHAScheduler(metric="loss", mode="min", max_t=num_epochs)

In [None]:
# Optuna-backed search algorithm, also minimising the reported "loss" metric.
search_alg = OptunaSearch(metric="loss", mode="min")

In [None]:
# Attach this notebook to Ray via the project helper before building the tuner.
attach_ray(manager=True)

In [None]:
# Bind the fixed arguments of parameter_train so that Ray only has to supply
# the sampled configuration, and declare the resources each trial requests.
ray_resources_manager = tune.with_resources(
    trainable=lambda config: parameter_train(
        config,
        num_epochs,
        train_dataloader,
        validation_dataloader,
        input_shape,
        output_shape,
        model_path,
    ),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={"cpu": 3, "gpu": 0.25},
)

# Combine trainable, search space, scheduler and search algorithm into one tuner.
tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        search_alg=search_alg,
        num_samples=num_samples,
    ),
)

In [None]:
# Run the hyperparameter search; the returned results feed the analysis cells below.
results = tuner.fit()

In [None]:
# Presumably the counterpart of the earlier attach_ray(...) call — see utils.cluster.
disconnect_ray()

In [None]:
# Convert the tuning results into a DataFrame and persist it as a CSV file
# inside the model directory.
result_grid = results.get_dataframe()
result_grid.to_csv(model_path.joinpath("trail_grid_.csv"))

In [None]:
# Pick the row with the smallest loss. idxmin() returns an index *label*, so the
# row has to be fetched with the label-based .loc rather than the positional
# .iloc (the two only coincide for a default 0..n-1 index).
best_result = result_grid.loc[result_grid['loss'].idxmin()].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Report the best trial: its loss first, then every entry whose key contains 'config'.
print(f"Best trail by loss value {best_result['loss']}", "\n------")
for key, value in best_result.items():
    if 'config' not in key:
        continue
    print(f"Best trail: {key} value {value}")

In [None]:
# NOTE(review): y_true is not defined anywhere in the visible part of this
# notebook; unless it is defined elsewhere, this cell raises NameError.
print(y_true)