# Model 3: Transformer Model

In [1]:
from pathlib import Path
import numpy as np
import torch 

In [2]:
# Put /mnt/src at the front of the module search path so it takes precedence over installed packages.
# NOTE(review): the `models.*` and `utils.*` imports below presumably resolve from there - confirm.
import sys; sys.path.insert(0, '/mnt/src')

In [3]:
from models.transformer import TransformerEncoderOnly, train

KeyboardInterrupt: 

## Create Trajectory dataset from dataframe

In [None]:
from utils.file_io import read_trajectory_datasets

In [None]:
# Joint columns of the left boom.
joint_columns = [
    'left_boom_base_yaw_joint',
    'left_boom_base_pitch_joint',
    'left_boom_main_prismatic_joint',
    'left_boom_second_roll_joint',
    'left_boom_second_yaw_joint',
    'left_boom_top_pitch_joint',
    'left_boom_ee_joint',
]

# Per-cable property columns; they are model inputs and are also the columns that get normalized.
cable_property_columns = [
    'cable1_property(length,youngsmodule(bend,twist))',
    'cable2_property(length,youngsmodule(bend,twist))',
    'cable3_property(length,youngsmodule(bend,twist))',
]

# Model inputs: all joint columns followed by the cable property columns.
feature_columns = joint_columns + cable_property_columns

# Prediction targets as (column name, index array) pairs.
# NOTE(review): the index array presumably selects components of the column value - confirm
# against read_trajectory_datasets.
label_features = [
    (column, np.array([1], dtype=np.int64))
    for column in ('cable1_lowest_point', 'cable2_lowest_point', 'cable3_lowest_point')
]

# Columns to normalize as (column name, index array) pairs; each pair gets its own array.
normalized_features = [
    (column, np.array([1, 2], dtype=np.int64))
    for column in cable_property_columns
]

In [None]:
# Root folder holding the trajectory data.
data_folder = Path("/mnt/data").absolute()

# Keyword options forwarded unchanged to the dataset reader.
dataset_options = dict(
    window_size=256,
    feature_columns=feature_columns,
    label_features=label_features,
    normalized_features=normalized_features,
)

# The three positional fractions are passed in the reader's own order; the fourth return value is discarded.
# NOTE(review): which fraction belongs to which split is defined by read_trajectory_datasets - confirm.
train_set, test_set, validation_set, _ = read_trajectory_datasets(data_folder, 0.8, 0.05, 0.15, **dataset_options)

In [None]:
# Model input/output width: one entry per feature column and per label feature.
input_shape = len(feature_columns)
output_shape = len(label_features)

# Total number of samples across all three splits.
total_rows = sum(len(subset) for subset in (train_set, test_set, validation_set))
print(f"Data shape {input_shape} / {output_shape} of total {total_rows} data rows!")

## Load parameters, functions and dataloaders

In [None]:
from utils.file_io import define_dataloader_from_subset

In [None]:
# Output directory for this notebook's tuning artefacts (the result CSV is written here later).
tune_path = Path("/mnt/models/transformer/tune").absolute()
# Create the directory including missing parents; do nothing if it already exists.
tune_path.mkdir(parents=True, exist_ok=True)

## Train the model with Optuna hyperparameter tuning

In [None]:
from ray import tune, train as ray_train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from torch import nn
from typing import Dict
from utils.cluster import attach_ray, disconnect_ray
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.loss_functions import get_loss_function
from torch.utils.data import Dataset, DataLoader
import random; random.seed(0)

In [None]:
def parameter_train(parameter: Dict, train_epochs: int, train_set: Dataset, validation_set: Dataset, model_input_shape: int,
                    model_output_shape: int) -> None:
    """Train one ``TransformerEncoderOnly`` model for a single sampled hyperparameter configuration.

    Intended to be used as the trainable of a Ray Tune trial; metric reporting is delegated to
    ``train`` (called with ``tune=True``).

    Args:
        parameter: Sampled configuration. Keys read here: ``"batch_size"``, ``"activation"``,
            ``"model_dim_num_heads_n_neighbors"`` (a ``(model_dim, num_heads, n_neighbors)`` triple),
            ``"feedforward_dim"``, ``"num_encoder_layer"``, ``"transformer_dropout"``,
            ``"pos_encoder_dropout"``, ``"optimizer"`` and ``"warmup_steps"``.
        train_epochs: Number of epochs forwarded to ``train``.
        train_set: Dataset wrapped by the training ``DataLoader``.
        validation_set: Dataset wrapped by the validation ``DataLoader``.
        model_input_shape: Currently unused; kept so existing callers keep working.
        model_output_shape: Currently unused; kept so existing callers keep working.

    Raises:
        KeyError: If one of the configuration keys listed above is missing.
    """

    # Determine the device on the worker that actually runs this trial
    device = ("cuda" if torch.cuda.is_available() else "cpu")

    if device != "cuda":
        print("No cuda device found!")

    train_dataloader = DataLoader(train_set, batch_size=parameter["batch_size"], shuffle = True)
    validation_dataloader = DataLoader(validation_set, batch_size=parameter["batch_size"], shuffle = True)

    # The three values are sampled jointly so that num_heads always divides model_dim.
    # The search space has no separate "model_dim" key, so model_dim must come from this triple.
    model_dim, num_heads, neighbor_count = parameter["model_dim_num_heads_n_neighbors"]

    activation = get_activation(parameter["activation"])
    # Only the model_dim == 9 configurations enable the down-projection
    downprojection = model_dim == 9
    # NOTE(review): argument order (num_heads before model_dim) is kept exactly as before -
    # confirm it matches the TransformerEncoderOnly constructor.
    model = TransformerEncoderOnly(num_heads, model_dim, parameter["feedforward_dim"],
                                   parameter["num_encoder_layer"], parameter["transformer_dropout"],
                                   parameter["pos_encoder_dropout"], downprojection, neighbor_count, activation)

    # The model needs to be on the training device before the optimizer is instantiated
    model.to(device)

    # NOTE(review): the literal 1 is presumably a base learning rate that the scheduler rescales - confirm.
    optimizer = get_optimizer_function(parameter["optimizer"], model, 1)
    lr_scheduler = get_learning_rate_scheduler(optimizer, model_dim, parameter["warmup_steps"])
    loss_function = get_loss_function()

    _ = train(train_epochs, train_dataloader, validation_dataloader, model, loss_function, optimizer, lr_scheduler, None, device, report_interval=50, tune=True)

In [None]:
# Number of hyperparameter configurations to sample (passed to TuneConfig as num_samples)
num_samples = 2000
# Epoch budget per trial: forwarded to parameter_train as train_epochs and used as the scheduler's max_t
num_epochs = 750
# Passed to ASHAScheduler as grace_period
grace_period = 5

In [17]:
# Candidate neighbor counts (only swept for model_dim == 9) and candidate model dimensions.
n_neighbors = [3, 5, 7, 10, 15]
model_dim = [15, 9]

# Build every valid (model_dim, num_heads, n_neighbors) combination.
model_dim_params = []

for dim in model_dim:
    # Head counts must divide the model dimension; 1 and dim itself are excluded.
    head_counts = [divisor for divisor in range(2, dim) if dim % divisor == 0]

    # Any model_dim other than 9 uses a single neighbor.
    neighbor_options = n_neighbors if dim == 9 else [1]

    model_dim_params.extend(
        (dim, heads, neighbors)
        for heads in head_counts
        for neighbors in neighbor_options
    )

In [None]:
# Search space sampled by Ray Tune; every key is read by parameter_train.
parameter_space = {
    "pos_encoder_dropout": tune.loguniform(0.05, 0.6, base = 2),
    "transformer_dropout": tune.loguniform(0.05, 0.6, base = 2),
    "num_encoder_layer": tune.choice(list(range(1, 10, 2))),
    # Powers of two from 256 to 2048. np.logspace takes *exponents* as start/stop, so the former
    # np.logspace(256, 2048, base=2) asked for 2**256 .. 2**2048, which overflows and gives
    # meaningless int32 values. Plain Python ints also avoid numpy scalars in the trial config.
    # NOTE(review): this yields 4 candidates instead of the 50 (invalid) ones produced before.
    "feedforward_dim": tune.choice([2 ** exponent for exponent in range(8, 12)]),
    "batch_size": tune.choice(list(range(64, 256, 16))),
    # (model_dim, num_heads, n_neighbors) triples are sampled jointly so only valid combinations occur
    "model_dim_num_heads_n_neighbors": tune.choice(model_dim_params),
    "optimizer": tune.choice(["adam", "adamw"]),
    "activation": tune.choice(["relu", "gelu"]),
    "warmup_steps": tune.choice(list(range(1000, 4000, 200))),
}

In [None]:
# ASHA trial scheduler tracking the reported "loss" metric (lower is better).
# max_t caps a trial at num_epochs; per the Ray Tune docs, grace_period is the minimum
# number of iterations a trial runs before it may be stopped early.
scheduler = ASHAScheduler(
    metric = "loss",
    mode = "min",
    max_t = num_epochs,
    grace_period = grace_period
)

In [None]:
# Optuna-backed search algorithm; it proposes configurations that minimize the reported "loss" metric.
search_alg = OptunaSearch(
    metric = "loss",
    mode = "min"
) 

In [None]:
# Connect to Ray through the project helper from utils.cluster.
# NOTE(review): the exact meaning of manager=True is defined in that helper (not visible here).
attach_ray(manager = True)

In [None]:
# Wrap the trainable with its per-trial resource request.
# The lambda binds the datasets and shapes so Ray only has to supply the sampled configuration.
ray_resources_manager = tune.with_resources(
    trainable=lambda param: parameter_train(param, num_epochs, train_set, validation_set, input_shape, output_shape),
    # A fractional GPU request lets several trials share one device (0.25 -> up to four per GPU).
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    # NOTE(review): CUDA availability is checked in the notebook process, not on the workers.
    resources={ "cpu": 3, "gpu": 0.25 if torch.cuda.is_available() else 0 }
)

# Tuner combining the search space, the ASHA scheduler and the Optuna search algorithm.
tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        search_alg=search_alg,
        num_samples=num_samples
    ),
    run_config = ray_train.RunConfig(
        name = "transformer_encoder"
    )
)

In [None]:
# Run the hyperparameter search; `results` is exported and inspected in the cells below.
results = tuner.fit()

In [None]:
# Counterpart of attach_ray from utils.cluster; presumably releases the Ray connection (confirm in the helper).
disconnect_ray()

In [None]:
# Convert the tuning results to a DataFrame and save it as a CSV file inside tune_path.
# NOTE(review): the file name spells "trail" (sic); left unchanged because other tooling may expect this exact name.
result_grid = results.get_dataframe()
result_grid.to_csv(tune_path / "trail_grid_.csv")

In [None]:
# Pick the trial with the smallest reported loss.
# idxmin() returns an index *label*, so the row is fetched with .loc; the positional .iloc
# only gave the same row while the frame had a default 0..n-1 index.
best_row_label = result_grid['loss'].idxmin()
best_result = result_grid.loc[best_row_label].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Headline: loss of the best trial, followed by a separator line.
print(f"Best trail by loss value {best_result['loss']}", "\n------")

# Print only the entries whose key mentions 'config', i.e. the sampled hyperparameters.
config_entries = ((name, value) for name, value in best_result.items() if 'config' in name)
for name, value in config_entries:
    print(f"Best trail: {name} value {value}")