# Model 3: Transformer Model

In [1]:
from pathlib import Path
import numpy as np
import torch 

In [2]:
# Put the project source root first on the import path so that the `models` and `utils`
# packages imported by the following cells can be resolved.
import sys; sys.path.insert(0, '/mnt/src')

In [3]:
from models.transformer import TransformerEncoderModel, train

## Create Trajectory dataset from dataframe

In [4]:
from utils.file_io import read_trajectory_datasets

In [5]:
# The three cable property columns are needed twice (as model inputs and as
# columns to normalise), so they are named once and reused below.
_cable_property_columns = [
    'cable1_property(length,youngsmodule(bend,twist))',
    'cable2_property(length,youngsmodule(bend,twist))',
    'cable3_property(length,youngsmodule(bend,twist))',
]

# Input columns: the seven left-boom joint columns followed by the cable property columns.
feature_columns = [
    'left_boom_base_yaw_joint',
    'left_boom_base_pitch_joint',
    'left_boom_main_prismatic_joint',
    'left_boom_second_roll_joint',
    'left_boom_second_yaw_joint',
    'left_boom_top_pitch_joint',
    'left_boom_ee_joint',
    *_cable_property_columns,
]

# Label columns, each paired with an int64 index array.
# NOTE(review): the index array presumably selects components of the column value
# inside read_trajectory_datasets — confirm against utils.file_io.
label_features = [
    (column, np.array([1], dtype=np.int64))
    for column in ('cable1_lowest_point', 'cable2_lowest_point', 'cable3_lowest_point')
]

# Columns handed to the loader as `normalized_features`, each paired with the
# int64 indices [1, 2] (same hedge as above regarding what the indices select).
normalized_features = [
    (column, np.array([1, 2], dtype=np.int64))
    for column in _cable_property_columns
]

In [6]:
# Folder that is handed to the loader as the data source.
data_folder = Path("/mnt/data").absolute()

# The three positional numbers (0.8, 0, 0.2) are passed straight to the loader.
# Only the first and third returned datasets are kept.
# NOTE(review): presumably these are the train and validation splits matching the
# 0.8 / 0.2 fractions — confirm the return order in utils.file_io.
train_set, _, validation_set, _ = read_trajectory_datasets(
    data_folder,
    0.8,
    0,
    0.2,
    window_size=256,
    feature_columns=feature_columns,
    label_features=label_features,
    normalized_features=normalized_features,
)

Reading .csv files: 1it [00:00,  2.53it/s]
  return np.where(x_max != x_min, (features - x_min) / (x_max - x_min), 1).astype(dtype=np.float32)
  return np.where(x_max != x_min, (features - x_min) / (x_max - x_min), 1).astype(dtype=np.float32)


Preprocessing dataframe
Reshaping dataframe for learning


In [7]:
# Model input / output widths used by the hyperparameter search below.
# NOTE(review): 16 presumably corresponds to the 7 joint values plus 3 values for each
# of the 3 cables, and 3 to one label per cable — confirm against the loaded dataset.
input_shape = 16
output_shape = 3
print(f"Data shape {input_shape} / {output_shape} of total {len(train_set) + len(validation_set)} data rows!")

Data shape 16 / 3 of total 40 data rows!


## Load parameters, functions and dataloaders

In [8]:
# Output folder for the tuning artefacts (the trial result CSV is written here later).
tune_path = Path("/mnt/models/transformer/tune").absolute()
# Create the folder, including missing parents; an already existing folder is fine.
tune_path.mkdir(parents=True, exist_ok=True)

## Train the model with Optuna hyperparameter tuning

In [9]:
from ray import tune, train as ray_train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from torch import nn
from typing import Dict
from utils.cluster import attach_ray, disconnect_ray
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.loss_functions import get_loss_function
from torch.utils.data import Dataset, DataLoader
import random; random.seed(0)

In [10]:
def parameter_train(parameter: Dict, train_epochs: int, train_set: Dataset, validation_set: Dataset, model_input_shape: int,
                    model_output_shape: int) -> None:
    """Train one TransformerEncoderModel with the hyperparameters of a single tuning trial.

    Args:
        parameter: Sampled hyperparameters. Keys read here: "batch_size", "activation",
            "model_dim_num_heads_projection" (a ``(model_dim, num_heads,
            projection_num_neighbors)`` triple), "feedforward_dim", "num_encoder_layer",
            "transformer_dropout", "pos_encoder_dropout", "optimizer" and "warmup_steps".
        train_epochs: Number of epochs handed to ``train``.
        train_set: Dataset wrapped in the (shuffled) training DataLoader.
        validation_set: Dataset wrapped in the (shuffled) validation DataLoader.
        model_input_shape: Currently not used by this function; kept so that existing
            callers keep working.
        model_output_shape: Passed to the model as ``output_dim``.

    Returns:
        None. The value returned by ``train`` is discarded.
        NOTE(review): metrics are presumably reported to Ray Tune from inside ``train``
        because it is called with ``tune=True`` — confirm in models.transformer.
    """

    # Determine the device on the worker that actually runs this trial
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if device != "cuda":
        print("No cuda device found!")

    train_dataloader = DataLoader(train_set, batch_size=parameter["batch_size"], shuffle = True)
    validation_dataloader = DataLoader(validation_set, batch_size=parameter["batch_size"], shuffle = True)

    activation = get_activation(parameter["activation"])

    # Model dimension, head count and projection neighbours are sampled together as one
    # triple. The search space has no standalone "model_dim" entry, so every use of the
    # model dimension below must come from this triple.
    model_dim, num_heads, projection_num_neighbors = parameter["model_dim_num_heads_projection"]

    model = TransformerEncoderModel(
        num_heads = num_heads,
        model_dim = model_dim,
        feedforward_hidden_dim = parameter["feedforward_dim"],
        num_encoder_layers = parameter["num_encoder_layer"],
        output_dim = model_output_shape,
        transformer_dropout = parameter["transformer_dropout"],
        pos_encoder_dropout = parameter["pos_encoder_dropout"],
        # A neighbour count of 0 means "no down-projection"
        downprojection = projection_num_neighbors != 0,
        projection_num_neighbors = projection_num_neighbors,
        activation = activation
    )

    # The model needs to be on the training device before the optimizer is instantiated
    model.to(device)

    # NOTE(review): the literal 1 is presumably a base learning rate that the scheduler
    # rescales — confirm in utils.optimizer.
    optimizer = get_optimizer_function(parameter["optimizer"], model, 1)
    # Previously read parameter["model_dim"], a key that is never sampled (KeyError).
    lr_scheduler = get_learning_rate_scheduler(optimizer, model_dim, parameter["warmup_steps"])
    loss_function = get_loss_function()

    _ = train(train_epochs, train_dataloader, validation_dataloader, model, loss_function, optimizer, lr_scheduler, None, device, report_interval=50, tune=True)

In [11]:
# Number of hyperparameter configurations to sample (passed to TuneConfig as num_samples)
num_samples = 2000
# Epochs per trial; also used as max_t of the ASHA scheduler
num_epochs = 750
# Passed to the ASHA scheduler as grace_period
grace_period = 5

In [12]:
# Candidate neighbour counts for the down-projection variant.
n_neighbors = [3, 5, 7, 10, 15]
# Candidate model dimensions: the input width and the output width.
model_dim = [input_shape, output_shape]

# Build every (model_dim, num_heads, projection_num_neighbors) triple to sample from.
model_dim_params = []
for dim in model_dim:
    # Head counts are all divisors of the model dimension except 1.
    head_options = [divisor for divisor in range(2, dim + 1) if dim % divisor == 0]

    # Only the dimension equal to the output width is combined with the neighbour
    # counts; any other dimension gets the single placeholder 0.
    neighbor_options = n_neighbors if dim == output_shape else [0]

    model_dim_params.extend(
        (dim, heads, neighbors)
        for heads in head_options
        for neighbors in neighbor_options
    )

In [13]:
# Candidate feed-forward widths: `num_values` points evenly spaced in log2 space
# between 256 and 2048 (both included), each truncated to an integer.
num_values = 10
start, end = np.log2(256), np.log2(2048)

feedforward_dim = []
for step in range(num_values):
    exponent = start + step / (num_values - 1) * (end - start)
    feedforward_dim.append(int(2 ** exponent))

In [14]:
# Search space handed to the tuner; every key is read by parameter_train.
parameter_space = {
    # Dropout rates, sampled log-uniformly between 0.05 and 0.6
    "pos_encoder_dropout": tune.loguniform(0.05, 0.6, base = 2),
    "transformer_dropout": tune.loguniform(0.05, 0.6, base = 2),
    # Even layer counts 2, 4, ..., 10
    "num_encoder_layer": tune.choice(list(range(2, 10 + 1, 2))),
    # One of the pre-computed log-spaced widths
    "feedforward_dim": tune.choice(feedforward_dim),
    # 64, 80, ..., 240
    "batch_size": tune.choice(list(range(64, 256, 16))),
    
    # Combined (model_dim, num_heads, projection_num_neighbors) triple so that only
    # the pre-built combinations are sampled
    "model_dim_num_heads_projection": tune.choice(model_dim_params),
    "optimizer": tune.choice(["adam", "adamw"]),
    "activation": tune.choice(["relu", "gelu"]),
    # 1000, 1200, ..., 3800
    "warmup_steps": tune.choice(list(range(1000, 4000, 200))),
}

In [15]:
# ASHA trial scheduler that minimises the reported "loss". A trial runs for at most
# max_t (= num_epochs) iterations and gets at least grace_period iterations before
# it can be stopped early.
scheduler = ASHAScheduler(
    metric = "loss",
    mode = "min",
    max_t = num_epochs,
    grace_period = grace_period
)

In [16]:
# Optuna-backed search algorithm that proposes configurations minimising the reported "loss".
# Python's `random` module is seeded elsewhere in this notebook, but Optuna's sampler has
# its own random state; seed it as well so the suggestions do not change between runs
# for that reason alone.
search_alg = OptunaSearch(
    metric = "loss",
    mode = "min",
    seed = 0
)

In [17]:
# Connect the notebook to Ray through the project helper (in the recorded run below this
# started a local Ray instance and printed the available resources).
# NOTE(review): the meaning of manager=False is defined in utils.cluster — confirm there.
attach_ray(manager = False)

2023-12-12 16:48:55,263	INFO worker.py:1673 -- Started a local Ray instance.
2023-12-12 16:48:55,449	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/notebooks/../utils'.
2023-12-12 16:48:55,714	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_a350e7b4e294648b.zip' (0.07MiB) to Ray cluster...
2023-12-12 16:48:55,716	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_a350e7b4e294648b.zip'.
2023-12-12 16:48:55,798	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/models'.
2023-12-12 16:48:55,906	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_6289afc7074cc6ea.zip' (0.03MiB) to Ray cluster...
2023-12-12 16:48:55,907	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_6289afc7074cc6ea.zip'.


{'hostname': 'md3zx1nc-mbp-runner', 'resources': {'node:__internal_head__': 1.0, 'CPU': 8.0, 'object_store_memory': 4475621376.0, 'node:172.17.0.2': 1.0, 'memory': 8951242752.0}}


In [18]:
# Resources requested per trial. The GPU share is fractional so that several trials can
# share one device; it falls back to 0 when the notebook process sees no CUDA device.
# See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
trial_resources = {"cpu": 3, "gpu": 0.25 if torch.cuda.is_available() else 0}

# Wrap the training function so that Ray only has to supply the sampled configuration;
# epochs, datasets and shapes are bound from the notebook variables.
ray_resources_manager = tune.with_resources(
    trainable=lambda param: parameter_train(param, num_epochs, train_set, validation_set, input_shape, output_shape),
    resources=trial_resources,
)

search_config = tune.TuneConfig(
    scheduler=scheduler,
    search_alg=search_alg,
    num_samples=num_samples,
)
experiment_config = ray_train.RunConfig(name="transformer_encoder")

tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=search_config,
    run_config=experiment_config,
)

In [19]:
# Run the hyperparameter search and keep the returned results for the cells below.
results = tuner.fit()

0,1
Current time:,2023-12-12 16:49:23
Running for:,00:00:26.31
Memory:,3.7/15.5 GiB

Trial name,# failures,error file
lambda_f9aa04c7,1,"/root/ray_results/transformer_encoder/lambda_f9aa04c7_1_activation=relu,batch_size=208,feedforward_dim=322,model_dim_num_heads_projection=3_3_15,num_encoder_layer=6,opt_2023-12-12_16-48-57/error.txt"
lambda_5a89ee19,1,"/root/ray_results/transformer_encoder/lambda_5a89ee19_2_activation=relu,batch_size=112,feedforward_dim=1625,model_dim_num_heads_projection=3_3_15,num_encoder_layer=4,op_2023-12-12_16-49-07/error.txt"

Trial name,status,loc,activation,batch_size,feedforward_dim,model_dim_num_heads_ projection,num_encoder_layer,optimizer,pos_encoder_dropout,transformer_dropout,warmup_steps
lambda_d877e8cf,PENDING,,gelu,80,2048,"(3, 3, 7)",6,adamw,0.217021,0.221131,3200
lambda_f9aa04c7,ERROR,172.17.0.2:23099,relu,208,322,"(3, 3, 15)",6,adamw,0.0615668,0.0540211,3000
lambda_5a89ee19,ERROR,172.17.0.2:23163,relu,112,1625,"(3, 3, 15)",4,adam,0.137261,0.146587,3800


[36m(<lambda> pid=23099)[0m No cuda device found!
[36m(<lambda> pid=23099)[0m torch.Size([1024, 3])
[36m(<lambda> pid=23099)[0m torch.Size([2])


2023-12-12 16:49:07,223	ERROR tune_controller.py:1383 -- Trial task failed for trial lambda_f9aa04c7
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2563, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(IndexError): [36mray::ImplicitFunc.train()[39m (pid=23099, ip=172.17.0.2, actor_id=d1f555bc09e7acbfeacd8c4401000000, repr=<lambda>)
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/trainable/trainable.py", line 342, in train
    raise skipped from exception_cause(sk

[36m(<lambda> pid=23163)[0m No cuda device found!
[36m(<lambda> pid=23163)[0m torch.Size([2])[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m


2023-12-12 16:49:24,030	ERROR tune.py:1043 -- Trials did not complete: [lambda_f9aa04c7, lambda_5a89ee19]
2023-12-12 16:49:24,031	INFO tune.py:1047 -- Total run time: 26.70 seconds (26.30 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/ray_results/transformer_encoder", trainable=...)
- lambda_d877e8cf: FileNotFoundError('Could not fetch metrics for lambda_d877e8cf: both result.json and progress.csv were not found at /root/ray_results/transformer_encoder/lambda_d877e8cf_3_activation=gelu,batch_size=80,feedforward_dim=2048,model_dim_num_heads_projection=3_3_7,num_encoder_layer=6,opti_2023-12-12_16-49-15')


In [None]:
# Release the Ray connection through the project helper now that tuning has finished.
# NOTE(review): exact shutdown behaviour is defined in utils.cluster — confirm there.
disconnect_ray()

In [None]:
# Save as csv file
result_grid = results.get_dataframe()
result_grid.to_csv(tune_path / "trail_grid_.csv")

In [None]:
# Pick the trial with the smallest reported loss.
# idxmin() returns an index *label*, so the row is selected with the label-based .loc
# rather than the position-based .iloc; the two only coincide for a default RangeIndex.
best_result = result_grid.loc[result_grid['loss'].idxmin()].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Report the best loss, followed by every entry of the best trial whose key
# contains 'config' (the sampled hyperparameters).
print(f"Best trail by loss value {best_result['loss']}", "\n------")
config_keys = [key for key in best_result if 'config' in key]
for key in config_keys:
    print(f"Best trail: {key} value {best_result[key]}")