# Model 4.2: Encoder-Decoder Model

In [1]:
import torch
import numpy as np

In [2]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
# Put the project source root first on the module search path so that the
# local 'models' and 'utils' packages can be imported by the cells below.
import sys

sys.path.insert(0, '/mnt/src')

## Load ParallelTrajectoriesDatasets from Pretraining

In [4]:
from pathlib import Path

In [5]:
# Locations of the serialized train / validation splits.
data_path = Path("/mnt/models/two_stage/hpo/data/")
train_path = data_path / "train_set.pt"
validation_path = data_path / "validation_set.pt"

# NOTE(review): torch.load relies on pickle, so only load dataset files that
# come from a trusted source.
train_set, validation_set = (
    torch.load(split_path) for split_path in (train_path, validation_path)
)

In [6]:
# Use the first training sample to derive the dimensions needed further down.
features, labels = train_set[0]
print(features.shape, labels.shape)

# Size of the last axis of the inputs and of the labels.
input_shape = features.shape[-1]
output_shape = labels.shape[-1]

# Size of the first axis of a single sample (1 in the recorded output).
num_parallel_trajectories = features.shape[0]

print(f"Data shape {input_shape} / {output_shape} of total {len(train_set) + len(validation_set)} data rows!")

torch.Size([1, 256, 16]) torch.Size([1, 256, 3])
Data shape 16 / 3 of total 40 data rows!


## Train the best parallel encoder model

## Load parameters, functions and dataloaders

In [7]:
import os
import ast

from dotenv import load_dotenv
from torch.utils.data import DataLoader

In [8]:
# Directory of the encoder model: its .env file supplies the hyperparameters
# read below, and the path is also handed to the encoder training call.
encoder_model_path = Path("/mnt/models/two_stage/encoder")

In [9]:
# Load the encoder hyperparameters stored next to the encoder model.
dotenv_path = encoder_model_path / ".env"
load_dotenv(dotenv_path=dotenv_path)


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name`` or fail with a clear error.

    ``os.getenv`` yields ``None`` for a missing key, which would otherwise show
    up later as an opaque ``TypeError`` from ``int(None)`` / ``float(None)`` or
    as a silently missing optimizer / activation name.

    Raises:
        KeyError: if ``name`` is not set in the environment.
    """
    value = os.getenv(name)
    if value is None:
        raise KeyError(f"Required setting {name!r} not found in the environment or in {dotenv_path}")
    return value


batch_size = int(_require_env("BATCH_SIZE"))
optimizer = _require_env("OPTIMIZER")
activation = _require_env("ACTIVATION")
num_encoder_layer = int(_require_env("NUM_ENCODER_LAYER"))
pos_encoder_dropout = float(_require_env("POS_ENCODER_DROPOUT"))
transformer_dropout = float(_require_env("TRANSFORMER_DROPOUT"))
feedforward_dim = int(_require_env("FEEDFORWARD_DIM"))
warmup_steps = int(_require_env("WARMUP_STEPS"))
# Stored as a Python literal; elements [0], [1] and [2] are used further down
# as model dimension, number of heads and down-projection name.
model_dim_num_heads_projection = ast.literal_eval(_require_env("MODEL_DIM_NUM_HEADS_PROJECTION"))
num_epochs = int(_require_env("NUM_EPOCHS"))

In [10]:
# Both loaders use the batch size from the encoder settings and shuffle their data.
loader_options = {"batch_size": batch_size, "shuffle": True}
train_dataloader = DataLoader(train_set, **loader_options)
validation_dataloader = DataLoader(validation_set, **loader_options)

## Load and Train the model

In [11]:
from models.parallel_encoder_model import ParallelEncoderModel
from models import parallel_encoder_model
from utils.loss_functions import get_loss_function
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.file_io import load_downprojection
from utils.early_stopping import EarlyStopping

In [12]:
# Resolve the activation by name straight from the environment instead of
# feeding the `activation` variable back into itself: with the self-referential
# form, re-running this cell would pass the value returned by get_activation
# (no longer the name string) into get_activation again.
activation = get_activation(os.getenv("ACTIVATION"))

# Element [2] of MODEL_DIM_NUM_HEADS_PROJECTION selects the saved down-projection
# file inside the "downprojections" directory next to the encoder directory.
downprojection_path = encoder_model_path.parent / "downprojections" / f"{model_dim_num_heads_projection[2]}_projection.sav"
downprojection = load_downprojection(downprojection_path)

In [13]:
# Element [0] of the settings tuple is the model width (also used as the output
# width), element [1] the number of attention heads.
encoder_model_dim = model_dim_num_heads_projection[0]
encoder_num_heads = model_dim_num_heads_projection[1]

encoder = ParallelEncoderModel(
    num_decoders=num_parallel_trajectories,
    num_heads=encoder_num_heads,
    model_dim=encoder_model_dim,
    feedforward_hidden_dim=feedforward_dim,
    output_dim=encoder_model_dim,
    num_encoder_layers=num_encoder_layer,
    transformer_dropout=transformer_dropout,
    pos_encoder_dropout=pos_encoder_dropout,
    activation=activation,
    projection_function=downprojection,
)



In [14]:
# Look the optimizer name up from the environment rather than rebinding the
# `optimizer` variable from itself: with the self-referential form, re-running
# this cell would pass an optimizer object where the name string is expected.
# NOTE(review): the positional 1 is presumably a base learning rate / factor
# that the scheduler scales — confirm against utils.optimizer.
optimizer = get_optimizer_function(os.getenv("OPTIMIZER"), encoder, 1)

# The scheduler is parameterised by the model dimension and the warmup steps.
lr_scheduler = get_learning_rate_scheduler(optimizer, model_dim_num_heads_projection[0], warmup_steps)
loss_function = get_loss_function()

In [15]:
# NOTE(review): 10 is presumably the patience (evaluations without improvement
# before training stops) — confirm against utils.early_stopping.
early_stopping = EarlyStopping(10)

In [16]:
# Train the encoder; only the second returned value (validation losses) is kept.
# NOTE(review): the recorded output of this cell reports a train loss of 0 for
# every epoch while the validation loss barely moves — verify the training
# loop and its loss reporting before trusting this encoder.
_, validation_losses = parallel_encoder_model.train(
    num_epochs,
    train_dataloader,
    validation_dataloader,
    encoder,
    loss_function,
    optimizer,
    lr_scheduler,
    encoder_model_path,
    device,
    early_stopping=early_stopping,
)

Epoch: 1
Loss on train: 0, loss on validation: 0.7826835513114929
Epoch: 2
Loss on train: 0, loss on validation: 0.7828028202056885
Epoch: 3
Loss on train: 0, loss on validation: 0.7830025553703308
Epoch: 4
Loss on train: 0, loss on validation: 0.7832890748977661
Epoch: 5
Loss on train: 0, loss on validation: 0.7835689783096313
Epoch: 6
Loss on train: 0, loss on validation: 0.783723771572113
Epoch: 7
Loss on train: 0, loss on validation: 0.7837678790092468
Epoch: 8
Loss on train: 0, loss on validation: 0.7834810614585876
Epoch: 9
Loss on train: 0, loss on validation: 0.7833141684532166
Epoch: 10
Loss on train: 0, loss on validation: 0.7832285761833191


## Train the combined model with optuna hyperparameter tuning

In [17]:
# Output locations for the decoder stage; tuning artefacts live in a "tune"
# sub-directory of the decoder directory.
decoder_model_path = Path("/mnt/models/two_stage/decoder/")
tune_path = decoder_model_path / "tune"

# Create the decoder directory (with any missing ancestors) before its child.
decoder_model_path.mkdir(parents=True, exist_ok=True)
tune_path.mkdir(exist_ok=True)

In [18]:
from models.transformer import TransformerEncoderModel
from models.parallel_decoder_model import TransformerDecoderModel, TransformerModel
from models import parallel_decoder_model

from ray import tune, train as ray_train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from typing import Dict
from utils.cluster import attach_ray, disconnect_ray
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.loss_functions import get_loss_function
from torch.utils.data import Dataset, DataLoader
# Seed Python's built-in RNG only; NumPy and PyTorch are not seeded here.
import random

random.seed(0)

In [19]:
def parameter_train(parameter: Dict, train_epochs: int, train_set: Dataset, validation_set: Dataset,
                    encoder: TransformerEncoderModel, device: torch.device) -> None:
    """Train one encoder-decoder candidate for a single hyperparameter trial.

    Builds both dataloaders, a ``TransformerDecoderModel`` and the combined
    ``TransformerModel`` from the sampled ``parameter`` values, then hands
    everything to ``parallel_decoder_model.train`` with ``tune=True``.

    Args:
        parameter: Sampled configuration. Keys read here: "batch_size",
            "activation", "model_dim" (elements [0] and [1]), "feedforward_dim",
            "num_decoder_layer", "transformer_dropout", "optimizer" and
            "warmup_steps".
        train_epochs: Number of epochs passed to the training routine.
        train_set: Dataset wrapped by the training dataloader.
        validation_set: Dataset wrapped by the validation dataloader.
        encoder: Encoder combined with the freshly built decoder; its
            ``pos_encoder`` is also handed to the decoder.
        device: Not used; the device is detected again inside the function
            because the trial runs on whichever worker executes it.

    NOTE(review): the search space in this notebook also samples
    "pos_encoder_dropout", which is never read in this function.
    """
    # Detect the device on the worker that actually runs the trial.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device != "cuda":
        print("No cuda device found!")

    train_loader = DataLoader(train_set, batch_size=parameter["batch_size"], shuffle=True)
    validation_loader = DataLoader(validation_set, batch_size=parameter["batch_size"], shuffle=True)

    activation_fn = get_activation(parameter["activation"])

    # "model_dim" holds a (model dimension, number of heads) pair.
    decoder = TransformerDecoderModel(
        model_dim=parameter["model_dim"][0],
        num_heads=parameter["model_dim"][1],
        feedforward_dim=parameter["feedforward_dim"],
        num_decoder_layers=parameter["num_decoder_layer"],
        pos_encoder=encoder.pos_encoder,
        transformer_dropout=parameter["transformer_dropout"],
        activation=activation_fn,
    )
    model = TransformerModel(encoder, decoder)

    # Move the model to the training device before the optimizer is created.
    model.to(device)

    trial_optimizer = get_optimizer_function(parameter["optimizer"], model, 1)
    trial_scheduler = get_learning_rate_scheduler(
        trial_optimizer, parameter["model_dim"][0], parameter["warmup_steps"]
    )
    trial_loss = get_loss_function()

    # NOTE(review): the positional None sits where the encoder training call
    # passes a model path — presumably "do not save"; confirm.
    parallel_decoder_model.train(
        train_epochs,
        train_loader,
        validation_loader,
        model,
        trial_loss,
        trial_optimizer,
        trial_scheduler,
        None,
        device,
        report_interval=50,
        tune=True,
    )

In [20]:
# Budget of the hyperparameter search.
# Number of trials requested from the tuner.
num_samples = 2_000
# Passed both as the per-trial epoch count and as the scheduler's max_t;
# this replaces the NUM_EPOCHS value read from the encoder settings earlier.
num_epochs = 750
# Passed to the ASHA scheduler as its grace_period.
grace_period = 5

In [21]:
model_dim_params = []

# NOTE(review): n_neighbors is not referenced anywhere else in this notebook;
# it is kept only in case other code relies on the name.
n_neighbors = [3, 5, 7, 10, 15]
model_dim = [output_shape]

# Collect each (model dimension, number of heads) pair exactly once, where the
# head count is a divisor >= 2 of the model dimension (presumably because
# multi-head attention needs the dimension to be divisible by the head count).
# The pairs are added after all divisors of a dimension are known; appending
# inside the divisor search would add the same pair again on every later step.
for d in model_dim:
    num_heads = [i for i in range(2, d + 1) if d % i == 0]
    model_dim_params.extend((d, h) for h in num_heads)

In [22]:
# Candidate feed-forward widths: num_values points spaced evenly in log2 space
# between 256 and 2048 (both included), truncated to integers.
start, end = np.log2(256), np.log2(2048)
num_values = 10

feedforward_dim = []
for i in range(num_values):
    exponent = start + i / (num_values - 1) * (end - start)
    feedforward_dim.append(int(2 ** exponent))

In [23]:
# Search space handed to the tuner; each key is read from the sampled config
# inside the trial function unless noted otherwise.
parameter_space = {
    # (model dimension, number of heads) pairs built above.
    "model_dim": tune.choice(model_dim_params),
    # 64, 80, ..., 240
    "batch_size": tune.choice(list(range(64, 256, 16))),
    # 1000, 1200, ..., 3800
    "warmup_steps": tune.choice(list(range(1000, 4000, 200))),
    # Log2-spaced widths computed above.
    "feedforward_dim": tune.choice(feedforward_dim),
    # 1, 3, 5, 7, 9
    "num_decoder_layer": tune.choice(list(range(1, 10 + 1, 2))),
    # Sampled log-uniformly between 0.05 and 0.6.
    "transformer_dropout": tune.loguniform(0.05, 0.6, base = 2),
    # NOTE(review): sampled but not read by parameter_train in this notebook,
    # so it currently has no effect on a trial.
    "pos_encoder_dropout": tune.loguniform(0.05, 0.6, base = 2),
    "optimizer": tune.choice(["adam", "adamw"]),
    "activation": tune.choice(["relu", "gelu"]),
}

In [24]:
# ASHA trial scheduler: minimise the reported "loss", cap each trial at
# num_epochs and apply the configured grace period.
scheduler = ASHAScheduler(metric="loss", mode="min", max_t=num_epochs, grace_period=grace_period)

In [25]:
# Optuna-backed search algorithm that proposes configurations minimising "loss".
search_alg = OptunaSearch(metric="loss", mode="min")

In [26]:
# Connect to Ray before tuning; in the recorded run this started a local Ray
# instance and pushed the local 'utils' and 'models' packages to it.
attach_ray(manager=False)

2023-12-28 18:28:10,262	INFO worker.py:1724 -- Started a local Ray instance.
2023-12-28 18:28:10,269	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/utils'.
2023-12-28 18:28:10,274	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_9b2abe3b353f6177.zip' (0.11MiB) to Ray cluster...
2023-12-28 18:28:10,276	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_9b2abe3b353f6177.zip'.
2023-12-28 18:28:10,278	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/models'.
2023-12-28 18:28:10,281	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_394f94149c4f4a5c.zip' (0.06MiB) to Ray cluster...
2023-12-28 18:28:10,282	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_394f94149c4f4a5c.zip'.


{'hostname': 'ajay-desktop-mbp-runner', 'resources': {'memory': 17563164672.0, 'object_store_memory': 8781582336.0, 'node:172.17.0.2': 1.0, 'CPU': 12.0, 'node:__internal_head__': 1.0, 'accelerator_type:G': 1.0, 'GPU': 1.0}}


In [27]:
# Wrap the trial function so that every trial reserves 6 CPUs and a quarter of
# a GPU. The lambda binds the fixed arguments; only the sampled config is
# supplied per trial.
ray_resources_manager = tune.with_resources(
    trainable=lambda params: parameter_train(params, num_epochs, train_set, validation_set, encoder.encoder, device),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={"cpu": 6, "gpu": 0.25},
)

# Scheduler, search algorithm and trial budget defined in the cells above.
tune_config = tune.TuneConfig(
    scheduler=scheduler,
    search_alg=search_alg,
    num_samples=num_samples,
)

tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune_config,
)

In [28]:
# Run the hyperparameter search and keep the resulting grid of trial results.
# NOTE(review): the recorded run ended after about 271 s with one trial still
# PENDING, so the stored output does not reflect a completed search.
results = tuner.fit()

0,1
Current time:,2023-12-28 18:32:39
Running for:,00:04:26.53
Memory:,8.1/31.2 GiB

Trial name,status,loc,activation,batch_size,feedforward_dim,model_dim,num_decoder_layer,optimizer,pos_encoder_dropout,transformer_dropout,warmup_steps,iter,total time (s),loss
lambda_c2d2c45f,RUNNING,172.17.0.2:191322,relu,96,1625,"(3, 3)",5,adam,0.415468,0.20306,3400,13.0,245.253,3715.37
lambda_d0b1e059,RUNNING,172.17.0.2:191395,relu,128,406,"(3, 3)",1,adam,0.284546,0.30987,1200,13.0,238.206,3709.41
lambda_4385ab9f,PENDING,,relu,160,1024,"(3, 3)",7,adamw,0.0954385,0.206881,2800,,,


[36m(<lambda> pid=191322)[0m tensor([-0.3647, -1.0009,  1.3657], device='cuda:0', grad_fn=<SliceBackward0>) tensor([1.7523, 2.9143, 1.2329], device='cuda:0')
[36m(<lambda> pid=191395)[0m tensor([-1.2530,  0.0586,  1.1944], device='cuda:0', grad_fn=<SliceBackward0>) tensor([1.5580, 2.7970, 1.2671], device='cuda:0')
[36m(<lambda> pid=191322)[0m tensor([ 0.2525, -1.3313,  1.0788], device='cuda:0', grad_fn=<SliceBackward0>) tensor([2.3529, 3.8126, 2.3083], device='cuda:0')
[36m(<lambda> pid=191395)[0m tensor([-1.2723,  0.1015,  1.1708], device='cuda:0', grad_fn=<SliceBackward0>) tensor([0.7864, 1.6669, 0.4593], device='cuda:0')
[36m(<lambda> pid=191395)[0m tensor([-1.1085, -0.2061,  1.3147], device='cuda:0', grad_fn=<SliceBackward0>) tensor([1.2922, 2.3488, 1.0424], device='cuda:0')[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-dedu

2023-12-28 18:32:42,652	INFO tune.py:1042 -- Total run time: 271.08 seconds (266.52 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/ray_results/lambda_2023-12-28_18-28-11", trainable=...)
- lambda_4385ab9f: FileNotFoundError('Could not fetch metrics for lambda_4385ab9f: both result.json and progress.csv were not found at /root/ray_results/lambda_2023-12-28_18-28-11/lambda_4385ab9f_3_activation=relu,batch_size=160,feedforward_dim=1024,model_dim=3_3,num_decoder_layer=7,optimizer=adamw,pos_encode_2023-12-28_18-28-37')


In [None]:
# Release the Ray connection opened before tuning.
disconnect_ray()

In [None]:
# Persist the per-trial results table as a CSV file inside the tune directory.
result_grid = results.get_dataframe()
csv_path = tune_path / "trail_grid_.csv"
result_grid.to_csv(csv_path)

In [None]:
# Pick the trial with the smallest reported loss.
# idxmin() returns an index *label*, so the row must be selected with the
# label-based .loc; positional .iloc only gives the same row while the frame
# happens to carry a default 0..n-1 index.
best_result = result_grid.loc[result_grid['loss'].idxmin()].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Report the best loss, followed by every entry of the best trial whose key
# contains 'config' (the sampled hyperparameters).
print(f"Best trail by loss value {best_result['loss']}", "\n------")
config_keys = [key for key in best_result if 'config' in key]
for key in config_keys:
    print(f"Best trail: {key} value {best_result[key]}")