# Model 4.2: Encoder-Decoder Model

In [None]:
import torch
import numpy as np

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Put '/mnt/src' at the front of the module search path so the project-local
# packages (e.g. 'utils') can be imported by the cells below.
import sys

sys.path.insert(0, '/mnt/src')

## Load ParallelTrajectoriesDatasets from Pretraining

In [None]:
from pathlib import Path

In [None]:
# Locations of the serialized training and validation sets.
data_path = Path("/mnt/models/two_stage/hpo/data/")
train_path = data_path.joinpath("train_set.pt")
validation_path = data_path.joinpath("validation_set.pt")

# NOTE(review): `torch.load` unpickles the files, so only load them from a trusted
# location. Newer PyTorch releases default to `weights_only=True`, which may reject
# pickled dataset objects — confirm against the installed version.
train_set, validation_set = torch.load(train_path), torch.load(validation_path)

In [None]:
# Inspect the first sample to derive the dimensions used when building the models.
features, labels = train_set[0]
print(features.shape, labels.shape)

# The leading axis of the features gives the number of parallel trajectories,
# the trailing axes give the feature / label widths.
num_parallel_trajectories = features.shape[0]
input_shape = features.shape[-1]
output_shape = labels.shape[-1]
print(f"Data shape {input_shape} / {output_shape} of total {len(train_set) + len(validation_set)} data rows!")

## Train the best parallel encoder model

## Load parameters, functions and dataloaders

In [None]:
import os
import ast

from dotenv import load_dotenv
from torch.utils.data import DataLoader

In [None]:
# Directory of the encoder model: the `.env` file with its hyperparameters is read from
# here, and the same path is passed to `parallel_encoder_model.train` further down.
encoder_model_path = Path("/mnt/models/two_stage/encoder")

In [None]:
# Load the encoder hyperparameters from the `.env` file stored in the encoder directory.
dotenv_path = encoder_model_path / ".env"
load_dotenv(dotenv_path=dotenv_path)

# Index `os.environ` directly instead of calling `os.getenv`: a missing variable then
# fails right here with a KeyError that names it, rather than with an opaque
# `int(None)` / `float(None)` TypeError (or, for the string settings, a silent `None`
# that only breaks a later cell).
batch_size = int(os.environ["BATCH_SIZE"])
optimizer = os.environ["OPTIMIZER"]
activation = os.environ["ACTIVATION"]
num_encoder_layer = int(os.environ["NUM_ENCODER_LAYER"])
pos_encoder_dropout = float(os.environ["POS_ENCODER_DROPOUT"])
transformer_dropout = float(os.environ["TRANSFORMER_DROPOUT"])
feedforward_dim = int(os.environ["FEEDFORWARD_DIM"])
warmup_steps = int(os.environ["WARMUP_STEPS"])
# Stored as a Python literal; later cells index it as
# [0] = model dimension, [1] = number of heads, [2] = projection name.
model_dim_num_heads_projection = ast.literal_eval(os.environ["MODEL_DIM_NUM_HEADS_PROJECTION"])
num_epochs = int(os.environ["NUM_EPOCHS"])

In [None]:
# Both loaders share the batch size loaded from the `.env` file and shuffle their data.
# NOTE(review): shuffling the validation set is unusual — confirm it is intended.
loader_options = dict(batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(train_set, **loader_options)
validation_dataloader = DataLoader(validation_set, **loader_options)

## Load and Train the model

In [None]:
from models.parallel_encoder_model import ParallelEncoderModel
from models import parallel_encoder_model
from utils.loss_functions import get_loss_function
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.file_io import load_downprojection
from utils.early_stopping import EarlyStopping

In [None]:
# Resolve the activation name loaded from the `.env` file via `get_activation`.
# NOTE: this rebinds `activation`, so re-running only this cell passes the previous
# result (not the name) back into `get_activation`.
activation = get_activation(activation)

# The down-projection file lives in a `downprojections` directory beside the encoder
# directory and is named after the third entry of `model_dim_num_heads_projection`.
downprojection_path = encoder_model_path.parent.joinpath(
    "downprojections",
    f"{model_dim_num_heads_projection[2]}_projection.sav",
)
downprojection = load_downprojection(downprojection_path)

In [None]:
# Build the parallel encoder from the hyperparameters loaded from the `.env` file.
# `model_dim_num_heads_projection` is indexed as [0] = model dimension and
# [1] = number of attention heads; the output dimension is set equal to the model
# dimension, and `num_decoders` is the number of parallel trajectories in one sample.
encoder = ParallelEncoderModel(
    num_decoders=num_parallel_trajectories,
    num_heads=model_dim_num_heads_projection[1],
    model_dim=model_dim_num_heads_projection[0],
    feedforward_hidden_dim=feedforward_dim,
    output_dim=model_dim_num_heads_projection[0],
    num_encoder_layers=num_encoder_layer,
    transformer_dropout=transformer_dropout,
    pos_encoder_dropout=pos_encoder_dropout,
    activation=activation,
    projection_function=downprojection
)

In [None]:
# Create the optimizer, its learning-rate scheduler and the loss for the encoder training.
# NOTE: `optimizer` is rebound from the name loaded from the `.env` file to the object
# returned by `get_optimizer_function`; re-running only this cell passes that object,
# not the name, back into `get_optimizer_function`.
# NOTE(review): the meaning of the literal `1` (third argument) is not visible here —
# presumably a base learning rate that the scheduler rescales; confirm in `utils.optimizer`.
optimizer = get_optimizer_function(optimizer, encoder, 1)
# The scheduler receives the model dimension and the number of warm-up steps.
lr_scheduler = get_learning_rate_scheduler(optimizer, model_dim_num_heads_projection[0], warmup_steps)
loss_function = get_loss_function()

In [None]:
# Early-stopping helper handed to the encoder training below.
# NOTE(review): `10` is presumably the patience (epochs without improvement) —
# confirm in `utils.early_stopping`.
early_stopping = EarlyStopping(10)

In [None]:
# Train the encoder. Of the two values returned, only the second (the validation
# losses) is kept. `encoder_model_path` is passed along as the model directory.
_, validation_losses = parallel_encoder_model.train(
    num_epochs,
    train_dataloader,
    validation_dataloader,
    encoder,
    loss_function,
    optimizer,
    lr_scheduler,
    encoder_model_path,
    device,
    early_stopping=early_stopping,
)

## Train the combined model with optuna hyperparameter tuning

In [None]:
# Output directory of the decoder model and, inside it, the directory for tuning results.
decoder_model_path = Path("/mnt/models/two_stage/decoder/")
tune_path = decoder_model_path / "tune"

# Create both directories (including missing parents of the decoder directory);
# directories that already exist are left untouched.
decoder_model_path.mkdir(parents=True, exist_ok=True)
tune_path.mkdir(exist_ok=True)

In [None]:
from models.transformer import TransformerEncoderModel
from models.parallel_decoder_model import TransformerDecoderModel, TransformerModel
from models import parallel_decoder_model

from ray import tune, train as ray_train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from typing import Dict
from utils.cluster import attach_ray, disconnect_ray
from utils.optimizer import get_optimizer_function, get_learning_rate_scheduler
from utils.activation import get_activation
from utils.loss_functions import get_loss_function
from torch.utils.data import Dataset, DataLoader
import random; random.seed(0)

In [None]:
def parameter_train(parameter: Dict, train_epochs: int, train_set: Dataset, validation_set: Dataset,
                    encoder: TransformerEncoderModel, device: torch.device) -> None:
    """Run one hyperparameter trial: build a decoder, combine it with ``encoder`` and train.

    Args:
        parameter: Sampled trial configuration. The keys read here are
            ``batch_size``, ``activation``, ``model_dim`` (indexed as
            ``[0]`` = model dimension, ``[1]`` = number of heads),
            ``feedforward_dim``, ``num_decoder_layer``, ``transformer_dropout``,
            ``optimizer`` and ``warmup_steps``.
        train_epochs: Number of epochs handed to ``parallel_decoder_model.train``.
        train_set: Dataset wrapped in the training ``DataLoader``.
        validation_set: Dataset wrapped in the validation ``DataLoader``.
        encoder: Encoder combined with the new decoder; its ``pos_encoder`` is
            also handed to the decoder.
        device: Ignored. The device is detected again inside the function, on
            the worker that executes the trial.

    Returns:
        None. The value returned by ``parallel_decoder_model.train`` is discarded;
        presumably metrics are reported from inside that call (``tune=True``) —
        confirm in ``parallel_decoder_model``.
    """
    # Detect the device on the worker that actually runs this trial.
    worker_device = "cuda" if torch.cuda.is_available() else "cpu"
    if worker_device != "cuda":
        print("No cuda device found!")

    trial_batch_size = parameter["batch_size"]
    train_loader = DataLoader(train_set, batch_size=trial_batch_size, shuffle=True)
    validation_loader = DataLoader(validation_set, batch_size=trial_batch_size, shuffle=True)

    activation_fn = get_activation(parameter["activation"])

    # "model_dim" holds a (model dimension, number of heads) pair.
    trial_model_dim = parameter["model_dim"][0]
    trial_num_heads = parameter["model_dim"][1]

    decoder = TransformerDecoderModel(
        model_dim=trial_model_dim,
        num_heads=trial_num_heads,
        feedforward_dim=parameter["feedforward_dim"],
        num_decoder_layers=parameter["num_decoder_layer"],
        pos_encoder=encoder.pos_encoder,
        transformer_dropout=parameter["transformer_dropout"],
        activation=activation_fn,
    )
    combined_model = TransformerModel(encoder, decoder)

    # Move the model to the training device before the optimizer is instantiated.
    combined_model.to(worker_device)

    trial_optimizer = get_optimizer_function(parameter["optimizer"], combined_model, 1)
    trial_scheduler = get_learning_rate_scheduler(trial_optimizer, trial_model_dim, parameter["warmup_steps"])
    criterion = get_loss_function()

    # `None` is passed in the position where the encoder training passes its model path.
    parallel_decoder_model.train(
        train_epochs,
        train_loader,
        validation_loader,
        combined_model,
        criterion,
        trial_optimizer,
        trial_scheduler,
        None,
        worker_device,
        report_interval=50,
        tune=True,
    )

In [None]:
# Number of trials requested from the tuner (`TuneConfig(num_samples=...)`).
num_samples = 2000
# Epoch budget per trial: used both as the scheduler's `max_t` and as `train_epochs`.
# NOTE: this overwrites the `num_epochs` value loaded from the encoder `.env` file.
num_epochs = 750
# Passed to the ASHA scheduler as its `grace_period`.
grace_period = 5

In [None]:
# Candidate (model dimension, number of heads) pairs for the search space.
model_dim_params = []

# Not used in this cell; kept because other cells may rely on it.
n_neighbors = [3, 5, 7, 10, 15]
# The decoder's model dimension is fixed to the label width of the dataset.
model_dim = [output_shape]

for d in model_dim:
    # Every divisor of `d` from 2 up to `d` itself is a candidate head count.
    num_heads = [i for i in range(2, d + 1) if d % i == 0]
    # Add each pair exactly once. Previously this step sat inside the divisor search
    # loop, so all head counts found so far were re-appended for every candidate `i`.
    # The duplicates skewed `tune.choice` towards small head counts.
    model_dim_params.extend((d, h) for h in num_heads)

In [None]:
# Candidate feed-forward widths: `num_values` points spaced evenly in log2 space
# between 256 and 2048 (both included), truncated to integers.
# NOTE: this rebinds `feedforward_dim` from the single value loaded for the encoder
# to a list of candidates.
num_values = 10
start, end = np.log2(256), np.log2(2048)
feedforward_dim = [
    int(2 ** (start + i / (num_values - 1) * (end - start)))
    for i in range(num_values)
]

In [None]:
# Search space sampled by Ray Tune for every trial of the decoder training.
parameter_space = {
    # (model dimension, number of heads) pairs built above.
    "model_dim": tune.choice(model_dim_params),
    # 64, 80, ..., 240
    "batch_size": tune.choice(list(range(64, 256, 16))),
    # 1000, 1200, ..., 3800
    "warmup_steps": tune.choice(list(range(1000, 4000, 200))),
    "feedforward_dim": tune.choice(feedforward_dim),
    # 1, 3, 5, 7, 9
    "num_decoder_layer": tune.choice(list(range(1, 10 + 1, 2))),
    "transformer_dropout": tune.loguniform(0.05, 0.6, base = 2),
    # NOTE(review): sampled, but `parameter_train` never reads this key — the decoder
    # reuses `encoder.pos_encoder` instead. Confirm whether it should be dropped or wired in.
    "pos_encoder_dropout": tune.loguniform(0.05, 0.6, base = 2),
    "optimizer": tune.choice(["adam", "adamw"]),
    "activation": tune.choice(["relu", "gelu"]),
}

In [None]:
# ASHA trial scheduler that minimizes the reported "loss" metric; `max_t` is the
# per-trial epoch budget and `grace_period` is taken from the constant defined above.
scheduler = ASHAScheduler(
    metric = "loss",
    mode = "min",
    max_t = num_epochs,
    grace_period = grace_period
)

In [None]:
# Optuna-backed search algorithm, configured to minimize the same "loss" metric
# as the scheduler.
search_alg = OptunaSearch(
    metric="loss",
    mode="min"
) 

In [None]:
# Connect to Ray via the project helper before building and running the tuner.
# NOTE(review): `use_cluster = True` presumably attaches to a remote cluster rather
# than starting a local instance — confirm in `utils.cluster`.
attach_ray(use_cluster = True)

In [None]:
# Wrap the training function with the resources each trial requests. The fractional
# GPU request lets several trials share one GPU.
# NOTE(review): the lambda captures the datasets and the encoder by closure;
# `tune.with_parameters` may be preferable for large objects — confirm against the
# installed Ray version.
ray_resources_manager = tune.with_resources(
    trainable=lambda params: parameter_train(params, num_epochs, train_set, validation_set, encoder.encoder, device),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={"cpu": 6, "gpu": 0.25},
)

# Tuner over the search space, driven by the scheduler and search algorithm defined above.
tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        search_alg=search_alg,
        num_samples=num_samples,
    ),
)

In [None]:
# Run the hyperparameter search; `results` is read below via `get_dataframe()`.
results = tuner.fit()

In [None]:
# Disconnect from Ray via the project helper now that the search has finished.
disconnect_ray()

In [None]:
# Convert the tuning results into a DataFrame and export it as a CSV file
# into the tuning directory.
result_grid = results.get_dataframe()
result_grid.to_csv(tune_path / "trail_grid_.csv")

In [None]:
# Select the row with the smallest value in the `loss` column. `idxmin()` returns an
# index *label*, so the row is looked up with the label-based `.loc`; the positional
# `.iloc` only gives the same row while the frame has a default 0..n-1 index.
best_result = result_grid.loc[result_grid['loss'].idxmin()].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Report the best loss, then every entry of the best row whose column name
# contains 'config' (the sampled hyperparameters).
print(f"Best trail by loss value {best_result['loss']}", "\n------")
config_keys = [key for key in best_result if 'config' in key]
for key in config_keys:
    print(f"Best trail: {key} value {best_result[key]}")