# Model 2: LSTM sequence model
## Read data

In [None]:
from pathlib import Path
import torch 
import numpy as np

In [None]:
# Put '/mnt/src' first on the module search path.
# NOTE(review): presumably this is where the `models` and `utils` packages imported below live - confirm.
import sys; sys.path.insert(0, '/mnt/src')

In [None]:
from models.lstm import DecoderLSTM, train

## Create Trajectory dataset from dataframe

In [None]:
from utils.file_io import read_trajectory_datasets

In [None]:
# Names of the six left-boom joint columns; their count is used as the model input size.
feature_columns = [
    'left_boom_base_yaw_joint',
    'left_boom_base_pitch_joint',
    'left_boom_main_prismatic_joint',
    'left_boom_second_roll_joint',
    'left_boom_second_yaw_joint',
    'left_boom_top_pitch_joint',
]

# One (column name, int64 array [1]) pair per cable; their count is used as the model output size.
# NOTE(review): the array [1] presumably selects a component of the lowest point - confirm.
label_features = [
    (label_name, np.array([1], dtype=np.int64))
    for label_name in (
        'cable1_lowest_point',
        'cable2_lowest_point',
        'cable3_lowest_point',
    )
]

In [None]:
# Folder the trajectory datasets are read from.
data_folder = Path("/mnt/data").absolute()

# NOTE(review): the four fractions sum to 1.0 and are presumably the split ratios of the four
# returned subsets; 64 is passed positionally and its meaning is not visible here - confirm both
# (including the train/test/validation order) against read_trajectory_datasets.
train_set, test_set, validation_set, _ = read_trajectory_datasets(
    data_folder,
    0.8,
    0.15,
    0.045,
    0.005,
    64,
    standardize_features=True,
)

In [None]:
# Model input size = number of feature columns, output size = number of label entries.
input_shape = len(feature_columns)
output_shape = len(label_features)

# Report the sizes together with the combined length of the three subsets used below.
print(f"Data shape {input_shape} / {output_shape} of total {len(train_set) + len(test_set) + len(validation_set)} data rows!")

## Load parameters, functions and dataloaders

In [None]:
from utils.file_io import define_dataloader_from_subset

In [None]:
# Base folder for the tuning run: every trial creates its checkpoint sub-folder here
# and the result table is written to it as a CSV file after tuning.
tune_path = Path("/mnt/local/lstm/tune").absolute()

## Train the model with Optuna hyperparameter tuning

In [None]:
from ray import tune, train as ray_train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from torch import nn
from typing import Dict
from utils.cluster import attach_ray, disconnect_ray
from utils.angle_dataset import AngleDataset
from utils.activation import get_activation
from utils.optimizer import get_optimizer_function
from utils.loss_functions import get_loss_function
# Dataset lives in `torch.utils.data`; there is no `torch.util` package.
from torch.utils.data import Dataset

In [None]:
def parameter_train(parameter: Dict, train_epochs: int, train_set: Dataset, validation_set: Dataset, test_set: Dataset, model_input_shape: int,
                    model_output_shape: int, checkpoint_path: Path) -> None:
    """Train one DecoderLSTM with the hyperparameters of a single Ray Tune trial.

    Args:
        parameter: Sampled hyperparameters. The keys ``batch_size``, ``h_dims``,
            ``lstm_dropout``, ``final_dropout``, ``n_layers``, ``proj_size``,
            ``optimizer`` and ``lr`` are read.
        train_epochs: Number of epochs forwarded to ``train``.
        train_set: Dataset the training dataloader is built from.
        validation_set: Dataset the validation dataloader is built from.
        test_set: Only handed to ``define_dataloader_from_subset``; the resulting
            test dataloader is discarded here.
        model_input_shape: Input size passed to ``DecoderLSTM``.
        model_output_shape: Output size passed to ``DecoderLSTM``.
        checkpoint_path: Base folder; a sub-folder named after the trial id is
            created below it and handed to ``train``.
    """
    # Determine the device on the worker that actually runs the trial
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if device != "cuda":
        print("No cuda device found!")

    run_id = ray_train.get_context().get_trial_id()
    run_checkpoint = checkpoint_path / run_id
    # exist_ok=True: if this function runs again for the same trial id (e.g. the
    # trial is restarted), the folder already exists and must not abort the trial.
    run_checkpoint.mkdir(parents=True, exist_ok=True)

    train_dataloader, validation_dataloader, _ = define_dataloader_from_subset(
        train_set, validation_set, test_set, batch_size=parameter["batch_size"], shuffle=True
    )

    model = DecoderLSTM(
        model_input_shape,
        parameter["h_dims"],
        model_output_shape,
        parameter["lstm_dropout"],
        parameter["final_dropout"],
        parameter["n_layers"],
        proj_size=parameter["proj_size"],
    )

    # The model has to be on the training device before the optimizer is instantiated
    model.to(device)

    optimizer = get_optimizer_function(parameter["optimizer"], model, parameter["lr"])
    loss_function = get_loss_function()

    # The return value of train() is not needed for tuning and is discarded.
    train(train_epochs, train_dataloader, validation_dataloader, model, loss_function, optimizer, run_checkpoint, device, report_interval=50, tune=True)

Define parameter ranges

In [None]:
# Search budget of the tuning run.
num_samples = 10_000  # passed to tune.TuneConfig(num_samples=...)
num_epochs = 1_000    # passed to parameter_train as train_epochs and to ASHAScheduler as max_t
grace_period = 10     # passed to ASHAScheduler(grace_period=...)

In [None]:
# Search space handed to the tuner; every key is read by parameter_train.
# NOTE(review): torch.nn.LSTM requires proj_size < hidden_size. If DecoderLSTM forwards
# h_dims and proj_size unchanged, combinations such as h_dims <= output_shape with
# proj_size == output_shape would fail - confirm against DecoderLSTM.
parameter_space = {
    # Optimizer name and learning rate, resolved by get_optimizer_function
    "optimizer": tune.choice(["adam", "sgd", "adamw", "adagrad"]),
    "lr": tune.loguniform(1e-6, 1e-2, base=10),
    # Batch sizes 64, 80, ..., 240
    "batch_size": tune.choice(list(range(64, 256, 16))),
    # Dropout values passed to DecoderLSTM
    "lstm_dropout": tune.loguniform(0.05, 0.6, base=2),
    "final_dropout": tune.loguniform(0.05, 0.6, base=2),
    # Either output_shape or 0 is passed as proj_size
    "proj_size": tune.choice([output_shape, 0]),
    "h_dims": tune.choice([1, 2, 3, 5]),
    "n_layers": tune.choice([1, 2, 3, 5]),
}

In [None]:
# ASHA scheduler minimising the reported "loss"; bounded by num_epochs and grace_period.
scheduler = ASHAScheduler(metric="loss", mode="min", max_t=num_epochs, grace_period=grace_period)

In [None]:
# Optuna-backed search algorithm, minimising the same "loss" metric as the scheduler.
search_alg = OptunaSearch(metric="loss", mode="min")

In [None]:
# Connect to Ray before the tuner is built.
# NOTE(review): manager=True presumably marks this process as the managing node - confirm in utils.cluster.
attach_ray(manager=True)

In [None]:
# Wrap the training function with its per-trial resource request. The lambda binds
# everything except the sampled hyperparameters, which Ray Tune supplies as `param`.
ray_resources_manager = tune.with_resources(
    trainable=lambda param: parameter_train(
        param, num_epochs, train_set, validation_set, test_set, input_shape, output_shape, tune_path
    ),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={"cpu": 3, "gpu": 0.25},
)

# Combine trainable, search space, scheduler and search algorithm into the experiment "lstm".
tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune.TuneConfig(scheduler=scheduler, search_alg=search_alg, num_samples=num_samples),
    run_config=ray_train.RunConfig(name="lstm"),
)

In [None]:
# Run the hyperparameter search; `results` is queried for the result dataframe below.
results = tuner.fit()

In [None]:
# Tuning is finished: release the Ray connection opened with attach_ray().
disconnect_ray()

In [None]:
# Save as csv file
result_grid = results.get_dataframe()
result_grid.to_csv(tune_path / "trail_grid_.csv")

In [None]:
# Show the five trials with the lowest loss.
result_grid.sort_values('loss').head(5)

In [None]:
# idxmin() returns an index *label*, so the row has to be selected with .loc;
# .iloc only gives the same row while the index happens to be 0..n-1.
best_result = result_grid.loc[result_grid['loss'].idxmin()].to_dict()
trail_id = best_result['trial_id']

print(f"Trail ID from the best run: {trail_id}")

In [None]:
# Print the best loss, then every column of the best row whose name contains 'config'.
print(f"Best trail by loss value {best_result['loss']}", "\n------")
for key, value in best_result.items():
    if 'config' not in key:
        continue
    print(f"Best trail: {key} value {value}")

If the training was carried out on a cluster, do not forget to also save the trial checkpoints that are stored on the other machines.