# Model 4: Three Stage Model

In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [3]:
# Import local modules from 'src/utils' as package 'utils'
import sys; sys.path.insert(0, '../')

## Read Yuxuan's .csv files

In [4]:
from pathlib import Path
from tqdm import tqdm
from typing import List

In [5]:
def read_all_data_dumps_in(data_folder: Path) -> List[pd.DataFrame]:
    """
    Read all .csv data dumps in data_folder and put them into one List.
    """
    dataframes = []
    for data_file in tqdm(data_folder.glob("*.csv"), "Reading .csv files"):
        dataframes.append(read_data_csv(data_file))
    return dataframes


def read_data_csv(filepath: Path, separator: str = ";") -> pd.DataFrame:
    dataframe = pd.read_csv(filepath, sep=separator)
    dataframe["Timestamp"] = pd.to_datetime(dataframe["Timestamp"], unit="ns")  
    convert_list_columns(dataframe)
    return dataframe


def convert_list_columns(dataframe: pd.DataFrame):
    """
    Convert string columns to np.ndarrays.
    """
    convertible_columns = [column for column in dataframe.columns if column != "Timestamp"]
    for column in convertible_columns:
        # do not convert columns that do not contain a list
        if dataframe[column][0][0] == "[":
            dataframe[column] = dataframe[column].apply(convert_list)


def convert_list(text: str) -> np.ndarray:
    """
    Converts strings like "[el1, el2, el3]", with el1, el2, el3 being floats into an np.ndarray. 
    """
    return np.fromstring(text[1:-1], sep=",", dtype=np.float32)

In [6]:
data_path = Path("../../data/")
dataset = read_all_data_dumps_in(data_path)

Reading .csv files: 2it [00:05,  2.91s/it]


In [7]:
print(dataset[0].columns)
feature_columns = [
    'left_boom_base_yaw_joint', 'left_boom_base_pitch_joint', 'left_boom_main_prismatic_joint', 'left_boom_second_roll_joint',
    'left_boom_second_yaw_joint', 'left_boom_top_pitch_joint'
]

label_columns = ['cable1_lowest_point', 'cable2_lowest_point', 'cable3_lowest_point']

Index(['Timestamp', 'left_boom_base_yaw_joint', 'left_boom_base_pitch_joint',
       'left_boom_main_prismatic_joint', 'left_boom_second_roll_joint',
       'left_boom_second_yaw_joint', 'left_boom_top_pitch_joint',
       'left_boom_ee_joint', 'cable1_lowest_point', 'cable2_lowest_point',
       'cable3_lowest_point',
       'cable1_property(length,youngsmodule(bend,twist))',
       'cable2_property(length,youngsmodule(bend,twist))',
       'cable3_property(length,youngsmodule(bend,twist))',
       'left_boom_yaw_link(x,y,z,w,qx,qy,qz)',
       'left_boom_main_link(x,y,z,w,qx,qy,qz)',
       'left_boom_second_link(x,y,z,w,qx,qy,qz)',
       'left_boom_pitch_link(x,y,z,w,qx,qy,qz)',
       'left_boom_top_link(x,y,z,w,qx,qy,qz)',
       'left_boom_top_second_link(x,y,z,w,qx,qy,qz)',
       'left_boom_tip(x,y,z,w,qx,qy,qz)'],
      dtype='object')


## Preprocessing dataframes

In [8]:
from typing import List

In [9]:
def reshape_dataframe_for_learning(dataframe: pd.DataFrame, feature_columns: List[str] = None, label_columns: List[str] = None, label_dims: List[int] = None, standardize_features: bool = False, normalize_features: bool = False) -> pd.DataFrame:
    """
    Flattens the arm's angle columns into one feature column and creates a view on the original dataframe 
    which only contains the flattened angle vector and lowest point. 
    """
    # FIXME: Scale youngsmodule bend and twist
    print("Preprocessing dataframe.")
    feature_columns = [col for col in dataframe.columns][1:-1] if feature_columns is None else feature_columns
    dataframe["features"] = dataframe[feature_columns].apply(lambda row: np.concatenate(row.values), axis=1)
    if label_columns is None: 
        label_columns = dataframe.columns[-1]
        data_columns = ["features", label_columns]
    else:
        dataframe["labels"] = dataframe[label_columns].apply(lambda row: np.concatenate(row.values), axis=1) if label_dims is None else dataframe[label_columns].apply(lambda row: np.concatenate(row.values[label_dims]), axis=1)
        data_columns = ["features", "labels"]
    if normalize_features:
        dataframe["features"] = normalize(np.stack(dataframe["features"].to_numpy())).tolist()
    if standardize_features:
        dataframe["features"] = standardize(np.stack(dataframe["features"].to_numpy())).tolist()
    return dataframe[data_columns]


def standardize(features: np.ndarray) -> np.ndarray:
    return (features - features.mean(axis=0)) / features.std(axis=0)


def normalize(features: np.ndarray) -> np.ndarray:
    x_min = features.min(axis=0)
    x_max = features.max(axis=0)
    return (features - x_min) / (x_max - x_min)

In [10]:
def preprocess_dataframes_for_parallel_training(dataframes: List[pd.DataFrame], feature_columns: List[str] = None, label_columns: List[str] = None, label_dims: List[int] = None, standardize_features: bool = False, normalize_features: bool = False):
   preprocess = lambda dataframe: reshape_dataframe_for_learning(dataframe, feature_columns=feature_columns, label_columns=label_columns, label_dims=label_dims, standardize_features=standardize_features, normalize_features=normalize_features)
   dataframes = [preprocess(dataframe) for dataframe in dataframes]
   dataframes = cut_dataframes_to_same_length(dataframes)
   return dataframes

def cut_dataframes_to_same_length(dataframes: pd.DataFrame):
    dataframe_lengths = [len(dataframe.index) for dataframe in dataframes]
    min_len = min(dataframe_lengths)
    return [dataframe.head(min_len) for dataframe in dataframes]

In [11]:
dataset = preprocess_dataframes_for_parallel_training(dataset, feature_columns=feature_columns, label_columns=label_columns)

Preprocessing dataframe.
Preprocessing dataframe.


In [12]:
print([len(dataframe) for dataframe in dataset])
print([dataframe.head() for dataframe in dataset])
print(dataset[0]["features"][0].shape, dataset[0]["labels"][0].shape)

[81215, 81215]
[                                            features  \
0  [0.0224931, -0.0883802, 0.523592, -1.03304, 0....   
1  [0.0224938, -0.0883812, 0.5236, -1.03305, 0.07...   
2  [0.0224945, -0.0883802, 0.523608, -1.03305, 0....   
3  [0.0224952, -0.0883792, 0.523616, -1.03306, 0....   
4  [0.0224973, -0.0883783, 0.523639, -1.03308, 0....   

                                              labels  
0  [-1.21362, 5.32123, -0.190972, -0.804286, 5.69...  
1  [-1.21362, 5.32124, -0.190982, -0.80429, 5.691...  
2  [-1.21362, 5.32125, -0.190985, -0.804292, 5.69...  
3  [-1.21363, 5.32126, -0.19099, -0.804294, 5.691...  
4  [-1.21363, 5.32127, -0.190995, -0.804298, 5.69...  ,                                             features  \
0  [0.0104718, -1.52122e-07, 0.00400016, 0.020017...   
1  [0.010462, 3.99373e-05, 0.00401454, 0.020676, ...   
2  [0.0104325, 0.000158116, 0.00406007, 0.0227489...   
3  [0.0104325, 0.000158116, 0.00406007, 0.0227489...   
4  [0.0104069, 0.000232887, 0.004087

## Create Parallel Trajectory Dataset

In [13]:
from utils.trajectory_dataset import *
from torch.utils.data import Dataset

In [14]:
class ParallelTrajectoryDataset(Dataset):
    def __init__(self, datasets: List[TrajectoryDataset]) -> None:
        super().__init__()
        self.datasets = datasets

    def __len__(self) -> int:
        return len(self.datasets[0])
    
    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor]:
        trajectory_features, trajectory_true_lowpoints = [], []
        for dataset in self.datasets:
            features, true_lowpoints = dataset[index]
            trajectory_features.append(features)
            trajectory_true_lowpoints.append(true_lowpoints)
        return torch.stack(trajectory_features), torch.stack(trajectory_true_lowpoints)


class ParallelSlidingWindowTrajectoryDataset(Dataset):
    def __init__(self, datasets: List[SlidingWindowTrajectoryDataset]) -> None:
        super().__init__()
        self.datasets = datasets

    def __len__(self) -> int:
        return len(self.datasets[0])
    
    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        trajectory_features, trajectory_true_lowpoints, trajectory_last_indices = [], [], []
        for dataset in self.datasets:
            features, true_lowpoints, last_index = dataset[index]
            trajectory_features.append(features)
            trajectory_true_lowpoints.append(true_lowpoints)
            trajectory_last_indices.append(last_index)
        return torch.stack(trajectory_features), torch.stack(trajectory_true_lowpoints), torch.tensor(trajectory_last_indices, dtype=torch.int64)


In [15]:
dataset = [TrajectoryDataset(df, 256) for df in dataset]
dataset = ParallelTrajectoryDataset(dataset)

In [16]:
feature_tensor, lowpoint_tensor = dataset[0]
# The output shape is #Parallel trajectories, Sequence length, feature/label dim
print(feature_tensor.shape, lowpoint_tensor.shape)

torch.Size([2, 256, 6]) torch.Size([2, 256, 9])


### Load ParallelTrajectoryDatasets

In [17]:
from torch.utils.data import random_split, Subset

In [18]:
def read_parallel_trajectory_datasets(data_folder: Path, train_split: float, test_split: float, validation_split: float, 
                             visualization_split: float = 0.0, window_size: int = 128, feature_columns: List[str] = None, label_columns: List[str] = None, label_dims: List[int] = None,
                             standardize_features: bool = False, normalize_features = False) -> Tuple[Subset, SlidingWindowTrajectoryDataset, Subset, SlidingWindowTrajectoryDataset]:
    """

    :param data_folder: The path to the folder containing the data.
    :param train_split: A float between 0 and 1 describing the relative size of the training dataset compared to the whole dataset.
    :param test_split: A float between 0 and 1 describing the relative size of the test dataset compared to the whole dataset.
    :param validation_split: A float between 0 and 1 describing the relative size of the validation dataset compared to the whole dataset.
    :param visualization_split: A float between 0 and 1 describing the relative size of the visualization dataset compared to the whole dataset.
    :param window_size: The length of the trajectories in the dataset.
    :param feature_columns: The columns of the .csv files used as features. By default all columns between the first and last column are used.
    :param label_columns: The columns of the .csv files used as labels. By default the last column is used as the label column.
    :param label_dims: A list of the entries in the label column used to create label vectors.
    :param standardize_features: Set this to true to standardize the features (subtract the standard deviation and divide by the variance), incompatible with normalize_features
    :param normalize_features: Set this to true to normalize the features between [-1, 1].

    :return: The train dataset, test dataset, validation dataset and visualization_dataset
    """
    sum_of_splits = train_split + test_split + validation_split + visualization_split
    if not sum_of_splits <= 1:
        raise ValueError(f"The sum of all splits should be smaller than 1.0, given {sum_of_splits}!")
    
    dataframes = read_all_data_dumps_in(data_folder)
    preprocessed = preprocess_dataframes_for_parallel_training(dataframes, feature_columns=feature_columns, label_columns=label_columns, label_dims=label_dims, standardize_features=standardize_features, normalize_features=normalize_features)
    complete_datasets = [TrajectoryDataset(dataframe, window_size) for dataframe in preprocessed]
    dataset_length = len(complete_datasets[0])

    train_length, test_length, validation_length, shuffled_split_len = compute_split_lengths(dataset_length, train_split, test_split, validation_split)

    shuffled_split = [Subset(dataset, list(range(0, shuffled_split_len))) for dataset in complete_datasets]
    contigous_split = [SlidingWindowTrajectoryDataset(Subset(dataset, list(range(shuffled_split_len, dataset_length))), window_size, contigous=True) for dataset in complete_datasets]
    shuffled_split = ParallelTrajectoryDataset(shuffled_split)
    contigous_split = ParallelTrajectoryDataset(contigous_split)

    train_set, test_set, validation_set = random_split(shuffled_split, [train_length, test_length, validation_length])
    test_set = ParallelSlidingWindowTrajectoryDataset([SlidingWindowTrajectoryDataset(ts, window_size) for ts in test_set.dataset.datasets])
    return train_set, test_set, validation_set, contigous_split


def compute_split_lengths(dataset_length: int, train_split: float, test_split: float, validation_split: float) -> Tuple[int, int, int, int]:
    train_length = int(dataset_length * train_split)
    test_length = int(dataset_length * test_split)
    validation_length = int(dataset_length * validation_split)
    shuffled_split_len = train_length + test_length + validation_length
    return train_length, test_length, validation_length, shuffled_split_len

In [19]:
train_set, test_set, validation_set, visualization_set = read_parallel_trajectory_datasets(data_path, 0.85, 0.10, 0.045, 0.005, 256, feature_columns, label_columns)

Reading .csv files: 2it [00:05,  2.89s/it]


Preprocessing dataframe.
Preprocessing dataframe.


In [20]:
features, labels, last_indices = test_set[257] 
print(features.shape, labels.shape, last_indices)
input_shape, output_shape = features.shape[-1], labels.shape[-1]

torch.Size([2, 256, 6]) torch.Size([2, 9]) tensor([1, 1])


## Define Transformer Encoder Model

In [21]:
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import math

### Transformer positional encoding

In [22]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1024) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x += self.pe[:x.size(0)]
        return self.dropout(x)

In [23]:
class TransformerEncoderModel(nn.Module):
    def __init__(self, num_heads: int, model_dim: int, feedforward_hidden_dim: int,
                 num_encoder_layers: int = 6, transformer_dropout: float = 0.1, pos_encoder_dropout: float = 0.25) -> None:
        super().__init__()
        self.model_type = 'Transformer'
        self.total_epochs = 0
        encoder_layers = TransformerEncoderLayer(model_dim, num_heads, feedforward_hidden_dim, transformer_dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_encoder_layers)   
        self.pos_encoder = PositionalEncoding(model_dim, pos_encoder_dropout)

    def forward(self, source: Tensor, source_msk: Tensor = None) -> Tensor:
        # expect input shape to be (S, N, E) with S being the sequence length, N batch size and, E the input dimensionality
        # target_mask masks out all values right of the diagonal such that information from the target sequence cant bleed into the left hand side at training time
        source = self.pos_encoder(source)
        return self.transformer_encoder(source, source_msk)
    

class ParallelEncoderModel(nn.Module):
    def __init__(self, num_decoders: int, num_heads: int, model_dim: int, feedforward_hidden_dim: int, output_dim: int,
                 num_encoder_layers: int = 6, transformer_dropout: float = 0.1, pos_encoder_dropout: float = 0.25) -> None:
        super().__init__()
        self.model_type = 'Transformer'
        self.total_epochs = 0
        encoder_layers = TransformerEncoderLayer(model_dim, num_heads, feedforward_hidden_dim, transformer_dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_encoder_layers)   
        self.pos_encoder = PositionalEncoding(model_dim, pos_encoder_dropout)
        self.decoders = nn.ModuleList([nn.Linear(model_dim, output_dim) for i in range(num_decoders)])
        self.activation = nn.ReLU()

    def forward(self, source: Tensor, source_mask: Tensor = None) -> Tensor:
        decoded = []
        for i, decoder in enumerate(self.decoders):
            trajectory_source = source[i, :, :, :]
            trajectory_source = self.pos_encoder(trajectory_source)
            trajectory_source = self.transformer_encoder(trajectory_source, source_mask)
            decoded_trajectory = decoder(trajectory_source)
            decoded_trajectory = self.activation(decoded_trajectory)
            decoded.append(decoded_trajectory)
        return torch.stack(decoded, dim=0)


In [24]:
encoder_model = ParallelEncoderModel(2, 3, input_shape, 64, output_shape).to(device) 



## Model Training Step 1

### Load parameters, functions, and Dataloader

In [25]:
import os

from typing import Any

from torch.utils.data import DataLoader
from dotenv import load_dotenv

from utils.file_io import save_model
from utils.file_io import define_dataloader_from_subset
from utils.evaluation import compute_loss_on
from utils.optimizer import rate

In [26]:
model_path = Path("../../models/encoder/").absolute()

In [27]:
dotenv_path = model_path / ".env"
load_dotenv(dotenv_path=dotenv_path)

batch_size = int(os.getenv("BATCH_SIZE"))
num_epochs = int(os.getenv("NUM_EPOCHS"))

In [28]:
def get_optimizer_function_and_learning_rate_scheduler(model: nn.Module, model_size: int, warmup_steps: int, factor: float = 1) -> Tuple[Any, Any]:
    optimizer = torch.optim.AdamW(model.parameters(), lr=1)
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda step: rate(step, model_size, factor, warmup_steps))
    return optimizer, lr_scheduler


def get_loss_function() -> nn.Module:
    return torch.nn.MSELoss()

In [29]:
train_dataloader, validation_dataloader, test_dataloader = define_dataloader_from_subset(train_set, validation_set, test_set, batch_size=batch_size, shuffle=True)

### Define train methods

In [30]:
from ray import train as ray_train
from ray.train import Checkpoint

In [31]:
def train_epoch(train_dataloader: DataLoader, model, loss_function, optimizer, lr_scheduler,
                device: torch.device, report_interval: int = 100):
    
    running_loss = 0
    last_loss = 0
    
    for i, (inputs, true_values) in enumerate(train_dataloader):
        
        inputs = inputs.to(device)
        true_values = true_values.to(device)
    
        inputs_shape, true_values_shape = inputs.size(), true_values.size()
        inputs = inputs.view(inputs_shape[1], inputs_shape[2], inputs_shape[0], inputs_shape[3])
        true_values = true_values.view(true_values_shape[1], true_values_shape[2], true_values_shape[0], true_values_shape[3])
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, true_values)
        running_loss += loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        optimizer.zero_grad()
        outputs = model(inputs)
    
        if i % report_interval == report_interval - 1:
            last_loss = running_loss / report_interval
            print(f"batch {i + 1}, Mean Squared Error: {last_loss}")
            running_loss = 0

    return last_loss

In [32]:
def train(epochs: int, train_dataloader: DataLoader, validation_dataloader: DataLoader, model: nn.Module, 
          loss_function, optimizer, lr_scheduler, checkpoint_path: Path, device: torch.device = 'cpu', report_interval: int = 128, tune: bool = False) -> nn.Module:
    
    best_val_loss = float("inf")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    model.to(device)

    if tune:
        checkpoint = ray_train.get_checkpoint()

        if checkpoint:
            with checkpoint.as_directory() as checkpoint_dir:
                model_state = torch.load(os.path.join(checkpoint_dir, "checkpoint.pt"))
                model.load_state_dict(model_state)

    for epoch in range(model.total_epochs, epochs):
        print(f"Epoch: {epoch + 1}")

        model.train(True)
        avg_loss = train_epoch(train_dataloader, model, loss_function, optimizer, lr_scheduler, device, report_interval)
        model.eval()

        with torch.no_grad():
            avg_val_loss = compute_loss_on(validation_dataloader, model, loss_function, reshape=True, device=device)

        print(f"Loss on train: {avg_loss}, loss on validation: {avg_val_loss}")

        model.total_epochs += 1
    

        if avg_val_loss < best_val_loss or tune:
            best_val_loss = avg_val_loss            
            
            torch.save(model.state_dict(), checkpoint_path / "checkpoint.pt")

        if tune:
            ray_train.report(metrics={ "loss": float(avg_val_loss) }, checkpoint=Checkpoint.from_directory(checkpoint_path))

    return model

## Train the model with optuna hyperparameter tuning

In [42]:
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from typing import Dict

In [43]:
def parameter_train(parameter: Dict, train_epochs: int, train_data: DataLoader, validation_data: DataLoader, model_input_shape: int,
                    model_output_shape: int, checkpoint_path: Path, device: torch.device) -> None:
    features. _ = train_dataloader.dataset[0]
    parallel_trajectories = features.shape[0]
    run_id = ray_train.get_context().get_trial_id()
    run_checkpoint = checkpoint_path / run_id
    run_checkpoint.mkdir(parents=True)

    model = ParallelEncoderModel(parallel_trajectories, model_input_shape, parameter["model_dim"], parameter["feedforward_dim"], model_output_shape,
                                   parameter["encoder_layer"], parameter["transformer_dropout"], parameter["pos_encoder_dropout"]).to(device)

    optimizer, lr_scheduler = get_optimizer_function_and_learning_rate_scheduler(model, parameter["model_dim"], warmup_steps=parameter["warmup_steps"])
    loss_function = get_loss_function()

    _ = train(train_epochs, train_data, validation_data, model, loss_function, optimizer, lr_scheduler, run_checkpoint, device, report_interval=50, tune=True)

In [44]:
learning_rate_radius = 1e-3
batch_size_radius = 10
num_samples = 100

In [45]:
parameter_space = {
    "batch_size": tune.choice(list(range(batch_size - batch_size_radius, batch_size + batch_size_radius, 4))),
    "model_dim": tune.choice([6]),
    "warmup_steps": tune.choice(list(range(1000, 1200, 2000))),
    "feedforward_dim": tune.choice([32]),
    "encoder_layer": tune.choice([1, 2, 3]),
    "transformer_dropout": tune.uniform(0.1, 0.5),
    "pos_encoder_dropout": tune.uniform(0.1, 0.5)
}

In [46]:
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=num_epochs
)

In [47]:
search_alg = OptunaSearch(
    metric="loss",
    mode="min"
) 

In [48]:
import utils
if ray.is_initialized():
    ray.shutdown()

ray.init(runtime_env={ "py_modules": [utils] })

2023-11-28 00:11:48,320	INFO worker.py:1673 -- Started a local Ray instance.
2023-11-28 00:11:48,325	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/notebooks/../utils'.
2023-11-28 00:11:48,329	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_93581ef4ec11384f.zip' (0.05MiB) to Ray cluster...
2023-11-28 00:11:48,330	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_93581ef4ec11384f.zip'.


0,1
Python version:,3.10.12
Ray version:,2.8.0


In [49]:
ray_resources_manager = tune.with_resources(
    trainable=lambda param: parameter_train(param, num_epochs, train_dataloader, validation_dataloader, input_shape, output_shape, model_path, device),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={ "cpu": 3, "gpu": 0.25 }
)

tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        search_alg=search_alg,
         num_samples=num_samples
    )
)

In [50]:
results = tuner.fit()

0,1
Current time:,2023-11-28 00:12:14
Running for:,00:00:20.81
Memory:,18.0/62.7 GiB

Trial name,status,loc,batch_size,encoder_layer,feedforward_dim,model_dim,pos_encoder_dropout,transformer_dropout,warmup_steps,iter,total time (s),loss
lambda_f6782919,RUNNING,172.17.0.2:68980,54,1,32,6,0.107476,0.352193,1000,24.0,9.85788,8.17494
lambda_c8ea82da,RUNNING,172.17.0.2:69059,62,2,32,6,0.375609,0.312293,1000,13.0,5.97095,10.5447
lambda_6cb72e5a,RUNNING,172.17.0.2:69155,70,1,32,6,0.24754,0.184577,1000,7.0,3.16377,11.2305
lambda_b1932ebb,PENDING,,62,1,32,6,0.407584,0.429971,1000,,,
lambda_4b83322b,TERMINATED,172.17.0.2:69059,62,2,32,6,0.231493,0.407853,1000,1.0,0.811396,11.8532




[36m(<lambda> pid=68980)[0m Epoch: 1
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.763545989990234


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000000)


[36m(<lambda> pid=68980)[0m Epoch: 2
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000001)


[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.710311889648438
[36m(<lambda> pid=68980)[0m Epoch: 3
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda

[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000002)


[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.787110328674316
[36m(<lambda> pid=68980)[0m Epoch: 4
[36m(<lambda> pid=68980)[0m outputs isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0

[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000003)


[36m(<lambda> pid=68980)[0m Epoch: 5
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.704679489135742
[36m(<lambda> pid=68980)[0m Epoch: 6


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000004)


[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.612455368041992
[36m(<lambda> pid=68980)[0m Epoch: 7


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000005)


[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.632894515991211
[36m(<lambda> pid=68980)[0m Epoch: 8
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0

[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000006)


[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000007)


[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.628889083862305
[36m(<lambda> pid=68980)[0m Epoch: 9
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000008)


[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m outputs 2 isnan  tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.558516502380371




[36m(<lambda> pid=68980)[0m Epoch: 10[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(<lambda> pid=69059)[0m outputs 2 isnan  tensor(False, device='cuda:0')[32m [repeated 10x across cluster][0m
[36m(<lambda> pid=69059)[0m Loss on train: 0, loss on validation: 11.853206634521484




[36m(<lambda> pid=68980)[0m outputs isnan  
[36m(<lambda> pid=68980)[0m tensor(False, device='cuda:0')
[36m(<lambda> pid=68980)[0m Loss on train: 0, loss on validation: 11.521973609924316


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000009)[32m [repeated 2x across cluster][0m


[36m(<lambda> pid=69155)[0m Epoch: 1[32m [repeated 17x across cluster][0m
[36m(<lambda> pid=69059)[0m outputs 2 isnan  tensor(False, device='cuda:0')[32m [repeated 156x across cluster][0m




[36m(<lambda> pid=69059)[0m outputs 2 isnan  [32m [repeated 13x across cluster][0m
[36m(<lambda> pid=69059)[0m tensor(False, device='cuda:0')[32m [repeated 13x across cluster][0m
[36m(<lambda> pid=69155)[0m Loss on train: 0, loss on validation: 11.427377700805664[32m [repeated 17x across cluster][0m


[36m(<lambda> pid=68980)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/lambda_2023-11-28_00-11-50/lambda_f6782919_1_batch_size=54,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.1075,transformer_dropout=0.35_2023-11-28_00-11-53/checkpoint_000018)[32m [repeated 18x across cluster][0m
2023-11-28 00:12:15,022	INFO tune.py:1047 -- Total run time: 23.98 seconds (20.80 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/ray_results/lambda_2023-11-28_00-11-50", trainable=...)
- lambda_b1932ebb: FileNotFoundError('Could not fetch metrics for lambda_b1932ebb: both result.json and progress.csv were not found at /root/ray_results/lambda_2023-11-28_00-11-50/lambda_b1932ebb_5_batch_size=62,encoder_layer=1,feedforward_dim=32,model_dim=6,pos_encoder_dropout=0.4076,transformer_dropout=0.43_2023-11-28_00-12-10')


In [None]:
if ray.is_initialized():
    ray.shutdown()

In [None]:
# Save as csv file
results.get_dataframe().to_csv(model_path / "trail_grid.csv")

In [None]:
best_result = results.get_best_result("loss", "min")
best_checkpoint = best_result.get_best_checkpoint("loss", "min")

best_model = torch.load(f"{best_checkpoint.path}/checkpoint.pt")

In [None]:
print(f"Best trail by loss value {best_result.metrics['loss']}", "\n------")
for i in best_result.config:
    print(f"Best trail: {i} value {best_result.config[i]}")

## Step 2 Fit Cable Properties