# Model 1: Fully Connected Neural Network

In [1]:
from pathlib import Path
import numpy as np
import torch 

In [2]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [3]:
# Import local modules from 'src/utils' as package 'utils'
import sys; sys.path.insert(0, '../')

In [4]:
import utils

## Read Datasets from .csv

In [5]:
from utils.file_io import read_angle_datasets

In [6]:
data_folder = Path("../../data/")
train_data, test_data = read_angle_datasets(data_folder, 0.9)

Reading .csv files: 1it [00:01,  1.40s/it]


In [7]:
input_shape, output_shape = train_data[0][0].shape[0], train_data[0][1].shape[0]
print(f"Data shape {input_shape} / {output_shape} of total {len(train_data)} data rows!")

Data shape 8 / 3 of total 62379 data rows!


  return row[0], row[1]


## Model definition

In [8]:
from torch import nn, Tensor
from typing import Tuple, List

In [9]:
class FullyConnected(nn.Module):
    def __init__(self, flattened_input_dim: int, intermediate_dims: List[int], output_dim: int, dropout: float = 0.25, hidden_activation = nn.ReLU) -> None:
        super().__init__()
        self.total_epochs = 0
        self.flatten = nn.Flatten()
        self.hidden = nn.Sequential()
        for i, dim in enumerate(intermediate_dims):
            if i == 0:
                self.hidden.add_module(f"linear_{i+1}", nn.Linear(flattened_input_dim, dim))
            else:
                self.hidden.add_module(f"linear_{i+1}", nn.Linear(intermediate_dims[i-1], dim))

            self.hidden.add_module(f"hidden_activation_{i+1}", hidden_activation())
            self.hidden.add_module(f"dropout_{i+2}", nn.Dropout(dropout))

        self.last = nn.Linear(intermediate_dims[-1], output_dim)

    def forward(self, x: Tensor) -> Tensor:
        x = self.flatten(x)
        x = self.hidden(x)
        return self.last(x)

## Load parameter, functions and dataloader

In [10]:
import os
from dotenv import load_dotenv
from torch.utils.data import DataLoader

from utils.file_io import save_model
from utils.evaluation import compute_loss_on
from utils.file_io import define_dataloader_from_angle_dataset

In [11]:
model_path = Path("../../models/fully_connected/").absolute()

In [12]:
dotenv_path = model_path / ".env"
load_dotenv(dotenv_path=dotenv_path)

learning_rate = float(os.getenv("LEARNING_RATE"))
batch_size = int(os.getenv("BATCH_SIZE"))
num_epochs = int(os.getenv("NUM_EPOCHS"))

In [13]:
def get_optimizer_function(model: nn.Module, learning_rate: float) -> torch.optim:
    return torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
def get_loss_function() -> nn.Module:
    return torch.nn.MSELoss()

In [15]:
train_dataloader, validation_dataloader, test_dataloader = define_dataloader_from_angle_dataset(train_data, test_data, batch_size, split_size=0.95)

## Define train methods

In [16]:
def train_epoch(train_dataloader: DataLoader, model: nn.Module, loss_function, optimizer, 
                device: torch.device, report_interval: int = 1000) -> float:

    running_loss = 0
    last_loss = 0
    
    for i, (inputs, true_values) in enumerate(train_dataloader):

        inputs = inputs.to(device)
        true_values = true_values.to(device)
                
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, true_values)
        running_loss += loss
        loss.backward()
        optimizer.step() 
    
    if i % report_interval == report_interval - 1:
        last_loss = running_loss / report_interval
        print(f"batch {i + 1}, Mean Squared Error: {last_loss}")
        running_loss = 0
    
    return last_loss 

In [17]:
def train(epochs: int, train_dataloader: DataLoader, validation_dataloader: DataLoader, model: nn.Module, loss_function, optimizer, 
          checkpoint_path: Path, device: torch.device = 'cpu', report_interval: int = 1000, tune: bool = False) -> nn.Module:

    best_val_loss = float("inf")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    model.to(device)

    best_model = model

    for epoch in range(model.total_epochs, epochs):
        print(f"Epoch: {epoch + 1}")

        model.train(True)
        avg_loss = train_epoch(train_dataloader, model, loss_function, optimizer, device, report_interval)
        model.eval()

        with torch.no_grad():
            avg_val_loss = compute_loss_on(validation_dataloader, model, loss_function, device=device)

        print(f"Loss on train: {avg_loss}, loss on validation: {avg_val_loss}")

        model.total_epochs += 1
    
        # Disable checkpoints while running in tune mode
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss

            if not tune:
                model_path = checkpoint_path / f"{checkpoint_path.name}_{epoch}.pt"
                save_model(model, model_path)
            
            best_model = model

    return best_model   

## Train the model with grid search hyperparameter tuning

In [18]:
import ray
import json
from ray import tune
from ray import train as ray_train
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from typing import Dict
from utils.evaluation import compute_predictions, compute_losses_from
from functools import partial

In [19]:
def run_id_generator(data: Dict) -> str:
    return str(hash(json.dumps(data)))

In [20]:
def parameter_train(parameter: Dict, train_data: DataLoader, test_data: DataLoader, model_input_shape: int, model_output_shape: int, checkpoint_path: Path, device: torch.device) -> None:

    run_id = run_id_generator(parameter)
    run_checkpoint = checkpoint_path / run_id
    run_checkpoint.mkdir(parents=True)

    train_dataloader, validation_dataloader, test_dataloader = define_dataloader_from_angle_dataset(train_data, test_data, batch_size=parameter["batch_size"])

    model = FullyConnected(model_input_shape, parameter["hidden_layers"], model_output_shape)

    optimizer = get_optimizer_function(model, parameter["lr"])
    loss_function = get_loss_function()

    last_model = train(parameter["epochs"], train_dataloader, validation_dataloader, model, loss_function, optimizer, checkpoint_path, device, report_interval=50, tune=True)

    y, y_true = compute_predictions(test_dataloader, last_model)
    test_losses = compute_losses_from(y, y_true, loss_function)

    torch.save((last_model.state_dict(), optimizer.state_dict()), run_checkpoint / "checkpoint.pt")

    print(str(parameter), file=open(run_checkpoint / "trail_parameter.txt", "w"))

    ray_train.report(
        metrics={ "loss": float(test_losses.mean()), "parameter": parameter , "run_id": run_id }
    )

Define parameter ranges

In [21]:
learning_rate_radius = 1e-3
num_epochs_radius = 50
batch_size_radius = 10
hidden_layer_sizes = [16, 32, 64]
hidden_layer_count = [2, 4, 6]

In [22]:
parameter = {
    "lr": tune.loguniform(learning_rate_radius - learning_rate, learning_rate + learning_rate_radius),
    "epochs": tune.choice(list(range(num_epochs - num_epochs_radius, num_epochs + num_epochs_radius, 20))),
    "batch_size": tune.grid_search(list(range(batch_size - batch_size_radius, batch_size + batch_size_radius, 4))),
    "hidden_layers": tune.grid_search(list([hidden_layer_sizes[i % len(hidden_layer_sizes)] for i in range(dim)] for _, dim in enumerate(hidden_layer_count))),
    "dropout": tune.uniform(0.1, 0.5)
}

In [23]:
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=num_epochs,
    grace_period=1,
    reduction_factor=2,
)

In [24]:
if ray.is_initialized():
    ray.shutdown()

ray.init(runtime_env={ "py_modules": [utils] })

2023-11-19 17:58:06,232	INFO worker.py:1673 -- Started a local Ray instance.
2023-11-19 17:58:06,237	INFO packaging.py:530 -- Creating a file package for local directory '/mnt/src/notebooks/../utils'.
2023-11-19 17:58:06,240	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_d8c8a9358aefbc3b.zip' (0.03MiB) to Ray cluster...
2023-11-19 17:58:06,241	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_d8c8a9358aefbc3b.zip'.


0,1
Python version:,3.10.12
Ray version:,2.8.0


In [25]:
ray_resources_manager = tune.with_resources(
    trainable=lambda param: parameter_train(param, train_data, test_data, input_shape, output_shape, model_path, device),
    # See: https://stackoverflow.com/questions/58967793/what-is-the-way-to-make-tune-run-parallel-trials-across-multiple-gpus
    resources={ "cpu": 3, "gpu": 0.25 }
)

tuner = tune.Tuner(
    ray_resources_manager,
    param_space=parameter,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        num_samples=20
    )
)

In [26]:
results = tuner.fit()

0,1
Current time:,2023-11-19 18:04:47
Running for:,00:06:37.98
Memory:,12.3/62.8 GiB

Trial name,status,loc,batch_size,dropout,epochs,hidden_layers,lr
lambda_32f5e_00000,RUNNING,172.17.0.2:26783,54,0.105522,990,"[16, 32]",0.00103166
lambda_32f5e_00001,RUNNING,172.17.0.2:26784,58,0.21181,990,"[16, 32]",0.00108566
lambda_32f5e_00002,RUNNING,172.17.0.2:26785,62,0.269716,1030,"[16, 32]",0.00093458
lambda_32f5e_00003,RUNNING,172.17.0.2:26786,66,0.207716,1010,"[16, 32]",0.000947284
lambda_32f5e_00004,PENDING,,70,0.400545,950,"[16, 32]",0.00102713
lambda_32f5e_00005,PENDING,,54,0.205669,990,"[16, 32, 64, 16]",0.00108168
lambda_32f5e_00006,PENDING,,58,0.229881,1030,"[16, 32, 64, 16]",0.00102237
lambda_32f5e_00007,PENDING,,62,0.354973,1030,"[16, 32, 64, 16]",0.00108978
lambda_32f5e_00008,PENDING,,66,0.391208,950,"[16, 32, 64, 16]",0.00094122
lambda_32f5e_00009,PENDING,,70,0.415408,990,"[16, 32, 64, 16]",0.00103844




[36m(<lambda> pid=26786)[0m Epoch: 1


[36m(<lambda> pid=26786)[0m   return row[0], row[1]


[36m(<lambda> pid=26786)[0m Loss on train: 0, loss on validation: 0.3048282265663147
[36m(<lambda> pid=26786)[0m Epoch: 2[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(<lambda> pid=26786)[0m Loss on train: 0, loss on validation: 0.16283735632896423[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Epoch: 3[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Loss on train: 0, loss on validation: 0.11116863787174225[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Epoch: 4[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Loss on train: 0, loss on validation: 0.07930207252502441[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Epoch: 5[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26786)[0m Lo



[36m(<lambda> pid=26785)[0m Loss on train: 0, loss on validation: 0.023599527776241302[32m [repeated 4x across cluster][0m
[36m(<lambda> pid=26785)[0m Epoch: 57[32m [repeated 4x across cluster][0m


2023-11-19 18:04:58,003	INFO tune.py:1047 -- Total run time: 409.65 seconds (397.97 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/ray_results/lambda_2023-11-19_17-58-08", trainable=...)
- lambda_32f5e_00004: FileNotFoundError('Could not fetch metrics for lambda_32f5e_00004: both result.json and progress.csv were not found at /root/ray_results/lambda_2023-11-19_17-58-08/lambda_32f5e_00004_4_batch_size=70,dropout=0.4005,epochs=950,hidden_layers=16_32,lr=0.0010_2023-11-19_17-58-19')
- lambda_32f5e_00005: FileNotFoundError('Could not fetch metrics for lambda_32f5e_00005: both result.json and progress.csv were not found at /root/ray_results/lambda_2023-11-19_17-58-08/lambda_32f5e_00005_5_batch_size=54,dropout=0.2057,epochs=990,hidden_layers=16_32_64_16,lr=0.0011_2023-11-19_17-58-20')
- lambda_32f5e_00006: FileNotFoundError('Could not fetch metrics for lambda_32f5e_00006: both result.json and progress.csv were not found at /root/ray_results/lambda_2023-11-19

[36m(<lambda> pid=26784)[0m Loss on train: 0, loss on validation: 0.04228660464286804[32m [repeated 3x across cluster][0m
[36m(<lambda> pid=26784)[0m Epoch: 56[32m [repeated 3x across cluster][0m


KeyboardInterrupt: 

Exception ignored in atexit callback: <function _unregister_all at 0x7f1f0d107d90>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/registry.py", line 171, in _unregister_all
    _unregister_trainables()
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/registry.py", line 116, in _unregister_trainables
    _global_registry.unregister_all(TRAINABLE_CLASS)
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/registry.py", line 251, in unregister_all
    self.unregister(cat, key)
  File "/usr/local/lib/python3.10/dist-packages/ray/tune/registry.py", line 243, in unregister
    _internal_kv_del(_make_key(self.prefix, category, key))
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/experimental/internal_kv.py", line 109, in _internal_kv_del
    return global_gcs_client.internal_kv_del(key, del_by_pre

In [None]:
best_result = results.get_best_result("loss", "min")

for key in best_result.metrics["parameter"]:
    print(f"Best Trail: {key} with {best_result.metrics['parameter'][key]}")

In [None]:
# Checkpoint contains model and optimizer function
best_model, _ = torch.load(model_path / best_result.metrics["run_id"] / "checkpoint.pt")

In [None]:
print(f"Currently best trial defined by loss: {best_result.metrics['run_id']}", file=open(model_path / "best_trail.txt", "w"))

## Evaluation

In [None]:
from utils.visualization import create_trace_animation
from matplotlib import pyplot as plt
from IPython.display import HTML

In [None]:
# Compute eval on the cpu
device = 'cpu'

In [None]:
%matplotlib notebook
 
plt.rcParams["animation.html"] = "jshtml"
plt.rcParams['figure.dpi'] = 150  

### Loading the best model

In [None]:
from utils.file_io import load_model

In [None]:
loss_function = get_loss_function()

In [None]:
model = FullyConnected(input_shape, best_result.metrics["parameter"]["hidden_layers"], output_shape)
model.load_state_dict(best_model)
model.eval()

In [None]:
y, y_true = compute_predictions(test_dataloader, model, device)
test_losses = compute_losses_from(y, y_true, loss_function)
print(f"The mean squared error of the loaded model on test is: {test_losses.mean()}")

In [None]:
animation = create_trace_animation(y.numpy(), y_true.numpy())
HTML(animation.to_jshtml())