In [1]:
import os
import shutil
import pickle
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt

from skopt.space import Real, Categorical, Integer

from bcnf.gp_minimize.gp_minimize import gp_minimize_fixed
from bcnf.simulation.physics import get_data
from bcnf.models.cnf import CondRealNVP
from bcnf.models.feature_network import FullyConnectedFeatureNetwork
from bcnf.eval.crossvalidate import cross_validate
from bcnf.errors import TrainingDivergedError
from bcnf.utils import get_dir

from bcnf.simulation.sampling import generate_data

# Use the first CUDA device when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print(device)

cuda:0


## Setup

In [2]:
# HACK: scikit-optimize is not maintained anymore; patching the `np.int` attribute
# back in is a quick fix to make it work. Background:
# https://github.com/scikit-optimize/scikit-optimize/issues/1171#:~:text=To%20avoid%20this%20error%20in%20existing%20code%2C%20use%20int%20by%20itself.
setattr(np, "int", np.int64)

In [3]:
# The optimization progress is stored in (and loaded from) the Huggingface repository.
# Both locations share the same stage_2 directory, so its path components are spelled out once.
_stage_2_path = ("models", "bcnf-models", "hyperparameter_optimization", "stage_2")

# Pickled optimizer state used to resume the search
checkpoint_file = os.path.join(get_dir(*_stage_2_path, create=True), 'checkpoint_improved.pkl')

# Directory that receives one metrics pickle per evaluated parameter set
metrics_dir = get_dir(*_stage_2_path, "metrics", create=True)

print(os.path.abspath(checkpoint_file), os.path.abspath(metrics_dir), sep="\n")

/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/checkpoint_improved.pkl
/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/metrics


## Data

In [4]:
# Parameter names, grouped by common prefix for readability (order is unchanged)
param_names = [
    'x0_x', 'x0_y', 'x0_z',
    'v0_x', 'v0_y', 'v0_z',
    'g',
    'w_x', 'w_y', 'w_z',
    'b',
    'A',
    'Cd',
    'rho',
    'm',
    'a_x', 'a_y', 'a_z',
    'r',
]

## Hyperparameters

In [7]:
# Arguments passed to the data generator, collected in one place so they are easy to tune
simulation_kwargs = dict(
    T=1.0,
    dt=1 / 30,
    N=2_000,
    break_on_impact=False,
)

X, y = get_data(**simulation_kwargs)

# Collapse every axis of X after the first into a single feature axis: (n_samples, n_features)
X_tensor = torch.Tensor(X.reshape(X.shape[0], -1))
y_tensor = torch.Tensor(y)

# Displayed as the cell output
X_tensor.shape, y_tensor.shape

100%|██████████| 2000/2000 [00:02<00:00, 921.51it/s]


(torch.Size([2000, 90]), torch.Size([2000, 17]))

In [8]:
# Width of the second axis of the targets (y) and of the flattened inputs (X);
# both values are passed on to the model / feature network configuration.
model_size = y_tensor.size(1)
feature_size = X_tensor.size(1)

print(f"Model size: {model_size}")
print(f"Feature size: {feature_size}")

Model size: 17
Feature size: 90


In [9]:
# Fixed training settings (not part of the search); these values have worked well in pilot experiments.

# Keyword arguments forwarded to the optimizer
optimizer_kwargs = dict(lr=2e-4)

# Keyword arguments forwarded to the learning-rate scheduler
lr_scheduler_kwargs = dict(
    mode="min",
    factor=0.5,
    patience=250,
    threshold_mode="abs",
    threshold=1e-1,
)

In [10]:
# Search space of the optimization.
# The insertion order of the keys matters: it fixes the position of every value in the
# parameter list that the optimizer hands to the objective function.
search_spaces = dict(
    # Number of conditions: output width of the feature network and `n_conditions` of the model
    condition_size=Integer(1, 2048),

    # Conditional model
    model_nested_size=Integer(16, 1024),
    model_nested_layers=Integer(1, 8),
    model_n_blocks=Integer(4, 32),
    model_act_norm=Categorical([True, False]),
    model_dropout=Real(0.0, 0.5),

    # Feature network
    feature_network_hidden_size=Integer(16, 256),
    feature_network_hidden_layers=Integer(0, 16),
    feature_network_dropout=Real(0.0, 0.5),
)

## Helper Functions

In [11]:
def param_index(name: str, search_spaces: dict[str, Real | Integer | Categorical]):
    """
    Translate a parameter name into its position within the search space.

    The optimization function receives its parameters as a flat list that follows
    the key order of the search space, so this position is the list index of `name`.

    Parameters
    ----------
    name : str
        The name of the parameter to look up.
    search_spaces : dict[str, Real | Integer | Categorical]
        The search space, keyed by parameter name.

    Returns
    -------
    int
        The zero-based position of `name` among the keys of `search_spaces`.

    Raises
    ------
    ValueError
        If `name` is not a key of `search_spaces`.
    """
    # Iterating a dict yields its keys in insertion order
    ordered_names = [*search_spaces]
    return ordered_names.index(name)

In [12]:
def score_parameters(params: list):
    """
    Objective function of the hyperparameter optimization.

    Builds the model and feature network configuration from `params`, evaluates it with
    `cross_validate`, stores the resulting fold metrics in `metrics_dir` and condenses the
    per-fold validation losses into a single score (lower is better).

    Parameters
    ----------
    params : list
        The parameter values, ordered like the keys of `search_spaces`.

    Returns
    -------
    float
        Mean plus standard deviation of the per-fold validation losses,
        or 100 if a `TrainingDivergedError` was raised.
    """
    print(dict(zip(search_spaces.keys(), params)))

    def value_of(name: str):
        # Address a parameter by name rather than by its position in `params`
        return params[param_index(name, search_spaces)]

    # The condition size is shared: it is the output width of the feature network
    # and the number of conditions of the model
    condition_size = value_of('condition_size')

    model_kwargs = {
        "size": model_size,
        "nested_sizes": [value_of('model_nested_size')] * value_of('model_nested_layers'),
        "n_blocks": value_of('model_n_blocks'),
        "n_conditions": condition_size,
        "act_norm": value_of('model_act_norm'),
        "dropout": value_of('model_dropout'),
    }

    # input width -> (hidden width repeated once per hidden layer) -> condition width
    hidden_sizes = [value_of('feature_network_hidden_size')] * value_of('feature_network_hidden_layers')
    feature_network_kwargs = {
        "sizes": [feature_size, *hidden_sizes, condition_size],
        "dropout": value_of('feature_network_dropout'),
    }

    # Only divergence is caught here: such parameter sets get a bad score instead of
    # aborting the optimization. Any other error still propagates.
    try:
        # Estimate the performance of this configuration with cross-validation
        fold_metrics = cross_validate(
            model_class=CondRealNVP,
            model_kwargs=model_kwargs,
            feature_network_class=FullyConnectedFeatureNetwork,
            feature_network_kwargs=feature_network_kwargs,
            optimizer_class=torch.optim.Adam,
            optimizer_kwargs=optimizer_kwargs,
            lr_scheduler_class=torch.optim.lr_scheduler.ReduceLROnPlateau,
            lr_scheduler_kwargs=lr_scheduler_kwargs,
            X=X_tensor,
            y=y_tensor,
            n_epochs=50_000,
            val_loss_patience=500,
            val_loss_tolerance=1e-1,  # Improvements to treat as significant
            val_loss_tolerance_mode="abs",
            timeout=60 * 60,  # 1 hour
            batch_size=256,
            device=device,
            verbose=True,  # Print the progress
            n_splits=3,
            errors="raise"  # Raise errors to stop the optimization
        )

        # Keep the loss history and metrics for later analysis; the file name encodes the parameter values
        run_tag = "_".join(str(p) for p in params)
        metrics_path = os.path.join(metrics_dir, f'params_{run_tag}.pkl')
        with open(metrics_path, 'wb') as f:
            pickle.dump(fold_metrics, f)

    except TrainingDivergedError as e:
        print(e)
        return 100  # A big number (bad score) to avoid this parameter set

    # Each 'val_loss' entry is a tuple (epoch, loss); only the loss is needed
    val_losses = [fold['val_loss'][1] for fold in fold_metrics]
    mean_loss = np.mean(val_losses)
    std_loss = np.std(val_losses)
    print(f'Val Loss: {mean_loss:.4f} ± {std_loss:.4f}')

    # Score with mean + std (an upper bound on the loss) so that the optimization
    # favours parameter sets that are good AND reliable across folds
    return mean_loss + std_loss

## Optimization

In [13]:
# Total budget: number of iterations to run the optimization for, including the initial ones
N_STEPS = 100

# Number of random initial points used to explore the search space
N_STEPS_INIT = 30

In [14]:
# Resume from a saved checkpoint when one exists, otherwise start from scratch
if not os.path.exists(checkpoint_file):
    print('No checkpoint found. Starting new optimization')
    checkpoint = None

    # Nothing has been evaluated yet, so the full budget is available
    n_initial_points = N_STEPS_INIT
    n_calls_remaining = N_STEPS

    x0 = None
    y0 = None
else:
    print(f'Loading checkpoint from {checkpoint_file}')

    # NOTE(review): unpickling executes code embedded in the file - only load checkpoints from a trusted source
    with open(checkpoint_file, 'rb') as f:
        checkpoint = pickle.load(f)

    # The objective function is not picklable, so it is re-attached after loading
    checkpoint['specs']['args']['func'] = score_parameters

    # Points evaluated so far and their scores; handed back to the optimizer as x0 / y0
    x0 = checkpoint.x_iters
    print(f'Resuming from iteration {len(x0)}')

    y0 = checkpoint.func_vals
    print(f'{len(x0)} iterations so far with {len(y0)} evaluations (minimum: {np.min(y0):.4f})')

    # Only spend what is left of the budget (never a negative amount)
    n_initial_points = max(0, N_STEPS_INIT - len(x0))
    n_calls_remaining = max(0, N_STEPS - len(y0))

print(f'Running with {n_initial_points} initial points and {n_calls_remaining} remaining iterations')

Loading checkpoint from /home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/checkpoint_improved.pkl
Resuming from iteration 96
96 iterations so far with 96 evaluations (minimum: -61.2413)
Running with 0 initial points and 4 remaining iterations


In [15]:
# Run (or resume) the optimization.
# You might want to adjust n_calls or other parameters based on the checkpoint.
result = gp_minimize_fixed(
    # Objective and search space
    func=score_parameters,
    dimensions=search_spaces.values(),

    # Remaining budget
    n_initial_points=n_initial_points,  # Number of random points before starting the optimization
    n_calls=n_calls_remaining,  # Number of iterations

    # Points and scores from the checkpoint (None for a fresh run)
    x0=x0,
    y0=y0,

    # Search strategy
    acq_func="EI",  # Expected Improvement https://arxiv.org/pdf/1009.5419.pdf
    initial_point_generator="halton",  # https://en.wikipedia.org/wiki/Halton_sequence

    # Reproducibility, logging and persistence
    random_state=2024_03_25,
    verbose=True,
    checkpoint_file=checkpoint_file,
)

{'args': {'func': <function score_parameters at 0x7f50c9f04860>, 'dimensions': Space([Integer(low=1, high=2048, prior='uniform', transform='normalize'),
       Integer(low=16, high=1024, prior='uniform', transform='normalize'),
       Integer(low=1, high=8, prior='uniform', transform='normalize'),
       Integer(low=4, high=32, prior='uniform', transform='normalize'),
       Categorical(categories=(True, False), prior=None),
       Real(low=0.0, high=0.5, prior='uniform', transform='normalize'),
       Integer(low=16, high=256, prior='uniform', transform='normalize'),
       Integer(low=0, high=16, prior='uniform', transform='normalize'),
       Real(low=0.0, high=0.5, prior='uniform', transform='normalize')]), 'base_estimator': GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1, 1, 1], nu=2.5),
                         n_restarts_optimizer=2, noise='gaussian',
                         normalize_y=True, random_state=77390924), 'n_calls': 4, 'n_random_starts

Train: -47.2792 - Val: -43.3270 (avg: -43.3135, min: -43.2547) | lr: 3.91e-07 - Patience: 500/500:  22%|██▏       | 10931/50000 [47:25<2:49:29,  3.84it/s]
Train: -42.1494 - Val: -41.0040 (avg: -41.0441, min: -40.9894) | lr: 3.91e-07 - Patience: 500/500:  22%|██▏       | 10828/50000 [43:08<2:36:03,  4.18it/s]
Train: -43.4340 - Val: -42.2517 (avg: -42.2525, min: -42.1954) | lr: 3.91e-07 - Patience: 500/500:  20%|█▉        | 9989/50000 [40:21<2:41:38,  4.13it/s]


Val Loss: -41.9571 ± 0.7578
Saving checkpoint with 97 iterations and 97 function evaluations (minimum: -61.24128302096609).
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 7859.8613
Function value obtained: -41.1993
Current minimum: -61.2413
Iteration No: 2 started. Searching for the next optimal point.
Asking optimizer for next point 2: [276, 16, 6, 23, True, 0.2615834182531671, 16, 16, 0.25317236071179605]
{'condition_size': 276, 'model_nested_size': 16, 'model_nested_layers': 6, 'model_n_blocks': 23, 'model_act_norm': True, 'model_dropout': 0.2615834182531671, 'feature_network_hidden_size': 16, 'feature_network_hidden_layers': 16, 'feature_network_dropout': 0.25317236071179605}


Train: -44.7825 - Val: -45.5008 (avg: -45.4926, min: -45.4508) | lr: 7.81e-07 - Patience: 500/500:  22%|██▏       | 10879/50000 [34:25<2:03:46,  5.27it/s]
Train: -42.5668 - Val: -43.8528 (avg: -43.8563, min: -43.8027) | lr: 7.81e-07 - Patience: 500/500:  24%|██▎       | 11784/50000 [37:11<2:00:37,  5.28it/s]
Train: -45.3430 - Val: -46.4052 (avg: -46.4350, min: -46.3414) | lr: 1.56e-06 - Patience: 500/500:  24%|██▍       | 12002/50000 [37:56<2:00:06,  5.27it/s]


Val Loss: -45.2195 ± 0.9774
Saving checkpoint with 98 iterations and 98 function evaluations (minimum: -61.24128302096609).
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 6574.0262
Function value obtained: -44.2421
Current minimum: -61.2413
Iteration No: 3 started. Searching for the next optimal point.
Asking optimizer for next point 3: [477, 16, 7, 23, True, 0.3822500471536552, 150, 1, 0.1872762283326637]
{'condition_size': 477, 'model_nested_size': 16, 'model_nested_layers': 7, 'model_n_blocks': 23, 'model_act_norm': True, 'model_dropout': 0.3822500471536552, 'feature_network_hidden_size': 150, 'feature_network_hidden_layers': 1, 'feature_network_dropout': 0.1872762283326637}


Train: -55.8146 - Val: -59.6296 (avg: -59.6467, min: -59.5852) | lr: 7.81e-07 - Patience: 500/500:  26%|██▌       | 12779/50000 [41:35<2:01:08,  5.12it/s]
Train: -52.4885 - Val: -56.6940 (avg: -56.7033, min: -56.6163) | lr: 1.56e-06 - Patience: 500/500:  24%|██▍       | 11898/50000 [38:49<2:04:19,  5.11it/s]
Train: -56.3357 - Val: -60.0986 (avg: -60.0981, min: -60.0512) | lr: 7.81e-07 - Patience: 500/500:  27%|██▋       | 13465/50000 [44:02<1:59:29,  5.10it/s]


Val Loss: -58.7654 ± 1.4490
Saving checkpoint with 99 iterations and 99 function evaluations (minimum: -61.24128302096609).
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 7467.8669
Function value obtained: -57.3163
Current minimum: -61.2413
Iteration No: 4 started. Searching for the next optimal point.
Asking optimizer for next point 4: [1861, 16, 7, 26, True, 0.4218244357158045, 160, 0, 0.29820783345367463]
{'condition_size': 1861, 'model_nested_size': 16, 'model_nested_layers': 7, 'model_n_blocks': 26, 'model_act_norm': True, 'model_dropout': 0.4218244357158045, 'feature_network_hidden_size': 160, 'feature_network_hidden_layers': 0, 'feature_network_dropout': 0.29820783345367463}


Train: -54.1409 - Val: -57.7781 (avg: -57.7640, min: -57.6936) | lr: 7.81e-07 - Patience: 500/500:  26%|██▌       | 12944/50000 [47:08<2:14:57,  4.58it/s]  
Train: -44.8185 - Val: -47.3377 (avg: -47.1097, min: -47.9806) | lr: 1.00e-04 - Patience: 500/500:  12%|█▏        | 6247/50000 [22:35<2:38:16,  4.61it/s]
Train: -53.1771 - Val: -56.4316 (avg: -56.4392, min: -56.3440) | lr: 1.56e-06 - Patience: 500/500:  24%|██▍       | 12041/50000 [43:36<2:17:29,  4.60it/s]


Val Loss: -53.8541 ± 4.6576
Saving checkpoint with 100 iterations and 100 function evaluations (minimum: -61.24128302096609).
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 6802.3130
Function value obtained: -49.1966
Current minimum: -61.2413
