In [1]:
import os
import shutil
import pickle
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt

from skopt.space import Real, Categorical, Integer

from bcnf.gp_minimize.gp_minimize import gp_minimize_fixed
from bcnf.simulation.physics import get_data
from bcnf.models.cnf import CondRealNVP
from bcnf.models.feature_network import FullyConnectedFeatureNetwork
from bcnf.eval.crossvalidate import cross_validate
from bcnf.errors import TrainingDivergedError
from bcnf.utils import get_dir

from bcnf.simulation.sampling import generate_data

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Setup

In [2]:
# HACK: scikit-optimize is not maintained anymore; this re-creates the `np.int` alias as a quick fix to make it work
# (presumably skopt still references `np.int`, which newer NumPy releases dropped - see the linked issue)
# https://github.com/scikit-optimize/scikit-optimize/issues/1171#:~:text=To%20avoid%20this%20error%20in%20existing%20code%2C%20use%20int%20by%20itself.
np.int = np.int64

In [3]:
# Store and load the progress in the Huggingface repository
# - checkpoint_file: pickled optimizer result, written by the `save_checkpoint` callback
# - metrics_dir: one pickle of cross-validation metrics per evaluated parameter set (see `score_parameters`)
# `create=True` is passed to `get_dir` for both directories (presumably creates them if missing - confirm)
checkpoint_file = os.path.join(get_dir("models", "bcnf-models", "hyperparameter_optimization", "stage_2", create=True), 'checkpoint_improved.pkl')
metrics_dir = get_dir("models", "bcnf-models", "hyperparameter_optimization", "stage_2", "metrics", create=True)

# Print the resolved absolute locations so the run log records where results are written
print(os.path.abspath(checkpoint_file))
print(os.path.abspath(metrics_dir))

/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/checkpoint_improved.pkl
/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/metrics


## Data

In [4]:
# Names of the simulation parameters, used to assemble `y` in the (commented-out) dataset cells below
# NOTE(review): this list has 19 entries while the `y_tensor` produced by `get_data` has 17 columns,
# so it does not label the targets actually used for training - confirm before relying on it
param_names = ['x0_x','x0_y','x0_z','v0_x','v0_y','v0_z','g','w_x','w_y','w_z','b','A','Cd','rho','m','a_x','a_y','a_z','r']

In [5]:
# dataset_name = "hyperparameter_optimization_trajectories"

# if not os.path.exists(os.path.join(get_dir('data', 'bcnf-data'), dataset_name + '.pkl')):
#     data = generate_data(
#         n=2000,
#         type="trajectory",
#         SPF=1/30,
#         T=3,
#         config_file=os.path.join(get_dir("configs"), "throw_upwards.yaml"),
#         verbose=True,
#         break_on_impact=False,
#         name=dataset_name
#     )
# else:
#     with open(os.path.join(get_dir('data', 'bcnf-data'), dataset_name + '.pkl'), 'rb') as f:
#         data = pickle.load(f)

In [6]:
# X = np.array(data['traj'])
# y = np.column_stack([np.array(data[param]) for param in param_names])


# X_tensor = torch.Tensor(X.reshape(X.shape[0], -1))
# y_tensor = torch.Tensor(y)

# X_tensor.shape, y_tensor.shape

## Hyperparameters

In [None]:
# Generate the training data with the project's `get_data` helper
# NOTE(review): no random seed is set before this call - if `get_data` samples randomly,
# the dataset (and therefore every score below) differs between runs; confirm and seed if needed
X, y = get_data(
    T=1.0,
    dt=1 / 30,
    N=2_000,
    break_on_impact=False
)

# Flatten everything after the first (sample) axis so that each sample becomes one feature vector
X_tensor = torch.Tensor(X.reshape(X.shape[0], -1))
y_tensor = torch.Tensor(y)

# Show both shapes as the cell output
X_tensor.shape, y_tensor.shape

100%|██████████| 2000/2000 [00:02<00:00, 904.24it/s]


(torch.Size([2000, 90]), torch.Size([2000, 17]))

In [8]:
# Second-axis widths: targets per sample (size of the flow) and flattened features per sample
model_size = y_tensor.size(1)
feature_size = X_tensor.size(1)

print(f"Model size: {model_size}\nFeature size: {feature_size}")

Model size: 17
Feature size: 90


In [9]:
# These parameters have worked well in pilot experiments
# Keyword arguments forwarded to the optimizer class (torch.optim.Adam in `score_parameters`)
optimizer_kwargs = dict(lr=2e-4)

# Keyword arguments forwarded to the LR scheduler class (ReduceLROnPlateau in `score_parameters`)
lr_scheduler_kwargs = dict(
    mode="min",
    factor=0.5,
    patience=250,
    threshold_mode="abs",
    threshold=1e-1,
)

In [10]:
# Define the search space
# NOTE: the insertion order of this dict is load-bearing - `param_index` and the
# `dimensions=search_spaces.values()` argument of the optimizer both rely on it
search_spaces = {
    'condition_size': Integer(1, 2048),  # passed as `n_conditions` of the flow and as the last feature-network size
    'model_nested_size': Integer(16, 1024),  # repeated `model_nested_layers` times to build `nested_sizes`
    'model_nested_layers': Integer(1, 8),
    'model_n_blocks': Integer(4, 32),  # passed as `n_blocks` of the flow
    'model_act_norm': Categorical([True, False]),  # passed as `act_norm` of the flow
    'model_dropout': Real(0.0, 0.5),
    'feature_network_hidden_size': Integer(16, 256),  # repeated `feature_network_hidden_layers` times
    'feature_network_hidden_layers': Integer(0, 16),  # 0 => sizes are just [feature_size, condition_size]
    'feature_network_dropout': Real(0.0, 0.5),
}

## Helper Functions

In [11]:
def param_index(name: str, search_spaces: "dict[str, Real | Integer | Categorical]") -> int:
    """
    Get the index of a parameter in the search space to match the order of the parameters in the optimization function.

    The index is the position of `name` in the insertion order of `search_spaces`,
    i.e. the position of its value in the parameter list handed to the objective.

    Parameters
    ----------
    name : str
        The name of the parameter.
    search_spaces : dict[str, Real | Integer | Categorical]
        The search space.

    Returns
    -------
    int
        The index of the parameter in the search space.

    Raises
    ------
    ValueError
        If `name` is not a key of `search_spaces`.
    """
    # Walk the keys directly instead of materialising a list on every lookup
    for index, key in enumerate(search_spaces):
        if key == name:
            return index

    # Same exception type as `list.index`, but with a message that points at the typo
    raise ValueError(f"Unknown parameter {name!r}; expected one of {list(search_spaces)}")

In [12]:
def score_parameters(params: list):
    """
    Objective of the hyperparameter search: cross-validate one parameter set and score it.

    Parameters
    ----------
    params : list
        One value per dimension, in the key order of `search_spaces`.

    Returns
    -------
    float or int
        Mean plus standard deviation of the per-fold validation loss,
        or 100 if training diverged.
    """
    # Pair every sampled value with the name of its dimension
    hp = dict(zip(search_spaces.keys(), params))
    print(hp)

    condition_size = hp['condition_size']
    coupling_sizes = [hp['model_nested_size']] * hp['model_nested_layers']
    hidden_sizes = [hp['feature_network_hidden_size']] * hp['feature_network_hidden_layers']

    model_kwargs = {
        "size": model_size,
        "nested_sizes": coupling_sizes,
        "n_blocks": hp['model_n_blocks'],
        "n_conditions": condition_size,
        "act_norm": hp['model_act_norm'],
        "dropout": hp['model_dropout'],
    }

    # The feature network maps the flattened input to the condition vector of the flow
    feature_network_kwargs = {
        "sizes": [feature_size] + hidden_sizes + [condition_size],
        "dropout": hp['feature_network_dropout'],
    }

    # One metrics file per parameter set, named after the raw parameter values
    metrics_name = "_".join(str(value) for value in params)
    metrics_path = os.path.join(metrics_dir, f'params_{metrics_name}.pkl')

    # Divergence is treated as a bad score rather than a fatal error, so the search keeps going
    try:
        # Estimate the performance of this parameter set with cross-validation
        fold_metrics = cross_validate(
            model_class=CondRealNVP,
            model_kwargs=model_kwargs,
            feature_network_class=FullyConnectedFeatureNetwork,
            feature_network_kwargs=feature_network_kwargs,
            optimizer_class=torch.optim.Adam,
            optimizer_kwargs=optimizer_kwargs,
            lr_scheduler_class=torch.optim.lr_scheduler.ReduceLROnPlateau,
            lr_scheduler_kwargs=lr_scheduler_kwargs,
            X=X_tensor,
            y=y_tensor,
            n_epochs=50_000,
            val_loss_patience=500,
            val_loss_tolerance=1e-1,  # Improvements to treat as significant
            val_loss_tolerance_mode="abs",
            timeout=60 * 60,  # 1 hour
            batch_size=256,
            device=device,
            verbose=True,  # Print the progress
            n_splits=3,
            errors="raise"  # Raise errors to stop the optimization
        )

        # Keep the loss history and metrics of this run for later analysis
        with open(metrics_path, 'wb') as metrics_file:
            pickle.dump(fold_metrics, metrics_file)

    except TrainingDivergedError as error:
        print(error)
        return 100  # A big number (bad score) to avoid this parameter set

    # Each val_loss entry is a tuple (epoch, loss); only the loss is scored
    fold_losses = [fold['val_loss'][1] for fold in fold_metrics]
    loss_mean, loss_std = np.mean(fold_losses), np.std(fold_losses)
    print(f'Val Loss: {loss_mean:.4f} ± {loss_std:.4f}')

    # Upper confidence bound: favours parameter sets that are both good AND consistent across folds
    return loss_mean + loss_std

In [13]:
def save_checkpoint(result):
    """
    Save the result of the optimization to a checkpoint file.
    Used as a callback in the optimization function.

    The result is first written to `checkpoint_file + ".tmp"` and then swapped into
    place with `os.replace`, so an interrupted write never leaves a truncated checkpoint.

    Parameters
    ----------
    result : OptimizeResult
        The result of the optimization. It is not modified; a deep copy is pickled.
    """
    tmp_file = checkpoint_file + ".tmp"

    # Ignore
    # - result['specs']['args']['func']
    # - result['specs']['args']['callback']
    # because it causes problems when reading somewhere else
    # Work on a copy so the optimizer's own result object stays intact
    result_no_func = copy.deepcopy(result)
    for key in ('func', 'callback'):
        # `pop` with a default tolerates a result from which the entry was already removed
        result_no_func['specs']['args'].pop(key, None)

    # Safely write the result to a temporary file first without overwriting the checkpoint file
    with open(tmp_file, 'wb') as f:
        pickle.dump(result_no_func, f)

    # Atomically replace the old checkpoint. Unlike `shutil.move`, `os.replace` never falls back
    # to copy-and-delete, so the checkpoint is always either the old or the new version.
    os.replace(tmp_file, checkpoint_file)

## Optimization

In [14]:
# Number of random initial points to explore the search space
N_STEPS_INIT = 30

# Number of iterations to run the optimization in total
# (the budget from which already evaluated points are subtracted when resuming)
N_STEPS = 100

In [15]:
# Resume from a previous run when a checkpoint exists, otherwise start from scratch
if not os.path.exists(checkpoint_file):
    print('No checkpoint found. Starting new optimization')
    checkpoint = None

    # Fresh run: full budget and no earlier evaluations to hand to the optimizer
    n_initial_points, n_calls_remaining = N_STEPS_INIT, N_STEPS
    x0, y0 = None, None
else:
    print(f'Loading checkpoint from {checkpoint_file}')

    # NOTE: unpickling can execute arbitrary code - only load checkpoints written by this notebook
    with open(checkpoint_file, 'rb') as f:
        checkpoint = pickle.load(f)

    # `save_checkpoint` strips the function and the callback before pickling, so put them back
    checkpoint['specs']['args']['func'] = score_parameters
    checkpoint['specs']['args']['callback'] = save_checkpoint

    # Earlier evaluations are handed to the optimizer so they are not recomputed
    x0, y0 = checkpoint.x_iters, checkpoint.func_vals
    print(f'Resuming from iteration {len(x0)}')

    # Only spend what is left of the initial-point and total budgets
    n_initial_points = max(0, N_STEPS_INIT - len(x0))
    n_calls_remaining = max(0, N_STEPS - len(x0))

print(f'Running with {n_initial_points} initial points and {n_calls_remaining} remaining iterations')

No checkpoint found. Starting new optimization
Running with 30 initial points and 100 remaining iterations


In [16]:
# Run (or resume) the Bayesian optimization; `save_checkpoint` is passed as the callback
# You might want to adjust the n_calls or other parameters based on the checkpoint
# NOTE(review): the Halton sequence is deterministic, so on resume the remaining initial points
# may repeat points that were already evaluated - confirm against `gp_minimize_fixed`
result = gp_minimize_fixed(
    func=score_parameters,
    dimensions=search_spaces.values(),  # order must match the key order used by `param_index`
    n_initial_points=n_initial_points,  # Number of initial points before the surrogate-guided search starts
    n_calls=n_calls_remaining,  # Number of iterations
    random_state=2024_03_25,
    verbose=True,
    acq_func="EI",  # Expected Improvement https://arxiv.org/pdf/1009.5419.pdf
    initial_point_generator="halton", # https://en.wikipedia.org/wiki/Halton_sequence
    callback=save_checkpoint,
    x0=x0,  # previously evaluated points (None on a fresh run)
    y0=y0)  # their objective values (None on a fresh run)

30 initial points will be randomly generated
Iteration No: 1 started. Evaluating function at random point.
{'condition_size': 1, 'model_nested_size': 16, 'model_nested_layers': 1, 'model_n_blocks': 4, 'model_act_norm': False, 'model_dropout': 0.0, 'feature_network_hidden_size': 16, 'feature_network_hidden_layers': 0, 'feature_network_dropout': 0.0}


Train: -8.1279 - Val: -7.9453 (avg: -7.9333, min: -7.8651) | lr: 6.25e-06 - Patience: 500/500:  15%|█▍        | 7449/50000 [02:53<16:32, 42.88it/s] 
Train: -11.9331 - Val: -10.6863 (avg: -10.6912, min: -10.6218) | lr: 6.25e-06 - Patience: 500/500:  13%|█▎        | 6259/50000 [02:27<17:10, 42.43it/s]
Train: -11.7297 - Val: -11.1704 (avg: -11.1647, min: -11.0659) | lr: 6.25e-06 - Patience: 500/500:  20%|█▉        | 9823/50000 [03:54<15:58, 41.93it/s]


Val Loss: -10.3427 ± 1.5979
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 556.0341
Function value obtained: -8.7448
Current minimum: -8.7448
Iteration No: 2 started. Evaluating function at random point.
{'condition_size': 1024, 'model_nested_size': 352, 'model_nested_layers': 2, 'model_n_blocks': 8, 'model_act_norm': False, 'model_dropout': 0.038461538461538464, 'feature_network_hidden_size': 30, 'feature_network_hidden_layers': 1, 'feature_network_dropout': 0.021739130434782608}


Train: -30.4528 - Val: -22.7752 (avg: -22.9189, min: -23.6763) | lr: 5.00e-05 - Patience: 500/500:   4%|▎         | 1818/50000 [01:17<34:04, 23.57it/s]
Train: -30.0309 - Val: -21.5282 (avg: -21.6643, min: -22.8815) | lr: 5.00e-05 - Patience: 500/500:   3%|▎         | 1678/50000 [01:13<35:18, 22.81it/s]
Train: -29.5956 - Val: -22.0164 (avg: -21.8390, min: -22.7756) | lr: 5.00e-05 - Patience: 500/500:   4%|▎         | 1820/50000 [01:16<33:53, 23.70it/s]


Val Loss: -23.3258 ± 0.3389
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 227.5756
Function value obtained: -22.9869
Current minimum: -22.9869
Iteration No: 3 started. Evaluating function at random point.
{'condition_size': 513, 'model_nested_size': 688, 'model_nested_layers': 4, 'model_n_blocks': 12, 'model_act_norm': False, 'model_dropout': 0.07692307692307693, 'feature_network_hidden_size': 44, 'feature_network_hidden_layers': 2, 'feature_network_dropout': 0.043478260869565216}


Train: -27.7142 - Val: -20.7422 (avg: -21.2111, min: -24.5751) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1363/50000 [01:49<1:04:56, 12.48it/s]
Train: -27.0091 - Val: -18.9585 (avg: -19.7457, min: -23.4283) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1299/50000 [01:44<1:05:05, 12.47it/s]
Train: -26.2960 - Val: -19.9507 (avg: -19.6917, min: -21.8513) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1111/50000 [01:28<1:05:04, 12.52it/s]


Val Loss: -20.2697 ± 1.0808
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 302.4071
Function value obtained: -19.1889
Current minimum: -22.9869
Iteration No: 4 started. Evaluating function at random point.
{'condition_size': 1536, 'model_nested_size': 128, 'model_nested_layers': 5, 'model_n_blocks': 16, 'model_act_norm': False, 'model_dropout': 0.11538461538461539, 'feature_network_hidden_size': 58, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.06521739130434782}


Train: -27.2286 - Val: -28.9385 (avg: -28.9286, min: -28.8572) | lr: 3.13e-06 - Patience: 500/500:  11%|█         | 5440/50000 [09:27<1:17:31,  9.58it/s]
Train: -27.2377 - Val: -27.0291 (avg: -27.0222, min: -26.9902) | lr: 3.13e-06 - Patience: 500/500:  11%|█         | 5601/50000 [09:46<1:17:30,  9.55it/s]
Train: -27.3872 - Val: -28.8222 (avg: -28.8237, min: -28.7427) | lr: 3.13e-06 - Patience: 500/500:  11%|█▏        | 5713/50000 [09:58<1:17:21,  9.54it/s]


Val Loss: -28.2112 ± 0.8029
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1753.4937
Function value obtained: -27.4083
Current minimum: -27.4083
Iteration No: 5 started. Evaluating function at random point.
{'condition_size': 257, 'model_nested_size': 464, 'model_nested_layers': 7, 'model_n_blocks': 20, 'model_act_norm': False, 'model_dropout': 0.15384615384615385, 'feature_network_hidden_size': 72, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.08695652173913043}


Train: -26.3160 - Val: -24.0128 (avg: -24.0175, min: -24.7903) | lr: 5.00e-05 - Patience: 500/500:   4%|▍         | 2114/50000 [05:42<2:09:26,  6.17it/s]
Train: -27.1233 - Val: -24.6118 (avg: -24.8269, min: -26.0091) | lr: 1.00e-04 - Patience: 500/500:   4%|▍         | 2200/50000 [05:55<2:08:35,  6.20it/s]
Train: -25.1198 - Val: -20.4690 (avg: -20.7521, min: -24.0811) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1516/50000 [04:06<2:11:08,  6.16it/s]


Val Loss: -23.1397 ± 1.3281
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 944.4177
Function value obtained: -21.8115
Current minimum: -27.4083
Iteration No: 6 started. Evaluating function at random point.
{'condition_size': 1280, 'model_nested_size': 800, 'model_nested_layers': 1, 'model_n_blocks': 24, 'model_act_norm': False, 'model_dropout': 0.19230769230769232, 'feature_network_hidden_size': 87, 'feature_network_hidden_layers': 4, 'feature_network_dropout': 0.10869565217391304}


Train: -30.3349 - Val: -34.2839 (avg: -34.2745, min: -34.2144) | lr: 7.81e-07 - Patience: 500/500:  15%|█▍        | 7474/50000 [11:55<1:07:52, 10.44it/s]
Train: -30.4466 - Val: -33.1558 (avg: -33.1489, min: -33.0698) | lr: 7.81e-07 - Patience: 500/500:  15%|█▌        | 7741/50000 [12:10<1:06:27, 10.60it/s]
Train: -31.6858 - Val: -34.5761 (avg: -34.5789, min: -34.5095) | lr: 7.81e-07 - Patience: 500/500:  17%|█▋        | 8443/50000 [13:18<1:05:29, 10.58it/s]


Val Loss: -33.9073 ± 0.4592
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2244.8903
Function value obtained: -33.4480
Current minimum: -33.4480
Iteration No: 7 started. Evaluating function at random point.
{'condition_size': 769, 'model_nested_size': 240, 'model_nested_layers': 3, 'model_n_blocks': 28, 'model_act_norm': True, 'model_dropout': 0.23076923076923078, 'feature_network_hidden_size': 101, 'feature_network_hidden_layers': 5, 'feature_network_dropout': 0.13043478260869565}


Train: -57.2983 - Val: -57.7589 (avg: -57.7600, min: -57.7159) | lr: 3.91e-07 - Patience: 500/500:  24%|██▍       | 12178/50000 [32:56<1:42:17,  6.16it/s]
Train: -54.7351 - Val: -58.0184 (avg: -58.0308, min: -57.9495) | lr: 7.81e-07 - Patience: 500/500:  22%|██▏       | 10846/50000 [29:22<1:46:04,  6.15it/s]
Train: -54.2234 - Val: -58.2519 (avg: -58.2565, min: -58.2135) | lr: 3.91e-07 - Patience: 500/500:  21%|██▏       | 10663/50000 [28:58<1:46:53,  6.13it/s]


Val Loss: -57.9883 ± 0.2943
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 5477.6443
Function value obtained: -57.6940
Current minimum: -57.6940
Iteration No: 8 started. Evaluating function at random point.
{'condition_size': 1792, 'model_nested_size': 576, 'model_nested_layers': 4, 'model_n_blocks': 5, 'model_act_norm': True, 'model_dropout': 0.2692307692307692, 'feature_network_hidden_size': 115, 'feature_network_hidden_layers': 6, 'feature_network_dropout': 0.15217391304347827}


Train: -39.9542 - Val: -37.7695 (avg: -37.7909, min: -37.8182) | lr: 6.25e-06 - Patience: 500/500:  13%|█▎        | 6553/50000 [05:03<33:32, 21.59it/s]
Train: -34.0059 - Val: -30.6364 (avg: -30.6144, min: -30.5851) | lr: 2.50e-05 - Patience: 500/500:   8%|▊         | 3976/50000 [03:03<35:27, 21.63it/s]
Train: -34.6529 - Val: -32.9069 (avg: -33.0076, min: -33.0596) | lr: 2.50e-05 - Patience: 500/500:   9%|▉         | 4621/50000 [03:33<34:52, 21.69it/s]


Val Loss: -33.8726 ± 3.2048
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 700.5880
Function value obtained: -30.6678
Current minimum: -57.6940
Iteration No: 9 started. Evaluating function at random point.
{'condition_size': 129, 'model_nested_size': 912, 'model_nested_layers': 5, 'model_n_blocks': 9, 'model_act_norm': True, 'model_dropout': 0.3076923076923077, 'feature_network_hidden_size': 129, 'feature_network_hidden_layers': 7, 'feature_network_dropout': 0.17391304347826086}


Train: -25.2282 - Val: -17.6466 (avg: -17.4276, min: -18.8464) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1377/50000 [02:00<1:10:44, 11.45it/s]
Train: -24.7016 - Val: -15.4868 (avg: -15.2698, min: -17.8041) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1305/50000 [01:53<1:10:39, 11.49it/s]
Train: -26.1677 - Val: -16.5425 (avg: -16.3223, min: -19.5055) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1483/50000 [02:09<1:10:25, 11.48it/s]


Val Loss: -17.0287 ± 0.7955
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 363.3448
Function value obtained: -16.2332
Current minimum: -57.6940
Iteration No: 10 started. Evaluating function at random point.
{'condition_size': 1152, 'model_nested_size': 53, 'model_nested_layers': 7, 'model_n_blocks': 13, 'model_act_norm': True, 'model_dropout': 0.34615384615384615, 'feature_network_hidden_size': 143, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.1956521739130435}


Train: -46.7292 - Val: -48.6183 (avg: -48.6098, min: -48.5625) | lr: 1.56e-06 - Patience: 500/500:  31%|███       | 15446/50000 [31:06<1:09:34,  8.28it/s]
Train: -46.8457 - Val: -49.8320 (avg: -49.8142, min: -49.7413) | lr: 1.56e-06 - Patience: 500/500:  27%|██▋       | 13704/50000 [28:28<1:15:26,  8.02it/s]
Train: -46.6647 - Val: -49.4433 (avg: -49.6204, min: -49.5503) | lr: 3.13e-06 - Patience: 500/500:  26%|██▌       | 13011/50000 [26:11<1:14:27,  8.28it/s]


Val Loss: -49.2645 ± 0.4774
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 5146.8657
Function value obtained: -48.7871
Current minimum: -57.6940
Iteration No: 11 started. Evaluating function at random point.
{'condition_size': 641, 'model_nested_size': 389, 'model_nested_layers': 2, 'model_n_blocks': 17, 'model_act_norm': True, 'model_dropout': 0.38461538461538464, 'feature_network_hidden_size': 157, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.21739130434782608}


Train: -50.8368 - Val: -54.8813 (avg: -54.8727, min: -54.7762) | lr: 7.81e-07 - Patience: 500/500:  25%|██▍       | 12311/50000 [19:20<59:13, 10.61it/s]  
Train: -45.6149 - Val: -48.6386 (avg: -48.6330, min: -48.5568) | lr: 7.81e-07 - Patience: 500/500:  22%|██▏       | 10773/50000 [16:53<1:01:30, 10.63it/s]
Train: -50.1827 - Val: -53.8109 (avg: -53.7856, min: -53.7008) | lr: 7.81e-07 - Patience: 500/500:  27%|██▋       | 13450/50000 [21:04<57:17, 10.63it/s]  


Val Loss: -52.4097 ± 2.7657
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 3439.3859
Function value obtained: -49.6440
Current minimum: -57.6940
Iteration No: 12 started. Evaluating function at random point.
{'condition_size': 1664, 'model_nested_size': 725, 'model_nested_layers': 3, 'model_n_blocks': 21, 'model_act_norm': False, 'model_dropout': 0.4230769230769231, 'feature_network_hidden_size': 171, 'feature_network_hidden_layers': 9, 'feature_network_dropout': 0.2391304347826087}


Train: -24.5408 - Val: -27.1470 (avg: -27.1392, min: -27.0515) | lr: 1.56e-06 - Patience: 500/500:  15%|█▍        | 7486/50000 [15:38<1:28:51,  7.97it/s]
Train: -25.7158 - Val: -26.6356 (avg: -26.6350, min: -26.5420) | lr: 1.56e-06 - Patience: 500/500:  15%|█▌        | 7518/50000 [15:49<1:29:23,  7.92it/s]
Train: -23.6932 - Val: -24.7363 (avg: -24.7366, min: -24.6457) | lr: 1.56e-06 - Patience: 500/500:  14%|█▎        | 6782/50000 [14:16<1:31:00,  7.91it/s]


Val Loss: -26.2035 ± 0.9065
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 2745.3961
Function value obtained: -25.2970
Current minimum: -57.6940
Iteration No: 13 started. Evaluating function at random point.
{'condition_size': 385, 'model_nested_size': 165, 'model_nested_layers': 4, 'model_n_blocks': 25, 'model_act_norm': False, 'model_dropout': 0.46153846153846156, 'feature_network_hidden_size': 185, 'feature_network_hidden_layers': 10, 'feature_network_dropout': 0.2608695652173913}


Train: -20.8666 - Val: -24.3413 (avg: -24.3298, min: -24.2795) | lr: 1.56e-06 - Patience: 500/500:  18%|█▊        | 9171/50000 [21:34<1:36:01,  7.09it/s]
Train: -21.9046 - Val: -25.7786 (avg: -25.7831, min: -25.6991) | lr: 3.13e-06 - Patience: 500/500:  16%|█▋        | 8227/50000 [19:27<1:38:45,  7.05it/s]
Train: -21.5863 - Val: -24.9608 (avg: -24.9459, min: -24.8881) | lr: 3.13e-06 - Patience: 500/500:  16%|█▌        | 7794/50000 [18:26<1:39:52,  7.04it/s]


Val Loss: -25.0544 ± 0.5161
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 3567.9781
Function value obtained: -24.5382
Current minimum: -57.6940
Iteration No: 14 started. Evaluating function at random point.
{'condition_size': 1408, 'model_nested_size': 501, 'model_nested_layers': 6, 'model_n_blocks': 29, 'model_act_norm': False, 'model_dropout': 0.0029585798816568047, 'feature_network_hidden_size': 200, 'feature_network_hidden_layers': 11, 'feature_network_dropout': 0.2826086956521739}


Train: -22.1335 - Val: -6.6861 (avg: -6.4225, min: -17.7826) | lr: 5.00e-05 - Patience: 500/500:   4%|▍         | 2162/50000 [07:50<2:53:22,  4.60it/s]  
Train: -21.2658 - Val: -15.6915 (avg: -15.7315, min: -17.7148) | lr: 5.00e-05 - Patience: 500/500:   5%|▍         | 2349/50000 [08:31<2:52:49,  4.60it/s]
Train: -22.0875 - Val: -9.2824 (avg: -11.3315, min: -17.4266) | lr: 5.00e-05 - Patience: 500/500:   5%|▌         | 2546/50000 [09:10<2:51:01,  4.62it/s] 


Val Loss: -12.3984 ± 2.4651
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 1532.5608
Function value obtained: -9.9332
Current minimum: -57.6940
Iteration No: 15 started. Evaluating function at random point.
{'condition_size': 897, 'model_nested_size': 837, 'model_nested_layers': 7, 'model_n_blocks': 5, 'model_act_norm': False, 'model_dropout': 0.04142011834319527, 'feature_network_hidden_size': 214, 'feature_network_hidden_layers': 12, 'feature_network_dropout': 0.30434782608695654}


Train: -16.3879 - Val: 63.8896 (avg: 66.4218, min: -10.7970) | lr: 5.00e-05 - Patience: 500/500:   4%|▍         | 2090/50000 [02:14<51:18, 15.56it/s]   
Train: -13.4287 - Val: -12.8662 (avg: -12.7747, min: -12.9376) | lr: 2.50e-05 - Patience: 500/500:   6%|▌         | 3063/50000 [03:16<50:15, 15.57it/s]         
Train: -11.4922 - Val: -11.5626 (avg: -11.3799, min: -11.5250) | lr: 2.00e-04 - Patience: 133/500:   4%|▍         | 2115/50000 [02:15<51:08, 15.61it/s]  


Error in fold 2: Loss exploded to 2550424.0 at epoch 2115.6666666666665
Loss exploded to 2550424.0 at epoch 2115.6666666666665
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 466.8701
Function value obtained: 100.0000
Current minimum: -57.6940
Iteration No: 16 started. Evaluating function at random point.
{'condition_size': 1920, 'model_nested_size': 277, 'model_nested_layers': 2, 'model_n_blocks': 9, 'model_act_norm': False, 'model_dropout': 0.07988165680473373, 'feature_network_hidden_size': 228, 'feature_network_hidden_layers': 13, 'feature_network_dropout': 0.32608695652173914}


Train: -17.7377 - Val: -17.1888 (avg: -17.1466, min: -17.0692) | lr: 1.25e-05 - Patience: 500/500:   8%|▊         | 3873/50000 [03:25<40:52, 18.81it/s]
Train: -17.9247 - Val: -16.9222 (avg: -16.9023, min: -16.8351) | lr: 3.13e-06 - Patience: 500/500:  10%|▉         | 4993/50000 [04:25<39:50, 18.83it/s]
Train: -19.7070 - Val: -19.4656 (avg: -19.4521, min: -19.3930) | lr: 3.13e-06 - Patience: 500/500:  10%|▉         | 4950/50000 [04:20<39:27, 19.03it/s]


Val Loss: -18.1042 ± 1.5130
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 731.4054
Function value obtained: -16.5912
Current minimum: -57.6940
Iteration No: 17 started. Evaluating function at random point.
{'condition_size': 65, 'model_nested_size': 613, 'model_nested_layers': 3, 'model_n_blocks': 13, 'model_act_norm': False, 'model_dropout': 0.1183431952662722, 'feature_network_hidden_size': 242, 'feature_network_hidden_layers': 13, 'feature_network_dropout': 0.34782608695652173}


Train: -15.4416 - Val: -10.0159 (avg: -10.2743, min: -12.3318) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1048/50000 [01:19<1:01:30, 13.27it/s]
Train: -15.9170 - Val: -9.0352 (avg: -8.9215, min: -12.1042) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1095/50000 [01:23<1:02:03, 13.14it/s]  
Train: -15.8690 - Val: -8.0282 (avg: -8.4349, min: -11.9381) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1069/50000 [01:20<1:01:13, 13.32it/s] 


Val Loss: -10.3334 ± 0.8500
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 242.7582
Function value obtained: -9.4834
Current minimum: -57.6940
Iteration No: 18 started. Evaluating function at random point.
{'condition_size': 1088, 'model_nested_size': 949, 'model_nested_layers': 5, 'model_n_blocks': 17, 'model_act_norm': True, 'model_dropout': 0.15680473372781065, 'feature_network_hidden_size': 17, 'feature_network_hidden_layers': 14, 'feature_network_dropout': 0.3695652173913043}


Train: -26.8800 - Val: -15.9730 (avg: -15.7357, min: -19.1758) | lr: 1.00e-04 - Patience: 500/500:   4%|▍         | 1958/50000 [05:23<2:12:06,  6.06it/s]
Train: -26.9897 - Val: -14.1368 (avg: -13.5042, min: -17.7673) | lr: 5.00e-05 - Patience: 500/500:   4%|▍         | 1941/50000 [05:19<2:11:59,  6.07it/s]
Train: -25.9597 - Val: -16.1720 (avg: -16.7858, min: -19.3683) | lr: 1.00e-04 - Patience: 500/500:   4%|▍         | 1961/50000 [05:31<2:15:09,  5.92it/s] 


Val Loss: -15.7651 ± 0.8771
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 974.7199
Function value obtained: -14.8880
Current minimum: -57.6940
Iteration No: 19 started. Evaluating function at random point.
{'condition_size': 577, 'model_nested_size': 91, 'model_nested_layers': 6, 'model_n_blocks': 21, 'model_act_norm': True, 'model_dropout': 0.1952662721893491, 'feature_network_hidden_size': 31, 'feature_network_hidden_layers': 15, 'feature_network_dropout': 0.391304347826087}


Train: -46.0554 - Val: -48.8630 (avg: -48.8312, min: -48.7529) | lr: 1.56e-06 - Patience: 500/500:  24%|██▍       | 12049/50000 [35:03<1:50:25,  5.73it/s]
Train: -43.0077 - Val: -45.5403 (avg: -45.5581, min: -45.4999) | lr: 7.81e-07 - Patience: 500/500:  21%|██▏       | 10723/50000 [31:39<1:55:56,  5.65it/s]
Train: -42.0975 - Val: -44.6414 (avg: -44.6745, min: -44.6302) | lr: 7.81e-07 - Patience: 500/500:  22%|██▏       | 11211/50000 [33:31<1:56:00,  5.57it/s]


Val Loss: -46.3473 ± 1.8017
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 6014.7682
Function value obtained: -44.5456
Current minimum: -57.6940
Iteration No: 20 started. Evaluating function at random point.
{'condition_size': 1600, 'model_nested_size': 427, 'model_nested_layers': 7, 'model_n_blocks': 25, 'model_act_norm': True, 'model_dropout': 0.23372781065088757, 'feature_network_hidden_size': 45, 'feature_network_hidden_layers': 0, 'feature_network_dropout': 0.41304347826086957}


Train: -38.5233 - Val: -27.8311 (avg: -27.6559, min: -28.7802) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1215/50000 [04:35<3:04:21,  4.41it/s]
Train: -40.3198 - Val: -25.4191 (avg: -25.0236, min: -28.0968) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1368/50000 [05:12<3:04:56,  4.38it/s]
Train: -41.1977 - Val: -27.0138 (avg: -26.4704, min: -28.6736) | lr: 5.00e-05 - Patience: 500/500:   3%|▎         | 1581/50000 [06:02<3:04:49,  4.37it/s]


Val Loss: -26.8437 ± 1.1473
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 950.3120
Function value obtained: -25.6964
Current minimum: -57.6940
Iteration No: 21 started. Evaluating function at random point.
{'condition_size': 321, 'model_nested_size': 763, 'model_nested_layers': 2, 'model_n_blocks': 29, 'model_act_norm': True, 'model_dropout': 0.27218934911242604, 'feature_network_hidden_size': 59, 'feature_network_hidden_layers': 1, 'feature_network_dropout': 0.43478260869565216}


Train: -22.1539 - Val: 51.0000 (avg: 47.6827, min: -11.4257) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 608/50000 [01:35<2:08:48,  6.39it/s] 
  0%|          | 0/50000 [00:00<?, ?it/s]


Error in fold 1: Loss exploded to 810997.75 at epoch 0.0
Loss exploded to 810997.75 at epoch 0.0
Iteration No: 21 ended. Evaluation done at random point.
Time taken: 95.3605
Function value obtained: 100.0000
Current minimum: -57.6940
Iteration No: 22 started. Evaluating function at random point.
{'condition_size': 1344, 'model_nested_size': 203, 'model_nested_layers': 4, 'model_n_blocks': 6, 'model_act_norm': True, 'model_dropout': 0.31065088757396453, 'feature_network_hidden_size': 73, 'feature_network_hidden_layers': 2, 'feature_network_dropout': 0.45652173913043476}


Train: -42.1594 - Val: -46.2577 (avg: -46.0396, min: -45.9703) | lr: 2.00e-04 - Patience: 75/500:  21%|██        | 10251/50000 [08:29<32:55, 20.13it/s]


Error in fold 0: Loss exploded to 211012.0 at epoch 10251.666666666666
Loss exploded to 211012.0 at epoch 10251.666666666666
Iteration No: 22 ended. Evaluation done at random point.
Time taken: 509.3961
Function value obtained: 100.0000
Current minimum: -57.6940
Iteration No: 23 started. Evaluating function at random point.
{'condition_size': 833, 'model_nested_size': 539, 'model_nested_layers': 5, 'model_n_blocks': 10, 'model_act_norm': False, 'model_dropout': 0.34911242603550297, 'feature_network_hidden_size': 87, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.4782608695652174}


Train: -21.2039 - Val: -23.5304 (avg: -23.5454, min: -23.5163) | lr: 1.56e-06 - Patience: 500/500:  12%|█▏        | 5901/50000 [07:23<55:10, 13.32it/s]  
Train: -20.5281 - Val: -22.9510 (avg: -22.9497, min: -22.9156) | lr: 3.13e-06 - Patience: 500/500:  11%|█         | 5610/50000 [07:02<55:42, 13.28it/s]  
Train: -21.4703 - Val: -22.9697 (avg: -22.9476, min: -22.8895) | lr: 3.13e-06 - Patience: 500/500:  11%|█         | 5374/50000 [06:41<55:32, 13.39it/s]  


Val Loss: -23.1626 ± 0.3534
Iteration No: 23 ended. Evaluation done at random point.
Time taken: 1267.0352
Function value obtained: -22.8092
Current minimum: -57.6940
Iteration No: 24 started. Evaluating function at random point.
{'condition_size': 1856, 'model_nested_size': 875, 'model_nested_layers': 6, 'model_n_blocks': 14, 'model_act_norm': False, 'model_dropout': 0.38757396449704146, 'feature_network_hidden_size': 102, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.000945179584120983}


Train: -37.1329 - Val: -37.7283 (avg: -37.7364, min: -37.6758) | lr: 7.81e-07 - Patience: 500/500:  11%|█▏        | 5648/50000 [13:27<1:45:40,  6.99it/s]
Train: -36.5071 - Val: -34.7660 (avg: -34.7578, min: -34.6834) | lr: 7.81e-07 - Patience: 500/500:  11%|█         | 5536/50000 [13:37<1:49:23,  6.77it/s]
Train: -36.3937 - Val: -35.2624 (avg: -35.2458, min: -35.1880) | lr: 7.81e-07 - Patience: 500/500:  11%|█▏        | 5685/50000 [13:35<1:45:53,  6.98it/s]


Val Loss: -36.0797 ± 1.2249
Iteration No: 24 ended. Evaluation done at random point.
Time taken: 2441.0317
Function value obtained: -34.8547
Current minimum: -57.6940
Iteration No: 25 started. Evaluating function at random point.
{'condition_size': 193, 'model_nested_size': 315, 'model_nested_layers': 8, 'model_n_blocks': 18, 'model_act_norm': False, 'model_dropout': 0.4260355029585799, 'feature_network_hidden_size': 116, 'feature_network_hidden_layers': 4, 'feature_network_dropout': 0.02268431001890359}


Train: -28.7622 - Val: -33.2617 (avg: -33.2680, min: -33.2230) | lr: 1.56e-06 - Patience: 500/500:  16%|█▋        | 8240/50000 [20:39<1:44:42,  6.65it/s]
Train: -30.6251 - Val: -34.8104 (avg: -34.8056, min: -34.7662) | lr: 1.56e-06 - Patience: 500/500:  18%|█▊        | 8972/50000 [22:28<1:42:48,  6.65it/s]
Train: -28.9592 - Val: -32.4665 (avg: -32.4649, min: -32.4179) | lr: 1.56e-06 - Patience: 500/500:  16%|█▌        | 7853/50000 [19:42<1:45:48,  6.64it/s]


Val Loss: -33.4899 ± 0.9432
Iteration No: 25 ended. Evaluation done at random point.
Time taken: 3771.7402
Function value obtained: -32.5467
Current minimum: -57.6940
Iteration No: 26 started. Evaluating function at random point.
{'condition_size': 1216, 'model_nested_size': 651, 'model_nested_layers': 1, 'model_n_blocks': 22, 'model_act_norm': False, 'model_dropout': 0.4644970414201184, 'feature_network_hidden_size': 130, 'feature_network_hidden_layers': 5, 'feature_network_dropout': 0.0444234404536862}


Train: -33.7927 - Val: -38.8658 (avg: -38.8742, min: -38.7887) | lr: 7.81e-07 - Patience: 500/500:  15%|█▌        | 7726/50000 [10:51<59:23, 11.86it/s]  
Train: -33.5692 - Val: -37.9592 (avg: -37.9584, min: -37.9064) | lr: 7.81e-07 - Patience: 500/500:  17%|█▋        | 8371/50000 [11:48<58:45, 11.81it/s]  
Train: -33.9177 - Val: -38.5779 (avg: -38.5780, min: -38.4939) | lr: 1.56e-06 - Patience: 500/500:  17%|█▋        | 8333/50000 [11:49<59:09, 11.74it/s]  


Val Loss: -38.4775 ± 0.4314
Iteration No: 26 ended. Evaluation done at random point.
Time taken: 2070.2420
Function value obtained: -38.0460
Current minimum: -57.6940
Iteration No: 27 started. Evaluating function at random point.
{'condition_size': 705, 'model_nested_size': 987, 'model_nested_layers': 2, 'model_n_blocks': 26, 'model_act_norm': False, 'model_dropout': 0.005917159763313609, 'feature_network_hidden_size': 144, 'feature_network_hidden_layers': 6, 'feature_network_dropout': 0.0661625708884688}


Train: -30.8487 - Val: -20.6752 (avg: -24.2935, min: -28.2156) | lr: 5.00e-05 - Patience: 500/500:   4%|▍         | 2062/50000 [04:21<1:41:25,  7.88it/s]
Train: -32.8651 - Val: -26.4621 (avg: -26.4611, min: -28.0497) | lr: 2.50e-05 - Patience: 500/500:   5%|▌         | 2614/50000 [05:32<1:40:26,  7.86it/s]
Train: -30.4280 - Val: -24.9652 (avg: -24.3466, min: -27.8339) | lr: 5.00e-05 - Patience: 500/500:   4%|▎         | 1849/50000 [03:56<1:42:32,  7.83it/s]


Val Loss: -26.5543 ± 1.8077
Iteration No: 27 ended. Evaluation done at random point.
Time taken: 830.9640
Function value obtained: -24.7466
Current minimum: -57.6940
Iteration No: 28 started. Evaluating function at random point.
{'condition_size': 1728, 'model_nested_size': 28, 'model_nested_layers': 4, 'model_n_blocks': 30, 'model_act_norm': False, 'model_dropout': 0.044378698224852076, 'feature_network_hidden_size': 158, 'feature_network_hidden_layers': 7, 'feature_network_dropout': 0.08790170132325141}


Train: 357.9969 - Val: -24.3674 (avg: -24.8730, min: -24.9041) | lr: 2.00e-04 - Patience: 100/500:   7%|▋         | 3595/50000 [09:47<2:06:22,  6.12it/s]


Error in fold 0: Loss exploded to 7693361152.0 at epoch 3595.5
Loss exploded to 7693361152.0 at epoch 3595.5
Iteration No: 28 ended. Evaluation done at random point.
Time taken: 587.4625
Function value obtained: 100.0000
Current minimum: -57.6940
Iteration No: 29 started. Evaluating function at random point.
{'condition_size': 449, 'model_nested_size': 364, 'model_nested_layers': 5, 'model_n_blocks': 6, 'model_act_norm': True, 'model_dropout': 0.08284023668639054, 'feature_network_hidden_size': 172, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.10964083175803402}


Train: -36.0369 - Val: -35.5816 (avg: -35.6581, min: -35.7197) | lr: 6.25e-06 - Patience: 500/500:  16%|█▌        | 8092/50000 [07:33<39:07, 17.85it/s]
Train: -31.8379 - Val: -27.9390 (avg: -28.3435, min: -28.6206) | lr: 5.00e-05 - Patience: 500/500:   6%|▌         | 3010/50000 [02:47<43:38, 17.95it/s]
Train: -28.3004 - Val: -23.1195 (avg: -23.6089, min: -25.1078) | lr: 1.00e-04 - Patience: 500/500:   5%|▍         | 2461/50000 [02:18<44:33, 17.78it/s]


Val Loss: -29.6802 ± 3.7938
Iteration No: 29 ended. Evaluation done at random point.
Time taken: 759.5702
Function value obtained: -25.8864
Current minimum: -57.6940
Iteration No: 30 started. Evaluating function at random point.
{'condition_size': 1472, 'model_nested_size': 700, 'model_nested_layers': 7, 'model_n_blocks': 10, 'model_act_norm': True, 'model_dropout': 0.121301775147929, 'feature_network_hidden_size': 186, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.13137996219281664}


Train: -22.7504 - Val: -25.3140 (avg: -25.9832, min: -26.1537) | lr: 2.00e-04 - Patience: 3/500:   3%|▎         | 1529/50000 [02:46<1:28:04,  9.17it/s] 


Error in fold 0: Loss exploded to 569701.375 at epoch 1529.5
Loss exploded to 569701.375 at epoch 1529.5
Iteration No: 30 ended. Evaluation done at random point.
Time taken: 167.0312
Function value obtained: 100.0000
Current minimum: -57.6940
Iteration No: 31 started. Searching for the next optimal point.
{'condition_size': 1038, 'model_nested_size': 484, 'model_nested_layers': 5, 'model_n_blocks': 32, 'model_act_norm': True, 'model_dropout': 0.3185052316747453, 'feature_network_hidden_size': 97, 'feature_network_hidden_layers': 5, 'feature_network_dropout': 0.1254495493226791}


Train: -53.3390 - Val: -54.5408 (avg: -54.5381, min: -54.4880) | lr: 3.91e-07 - Patience: 500/500:  21%|██        | 10611/50000 [41:56<2:35:41,  4.22it/s]
Train: -55.4095 - Val: -50.3834 (avg: -50.4031, min: -50.3203) | lr: 7.81e-07 - Patience: 500/500:  21%|██        | 10378/50000 [41:21<2:37:53,  4.18it/s]
Train: -51.3079 - Val: -53.0534 (avg: -52.8785, min: -52.9128) | lr: 2.50e-05 - Patience: 162/500:  14%|█▍        | 7196/50000 [35:02<4:51:53,  2.44it/s]