In [1]:
import os
import shutil
import pickle
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt

from skopt.space import Real, Categorical, Integer

from bcnf.gp_minimize.gp_minimize import gp_minimize_fixed
from bcnf.simulation.physics import get_data
from bcnf.models.cnf import CondRealNVP
from bcnf.models.feature_network import FullyConnectedFeatureNetwork
from bcnf.eval.crossvalidate import cross_validate
from bcnf.errors import TrainingDivergedError
from bcnf.utils import get_dir

from bcnf.simulation.sampling import generate_data

# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## Setup

In [2]:
# HACK: scikit-optimize is not maintained anymore and this is a quick fix to make it work
# https://github.com/scikit-optimize/scikit-optimize/issues/1171#:~:text=To%20avoid%20this%20error%20in%20existing%20code%2C%20use%20int%20by%20itself.
# The fix (re)binds the attribute `np.int` to `np.int64` on the imported NumPy module.
# NOTE(review): presumably skopt still references `np.int`, which newer NumPy releases no longer provide — confirm against the linked issue.
np.int = np.int64

In [13]:
# Store and load the progress in the Huggingface repository
# `checkpoint_file` is the pickle that save_checkpoint() writes and the resume cell reads;
# `metrics_dir` receives one pickle of cross-validation metrics per evaluated parameter set.
# NOTE(review): `create=True` presumably makes get_dir create missing directories — confirm in bcnf.utils.
checkpoint_file = os.path.join(get_dir("models", "bcnf-models", "hyperparameter_optimization", "stage_2", create=True), 'checkpoint_improved.pkl')
metrics_dir = get_dir("models", "bcnf-models", "hyperparameter_optimization", "stage_2", "metrics", create=True)

# Show the resolved absolute locations
print(os.path.abspath(checkpoint_file))
print(os.path.abspath(metrics_dir))

/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/checkpoint_improved.pkl
/home/psaegert/Projects/bcnf/models/bcnf-models/hyperparameter_optimization/stage_2/metrics


## Data

In [5]:
# Names of the 19 parameters, one related group per line
param_names = [
    'x0_x', 'x0_y', 'x0_z',
    'v0_x', 'v0_y', 'v0_z',
    'g',
    'w_x', 'w_y', 'w_z',
    'b',
    'A',
    'Cd',
    'rho',
    'm',
    'a_x', 'a_y', 'a_z',
    'r',
]

In [4]:
# dataset_name = "hyperparameter_optimization_trajectories"

# if not os.path.exists(os.path.join(get_dir('data', 'bcnf-data'), dataset_name + '.pkl')):
#     data = generate_data(
#         n=2000,
#         type="trajectory",
#         SPF=1/30,
#         T=3,
#         config_file=os.path.join(get_dir("configs"), "throw_upwards.yaml"),
#         verbose=True,
#         break_on_impact=False,
#         name=dataset_name
#     )
# else:
#     with open(os.path.join(get_dir('data', 'bcnf-data'), dataset_name + '.pkl'), 'rb') as f:
#         data = pickle.load(f)

In [6]:
# X = np.array(data['traj'])
# y = np.column_stack([np.array(data[param]) for param in param_names])


# X_tensor = torch.Tensor(X.reshape(X.shape[0], -1))
# y_tensor = torch.Tensor(y)

# X_tensor.shape, y_tensor.shape

(torch.Size([2000, 270]), torch.Size([2000, 19]))

In [None]:
# Fetch the observations X and the matching parameter vectors y from the simulation helper
X, y = get_data(T=1.0, dt=1 / 30, N=2_000, break_on_impact=False)

# Flatten every sample of X into one feature vector and wrap both arrays as tensors
X_tensor = torch.Tensor(X.reshape(len(X), -1))
y_tensor = torch.Tensor(y)

# Display both shapes as the cell output
(X_tensor.shape, y_tensor.shape)

## Hyperparameters

In [7]:
# Width of the parameter vectors (model) and of the flattened observations (features)
model_size, feature_size = y_tensor.shape[1], X_tensor.shape[1]

print(f"Model size: {model_size}\nFeature size: {feature_size}")

Model size: 19
Feature size: 270


In [8]:
# Fixed training settings (not part of the search); these values worked well in pilot experiments

# Keyword arguments for the optimizer
optimizer_kwargs = dict(lr=2e-4)

# Keyword arguments for the learning-rate scheduler
lr_scheduler_kwargs = dict(
    mode="min",
    factor=0.5,
    patience=250,
    threshold_mode="abs",
    threshold=1e-1,
)

In [9]:
# Search space of the optimization.
# The insertion order is significant: parameter values are matched to these names by position.
search_spaces = {
    # Shared by both networks: output size of the feature network and number of conditions of the model
    'condition_size': Integer(low=1, high=2048),

    # Conditional RealNVP model
    'model_nested_size': Integer(low=16, high=1024),
    'model_nested_layers': Integer(low=1, high=8),
    'model_n_blocks': Integer(low=4, high=32),
    'model_act_norm': Categorical(categories=[True, False]),
    'model_dropout': Real(low=0.0, high=0.5),

    # Fully connected feature network
    'feature_network_hidden_size': Integer(low=16, high=256),
    'feature_network_hidden_layers': Integer(low=0, high=16),
    'feature_network_dropout': Real(low=0.0, high=0.5),
}

## Helper Functions

In [14]:
def param_index(name: str, search_spaces: dict[str, Real | Integer | Categorical]):
    """
    Get the index of a parameter in the search space to match the order of the parameters in the optimization function.
    
    Parameters
    ----------
    name : str
        The name of the parameter.
    search_spaces : dict[str, Real | Integer | Categorical]
        The search space.

    Returns
    -------
    int
        The index of the parameter in the search space.
    """
    return list(search_spaces.keys()).index(name)

In [15]:
def score_parameters(params: list):
    """
    Objective of the hyperparameter search: cross-validate one parameter set and score it.

    The fold metrics returned by the cross-validation are also pickled into `metrics_dir`,
    in a file named after the parameter values.

    Parameters
    ----------
    params : list
        The parameter values, in the order of the keys of `search_spaces`.

    Returns
    -------
    float
        Mean plus standard deviation of the per-fold validation losses,
        or 100 if the training raised a `TrainingDivergedError`.
    """
    # Echo the candidate as a name -> value mapping
    print(dict(zip(search_spaces.keys(), params)))

    def chosen(name: str):
        # Value picked for the dimension called `name`
        return params[param_index(name, search_spaces)]

    # The condition size links both networks: feature network output -> model conditions
    condition_size = chosen('condition_size')

    model_kwargs = {
        "size": model_size,
        "nested_sizes": [chosen('model_nested_size')] * chosen('model_nested_layers'),
        "n_blocks": chosen('model_n_blocks'),
        "n_conditions": condition_size,
        "act_norm": chosen('model_act_norm'),
        "dropout": chosen('model_dropout'),
    }

    hidden_sizes = [chosen('feature_network_hidden_size')] * chosen('feature_network_hidden_layers')
    feature_network_kwargs = {
        "sizes": [feature_size] + hidden_sizes + [condition_size],
        "dropout": chosen('feature_network_dropout'),
    }

    # Training errors such as divergence are caught so that bad parameter sets are
    # filtered out quickly instead of aborting the whole optimization
    try:
        # Estimate the performance of this configuration with cross-validation
        fold_metrics = cross_validate(
            model_class=CondRealNVP,
            model_kwargs=model_kwargs,
            feature_network_class=FullyConnectedFeatureNetwork,
            feature_network_kwargs=feature_network_kwargs,
            optimizer_class=torch.optim.Adam,
            optimizer_kwargs=optimizer_kwargs,
            lr_scheduler_class=torch.optim.lr_scheduler.ReduceLROnPlateau,
            lr_scheduler_kwargs=lr_scheduler_kwargs,
            X=X_tensor,
            y=y_tensor,
            n_epochs=50_000,
            val_loss_patience=500,
            val_loss_tolerance=1e-1,  # Improvements to treat as significant
            val_loss_tolerance_mode="abs",
            timeout=60 * 60,  # 1 hour
            batch_size=256,
            device=device,
            verbose=True,  # Print the progress
            n_splits=3,
            errors="raise"  # Raise errors to stop the optimization
        )

        # Keep the loss history and metrics of this parameter set for later analysis
        metrics_path = os.path.join(metrics_dir, "params_" + "_".join(map(str, params)) + ".pkl")
        with open(metrics_path, 'wb') as f:
            pickle.dump(fold_metrics, f)

    except TrainingDivergedError as e:
        print(e)
        return 100  # A big number (bad score) to avoid this parameter set

    # Each val_loss value is a tuple (epoch, loss); collect the loss of every fold
    val_loss_list = [fold['val_loss'][1] for fold in fold_metrics]
    loss_mean, loss_std = np.mean(val_loss_list), np.std(val_loss_list)
    print(f'Val Loss: {loss_mean:.4f} ± {loss_std:.4f}')

    # Score with the upper confidence bound of the validation loss:
    # this favours parameter sets that are good AND reliable across folds
    return loss_mean + loss_std

In [16]:
def save_checkpoint(result):
    """
    Save the result of the optimization to a checkpoint file.
    Used as a callback in the optimization function.

    The result is written to ``checkpoint_file + ".tmp"`` first and then moved over
    ``checkpoint_file`` with ``os.replace``, which overwrites the destination in a single
    step. An interrupted write therefore never leaves a truncated checkpoint in place.

    Parameters
    ----------
    result : OptimizeResult
        The result of the optimization. It is not modified; a deep copy is pickled.

    Returns
    -------
    None
    """
    # Prepare the object before touching the file system, so that a failure here
    # cannot leave an empty temporary file behind
    result_no_func = copy.deepcopy(result)

    # Ignore
    # - result['specs']['args']['func']
    # - result['specs']['args']['callback']
    # because it causes problems when reading somewhere else
    call_args = result_no_func['specs']['args']
    call_args.pop('func', None)
    call_args.pop('callback', None)

    # Write next to the checkpoint (same directory) without overwriting it yet
    tmp_file = checkpoint_file + ".tmp"
    with open(tmp_file, 'wb') as f:
        pickle.dump(result_no_func, f)

    # Swap the finished temporary file in for the old checkpoint.
    # os.replace overwrites an existing destination on every platform, whereas
    # shutil.move may fall back to a non-atomic copy + delete.
    os.replace(tmp_file, checkpoint_file)

## Optimization

In [17]:
# Number of random initial points to explore the search space
N_STEPS_INIT = 30

# Number of iterations to run the optimization in total
N_STEPS = 100

In [18]:
# Resume from the saved checkpoint when there is one, otherwise start from scratch
if not os.path.exists(checkpoint_file):
    print('No checkpoint found. Starting new optimization')
    checkpoint = None

    # Full budget, no previous evaluations to hand over
    n_initial_points, n_calls_remaining = N_STEPS_INIT, N_STEPS
    x0, y0 = None, None
else:
    print(f'Loading checkpoint from {checkpoint_file}')

    # The checkpoint is a pickle written by save_checkpoint(); only load files you trust
    with open(checkpoint_file, 'rb') as f:
        checkpoint = pickle.load(f)

    # Put back the function and callback that were stripped before pickling
    checkpoint['specs']['args']['func'] = score_parameters
    checkpoint['specs']['args']['callback'] = save_checkpoint

    print(f'Resuming from iteration {len(checkpoint.x_iters)}')

    # Hand the evaluated points and their scores back to the optimizer
    x0, y0 = checkpoint.x_iters, checkpoint.func_vals

    # Only spend what is left of the budgets (never negative)
    n_initial_points = max(0, N_STEPS_INIT - len(x0))
    n_calls_remaining = max(0, N_STEPS - len(x0))

print(f'Running with {n_initial_points} initial points and {n_calls_remaining} remaining iterations')

No checkpoint found. Starting new optimization
Running with 30 initial points and 100 remaining iterations


In [19]:
# Run the optimization of score_parameters over the search space.
# x0 / y0 carry the points and scores restored from a checkpoint (both None for a fresh start),
# and save_checkpoint is passed as the callback.
# You might want to adjust the n_calls or other parameters based on the checkpoint
result = gp_minimize_fixed(
    func=score_parameters,
    # Dimensions are passed in the key order of search_spaces, which param_index() relies on
    dimensions=search_spaces.values(),
    n_initial_points=n_initial_points,  # Number of random points before starting the optimization
    n_calls=n_calls_remaining,  # Number of iterations
    random_state=2024_03_25,
    verbose=True,
    acq_func="EI",  # Expected Improvement https://arxiv.org/pdf/1009.5419.pdf
    initial_point_generator="halton", # https://en.wikipedia.org/wiki/Halton_sequence
    callback=save_checkpoint,
    x0=x0,
    y0=y0)

30 initial points will be randomly generated
Iteration No: 1 started. Evaluating function at random point.
{'condition_size': 1, 'model_nested_size': 16, 'model_nested_layers': 1, 'model_n_blocks': 4, 'model_act_norm': False, 'model_dropout': 0.0, 'feature_network_hidden_size': 16, 'feature_network_hidden_layers': 0, 'feature_network_dropout': 0.0}


Train: 24.6184 - Val: 36.4657 (avg: 36.3351, min: 31.8462) | lr: 5.00e-05 - Patience: 500/500:   4%|▎         | 1812/50000 [00:41<18:31, 43.34it/s]     
Train: 24.2726 - Val: 26.2805 (avg: 26.2549, min: 26.0513) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1435/50000 [00:32<18:32, 43.67it/s]    
Train: 21.4233 - Val: 25.5778 (avg: 25.4570, min: 23.9413) | lr: 1.00e-04 - Patience: 500/500:   5%|▌         | 2551/50000 [00:58<18:14, 43.36it/s]    


Val Loss: 26.6614 ± 1.5437
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 133.9515
Function value obtained: 28.2051
Current minimum: 28.2051
Iteration No: 2 started. Evaluating function at random point.
{'condition_size': 1024, 'model_nested_size': 352, 'model_nested_layers': 2, 'model_n_blocks': 8, 'model_act_norm': False, 'model_dropout': 0.038461538461538464, 'feature_network_hidden_size': 30, 'feature_network_hidden_layers': 1, 'feature_network_dropout': 0.021739130434782608}


Train: -0.4672 - Val: 50.7001 (avg: 49.7093, min: 21.1193) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 608/50000 [00:26<35:19, 23.30it/s]
Train: -0.2403 - Val: 49.4795 (avg: 48.9554, min: 20.5015) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 621/50000 [00:26<35:04, 23.46it/s]
Train: -1.0247 - Val: 46.1674 (avg: 45.8817, min: 19.3636) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 623/50000 [00:26<35:23, 23.25it/s]


Val Loss: 50.4280 ± 1.2651
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 79.4270
Function value obtained: 51.6931
Current minimum: 28.2051
Iteration No: 3 started. Evaluating function at random point.
{'condition_size': 513, 'model_nested_size': 688, 'model_nested_layers': 4, 'model_n_blocks': 12, 'model_act_norm': False, 'model_dropout': 0.07692307692307693, 'feature_network_hidden_size': 44, 'feature_network_hidden_layers': 2, 'feature_network_dropout': 0.043478260869565216}


Train: -3.1596 - Val: 78.7569 (avg: 77.9794, min: 18.6097) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 608/50000 [00:49<1:07:08, 12.26it/s]
Train: -3.1041 - Val: 61.0314 (avg: 59.6848, min: 16.9599) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 609/50000 [00:50<1:07:51, 12.13it/s]
Train: -2.9019 - Val: 60.1020 (avg: 58.0478, min: 17.2667) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 609/50000 [00:50<1:07:48, 12.14it/s]


Val Loss: 67.7817 ± 6.7551
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 150.2450
Function value obtained: 74.5367
Current minimum: 28.2051
Iteration No: 4 started. Evaluating function at random point.
{'condition_size': 1536, 'model_nested_size': 128, 'model_nested_layers': 5, 'model_n_blocks': 16, 'model_act_norm': False, 'model_dropout': 0.11538461538461539, 'feature_network_hidden_size': 58, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.06521739130434782}


Train: 8.6853 - Val: 17.3639 (avg: 17.3433, min: 15.1324) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 857/50000 [01:30<1:26:51,  9.43it/s] 
Train: 9.5497 - Val: 16.4912 (avg: 16.3765, min: 15.0072) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 884/50000 [01:34<1:27:29,  9.36it/s] 
Train: 8.4812 - Val: 15.9070 (avg: 15.8724, min: 14.4736) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 940/50000 [01:39<1:26:55,  9.41it/s] 


Val Loss: 16.4965 ± 0.2801
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 285.4017
Function value obtained: 16.7766
Current minimum: 16.7766
Iteration No: 5 started. Evaluating function at random point.
{'condition_size': 257, 'model_nested_size': 464, 'model_nested_layers': 7, 'model_n_blocks': 20, 'model_act_norm': False, 'model_dropout': 0.15384615384615385, 'feature_network_hidden_size': 72, 'feature_network_hidden_layers': 3, 'feature_network_dropout': 0.08695652173913043}


Train: 4.9612 - Val: 33.8293 (avg: 33.6807, min: 16.5408) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 674/50000 [01:52<2:17:22,  5.98it/s] 
Train: 5.7472 - Val: 31.9346 (avg: 31.4229, min: 16.3198) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 670/50000 [01:51<2:17:07,  6.00it/s] 
Train: 5.5228 - Val: 31.0278 (avg: 30.6555, min: 16.4368) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 689/50000 [01:54<2:16:22,  6.03it/s] 


Val Loss: 32.5050 ± 0.3720
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 339.1108
Function value obtained: 32.8769
Current minimum: 16.7766
Iteration No: 6 started. Evaluating function at random point.
{'condition_size': 1280, 'model_nested_size': 800, 'model_nested_layers': 1, 'model_n_blocks': 24, 'model_act_norm': False, 'model_dropout': 0.19230769230769232, 'feature_network_hidden_size': 87, 'feature_network_hidden_layers': 4, 'feature_network_dropout': 0.10869565217391304}


  0%|          | 0/50000 [00:00<?, ?it/s]


Error in fold 0: Loss exploded to 12758239.0 at epoch 0.0
Loss exploded to 12758239.0 at epoch 0.0
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.1155
Function value obtained: 100.0000
Current minimum: 16.7766
Iteration No: 7 started. Evaluating function at random point.
{'condition_size': 769, 'model_nested_size': 240, 'model_nested_layers': 3, 'model_n_blocks': 28, 'model_act_norm': True, 'model_dropout': 0.23076923076923078, 'feature_network_hidden_size': 101, 'feature_network_hidden_layers': 5, 'feature_network_dropout': 0.13043478260869565}


Train: 7.9859 - Val: 240.4087 (avg: 302.2951, min: 17.6259) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 627/50000 [01:45<2:18:26,  5.94it/s]
Train: 6.9175 - Val: 25.7566 (avg: 24.8837, min: 15.1492) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 702/50000 [01:59<2:19:38,  5.88it/s] 
Train: 6.2555 - Val: 35.5084 (avg: 34.8827, min: 14.6339) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 754/50000 [02:07<2:19:16,  5.89it/s] 


Val Loss: 24.3455 ± 3.0681
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 352.9243
Function value obtained: 27.4136
Current minimum: 16.7766
Iteration No: 8 started. Evaluating function at random point.
{'condition_size': 1792, 'model_nested_size': 576, 'model_nested_layers': 4, 'model_n_blocks': 5, 'model_act_norm': True, 'model_dropout': 0.2692307692307692, 'feature_network_hidden_size': 115, 'feature_network_hidden_layers': 6, 'feature_network_dropout': 0.15217391304347827}


Train: 9.1186 - Val: 17.6293 (avg: 17.5771, min: 15.0618) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1129/50000 [00:54<39:13, 20.76it/s]
Train: 9.8740 - Val: 17.0640 (avg: 16.8920, min: 14.8317) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 1070/50000 [00:50<38:41, 21.07it/s] 
Train: 8.6086 - Val: 17.1301 (avg: 16.9787, min: 14.9352) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1259/50000 [01:00<39:04, 20.79it/s] 


Val Loss: 17.3704 ± 0.4960
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 165.8416
Function value obtained: 17.8664
Current minimum: 16.7766
Iteration No: 9 started. Evaluating function at random point.
{'condition_size': 129, 'model_nested_size': 912, 'model_nested_layers': 5, 'model_n_blocks': 9, 'model_act_norm': True, 'model_dropout': 0.3076923076923077, 'feature_network_hidden_size': 129, 'feature_network_hidden_layers': 7, 'feature_network_dropout': 0.17391304347826086}


Train: 7.5891 - Val: 41.6986 (avg: 40.9973, min: 19.6992) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 661/50000 [01:00<1:14:40, 11.01it/s] 
Train: 7.0939 - Val: 40.5762 (avg: 40.3797, min: 19.1432) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 654/50000 [00:59<1:14:50, 10.99it/s] 
Train: 7.3748 - Val: 40.7386 (avg: 40.1360, min: 19.4061) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 652/50000 [00:58<1:14:09, 11.09it/s] 


Val Loss: 40.6513 ± 0.5924
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 178.7103
Function value obtained: 41.2437
Current minimum: 16.7766
Iteration No: 10 started. Evaluating function at random point.
{'condition_size': 1152, 'model_nested_size': 53, 'model_nested_layers': 7, 'model_n_blocks': 13, 'model_act_norm': True, 'model_dropout': 0.34615384615384615, 'feature_network_hidden_size': 143, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.1956521739130435}


Train: 11.6935 - Val: 13.6202 (avg: 13.5910, min: 13.6716) | lr: 5.00e-05 - Patience: 500/500:   6%|▌         | 2912/50000 [06:02<1:37:33,  8.04it/s]
Train: 13.3150 - Val: 14.0884 (avg: 14.0196, min: 14.0767) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1737/50000 [03:36<1:40:25,  8.01it/s]
Train: 11.7533 - Val: 14.2482 (avg: 14.2398, min: 14.2720) | lr: 5.00e-05 - Patience: 500/500:   5%|▌         | 2745/50000 [05:44<1:38:42,  7.98it/s] 


Val Loss: 14.1876 ± 0.6831
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 922.9965
Function value obtained: 14.8708
Current minimum: 14.8708
Iteration No: 11 started. Evaluating function at random point.
{'condition_size': 641, 'model_nested_size': 389, 'model_nested_layers': 2, 'model_n_blocks': 17, 'model_act_norm': True, 'model_dropout': 0.38461538461538464, 'feature_network_hidden_size': 157, 'feature_network_hidden_layers': 8, 'feature_network_dropout': 0.21739130434782608}


Train: 11.2134 - Val: 20.1059 (avg: 19.8201, min: 16.2140) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 864/50000 [01:23<1:19:17, 10.33it/s]
Train: 9.8688 - Val: 15.9642 (avg: 15.9254, min: 14.4211) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 992/50000 [01:36<1:19:29, 10.28it/s] 
Train: 10.8076 - Val: 17.9527 (avg: 17.8129, min: 16.1144) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 960/50000 [01:33<1:19:41, 10.26it/s]


Val Loss: 17.4416 ± 0.9126
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 273.9138
Function value obtained: 18.3542
Current minimum: 14.8708
Iteration No: 12 started. Evaluating function at random point.
{'condition_size': 1664, 'model_nested_size': 725, 'model_nested_layers': 3, 'model_n_blocks': 21, 'model_act_norm': False, 'model_dropout': 0.4230769230769231, 'feature_network_hidden_size': 171, 'feature_network_hidden_layers': 9, 'feature_network_dropout': 0.2391304347826087}


Train: 15.4982 - Val: 189.4746 (avg: 406.3960, min: 22.2838) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 637/50000 [01:23<1:47:45,  7.63it/s]  
Train: 15.3154 - Val: 27.4620 (avg: 26.4620, min: 21.4172) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 668/50000 [01:27<1:47:43,  7.63it/s]
Train: 13.0836 - Val: 21.0781 (avg: 20.6931, min: 17.6481) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 806/50000 [01:46<1:48:46,  7.54it/s]


Val Loss: 24.8975 ± 2.6195
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 278.4456
Function value obtained: 27.5170
Current minimum: 14.8708
Iteration No: 13 started. Evaluating function at random point.
{'condition_size': 385, 'model_nested_size': 165, 'model_nested_layers': 4, 'model_n_blocks': 25, 'model_act_norm': False, 'model_dropout': 0.46153846153846156, 'feature_network_hidden_size': 185, 'feature_network_hidden_layers': 10, 'feature_network_dropout': 0.2608695652173913}


Train: 17.7215 - Val: 22.5384 (avg: 21.9812, min: 21.0009) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 896/50000 [02:12<2:00:37,  6.78it/s]
Train: 19.3365 - Val: 24.9415 (avg: 25.1560, min: 22.8697) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 669/50000 [01:38<2:01:17,  6.78it/s]
Train: 18.1087 - Val: 23.0061 (avg: 22.6645, min: 21.4571) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 888/50000 [02:11<2:00:55,  6.77it/s]


Val Loss: 23.0297 ± 0.6908
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 362.0823
Function value obtained: 23.7205
Current minimum: 14.8708
Iteration No: 14 started. Evaluating function at random point.
{'condition_size': 1408, 'model_nested_size': 501, 'model_nested_layers': 6, 'model_n_blocks': 29, 'model_act_norm': False, 'model_dropout': 0.0029585798816568047, 'feature_network_hidden_size': 200, 'feature_network_hidden_layers': 11, 'feature_network_dropout': 0.2826086956521739}


Train: 11.0020 - Val: 323.6129 (avg: 261.2057, min: 21.2562) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 692/50000 [02:38<3:08:02,  4.37it/s]
Train: 12.9394 - Val: 43.6653 (avg: 42.5009, min: 20.5747) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 666/50000 [02:32<3:08:36,  4.36it/s]
Train: 11.4035 - Val: 75.7102 (avg: 69.9210, min: 20.5723) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 748/50000 [02:49<3:06:31,  4.40it/s]


Val Loss: 66.8862 ± 13.3227
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 481.9448
Function value obtained: 80.2089
Current minimum: 14.8708
Iteration No: 15 started. Evaluating function at random point.
{'condition_size': 897, 'model_nested_size': 837, 'model_nested_layers': 7, 'model_n_blocks': 5, 'model_act_norm': False, 'model_dropout': 0.04142011834319527, 'feature_network_hidden_size': 214, 'feature_network_hidden_layers': 12, 'feature_network_dropout': 0.30434782608695654}


Train: 17.0184 - Val: 46.4884 (avg: 46.3734, min: 26.0251) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 826/50000 [00:55<55:11, 14.85it/s]  
Train: 16.7294 - Val: 74.8251 (avg: 72.3775, min: 25.7867) | lr: 1.00e-04 - Patience: 500/500:   2%|▏         | 787/50000 [00:53<55:24, 14.80it/s]  
Train: 17.6705 - Val: 55.0473 (avg: 54.6597, min: 26.0576) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 745/50000 [00:49<55:04, 14.91it/s]  


Val Loss: 39.7553 ± 3.5556
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 159.0557
Function value obtained: 43.3109
Current minimum: 14.8708
Iteration No: 16 started. Evaluating function at random point.
{'condition_size': 1920, 'model_nested_size': 277, 'model_nested_layers': 2, 'model_n_blocks': 9, 'model_act_norm': False, 'model_dropout': 0.07988165680473373, 'feature_network_hidden_size': 228, 'feature_network_hidden_layers': 13, 'feature_network_dropout': 0.32608695652173914}


Train: 16.4966 - Val: 31.5662 (avg: 29.9560, min: 22.7166) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 679/50000 [00:37<45:53, 17.91it/s]
Train: 17.4042 - Val: 29.4700 (avg: 28.8747, min: 22.5302) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 664/50000 [00:36<45:13, 18.18it/s]
Train: 16.6787 - Val: 28.0467 (avg: 28.2948, min: 22.5103) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 670/50000 [00:37<45:43, 17.98it/s]


Val Loss: 28.6352 ± 0.5330
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 111.7935
Function value obtained: 29.1682
Current minimum: 14.8708
Iteration No: 17 started. Evaluating function at random point.
{'condition_size': 65, 'model_nested_size': 613, 'model_nested_layers': 3, 'model_n_blocks': 13, 'model_act_norm': False, 'model_dropout': 0.1183431952662722, 'feature_network_hidden_size': 242, 'feature_network_hidden_layers': 13, 'feature_network_dropout': 0.34782608695652173}


Train: 9.2805 - Val: 138.0184 (avg: 129.0584, min: 25.4870) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 595/50000 [00:46<1:05:02, 12.66it/s] 
Train: 9.4667 - Val: 77.5729 (avg: 77.2760, min: 24.4722) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 588/50000 [00:46<1:05:11, 12.63it/s] 
Train: 9.5287 - Val: 98.7469 (avg: 100.1848, min: 24.6457) | lr: 1.00e-04 - Patience: 500/500:   1%|          | 594/50000 [00:47<1:05:34, 12.56it/s] 


Val Loss: 100.2068 ± 21.5335
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 141.0267
Function value obtained: 121.7402
Current minimum: 14.8708
Iteration No: 18 started. Evaluating function at random point.
{'condition_size': 1088, 'model_nested_size': 949, 'model_nested_layers': 5, 'model_n_blocks': 17, 'model_act_norm': True, 'model_dropout': 0.15680473372781065, 'feature_network_hidden_size': 17, 'feature_network_hidden_layers': 14, 'feature_network_dropout': 0.3695652173913043}


Train: 9.7375 - Val: 110.2506 (avg: 136.4230, min: 22.3258) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 628/50000 [01:48<2:22:39,  5.77it/s]  
Train: 6.5969 - Val: 91.2142 (avg: 88.4589, min: 21.5304) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 684/50000 [01:58<2:22:23,  5.77it/s] 
Train: 7.4276 - Val: 96.0459 (avg: 93.8592, min: 22.3968) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 655/50000 [01:53<2:22:50,  5.76it/s] 


Val Loss: 85.4297 ± 5.5201
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 341.9426
Function value obtained: 90.9498
Current minimum: 14.8708
Iteration No: 19 started. Evaluating function at random point.
{'condition_size': 577, 'model_nested_size': 91, 'model_nested_layers': 6, 'model_n_blocks': 21, 'model_act_norm': True, 'model_dropout': 0.1952662721893491, 'feature_network_hidden_size': 31, 'feature_network_hidden_layers': 15, 'feature_network_dropout': 0.391304347826087}


Train: 13.2733 - Val: 19.1405 (avg: 19.0148, min: 18.6063) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1605/50000 [04:49<2:25:28,  5.54it/s]
Train: 13.8483 - Val: 17.4437 (avg: 17.5042, min: 17.3523) | lr: 1.00e-04 - Patience: 500/500:   3%|▎         | 1616/50000 [05:09<2:34:12,  5.23it/s]
Train: 12.7858 - Val: 17.3372 (avg: 17.4055, min: 17.2604) | lr: 1.00e-04 - Patience: 500/500:   4%|▍         | 1882/50000 [05:45<2:27:14,  5.45it/s]


Val Loss: 17.8375 ± 0.3759
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 944.1521
Function value obtained: 18.2135
Current minimum: 14.8708
Iteration No: 20 started. Evaluating function at random point.
{'condition_size': 1600, 'model_nested_size': 427, 'model_nested_layers': 7, 'model_n_blocks': 25, 'model_act_norm': True, 'model_dropout': 0.23372781065088757, 'feature_network_hidden_size': 45, 'feature_network_hidden_layers': 0, 'feature_network_dropout': 0.41304347826086957}


Train: -6.8219 - Val: 62.2505 (avg: 61.8562, min: 13.8937) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 646/50000 [02:36<3:19:11,  4.13it/s]
Train: -7.1767 - Val: 55.9786 (avg: 53.0200, min: 12.6566) | lr: 1.00e-04 - Patience: 500/500:   1%|▏         | 643/50000 [02:41<3:26:04,  3.99it/s]
Train: -0.2989 - Val: 29.7592 (avg: 26.4453, min: 13.4581) | lr: 2.00e-04 - Patience: 210/500:   1%|          | 346/50000 [01:32<3:40:11,  3.76it/s]


KeyboardInterrupt: 