In [1]:
import sys

sys.path.append('/mnt/new_home/ronedr/evolution-strategy-baselines-comparison')

In [2]:
import jax
import os
from utils.problem_utils import get_problem_name
from evosax.algorithms import algorithms
from evosax.problems import CNN, TorchVisionProblem as Problem, identity_output_fn
from tqdm import tqdm
from experiment.run_experiments import run_experiment_permutations

In [3]:
num_generations = 100
population_size = 128
seeds = list(range(0, 5))
result_dir = "../experiment_results"
problems_torch_vision = ["MNIST", "FashionMNIST", "CIFAR10", "SVHN"]

In [4]:
es_dict = {
    "SimpleES": {},
    "LES": {},
    "DES": {},
    "EvoTF_ES": {},
    "PGPE": {},
    "Open_ES": {},
    "SNES": {},
    "Sep_CMA_ES": {},
    "CMA_ES": {},
}

In [5]:
for task_name in tqdm(["MNIST", "FashionMNIST", "CIFAR10", "SVHN"], desc="Loading Problems .."):
    try:
        problem = Problem(task_name=task_name,
                          network=CNN(
                              num_filters=[8, 16],
                              kernel_sizes=[(5, 5), (5, 5)],
                              strides=[(1, 1), (1, 1)],
                              mlp_layer_sizes=[10],
                              output_fn=identity_output_fn
                          ),
                          batch_size=1024)
        print("Successfully loaded:", task_name)
        for es in es_dict:
            for seed in seeds:
                key = jax.random.key(seed)
                run_experiment_permutations(problems=[problem],
                                            es_dict={es: es_dict[es]},
                                            num_generations=num_generations,
                                            population_size=population_size,
                                            seed=seed,
                                            result_dir=result_dir, 
                                            run_again_if_exist=False)
    except Exception as e:
        print("Failed to load:", task_name, e)
        continue

Loading Problems ..:   0%|          | 0/4 [00:00<?, ?it/s]

Successfully loaded: MNIST
../experiment_results/TorchVisionProblem/MNIST/LearnedES.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/SimpleES.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/PGPE.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/Open_ES.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/SNES.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/Sep_CMA_ES.json
Path exists
../experiment_results/TorchVisionProblem/MNIST/CMA_ES.json
Path does not exist



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A2025-07-31 00:47:43.110368: W external/xla/xla/hlo/transforms/simplifiers/hlo_rematerialization.cc:3021] Can't reduce memory use below -100.75GiB (-108176383735 bytes) by rematerialization; only reduced to 123.75GiB (132870640664 bytes), down from 123.75GiB (132870640664 bytes) originally
2025-07-31 00:47:53.160517: W external/xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 61.87GiB (rounded to 66434031616)requested by op 
2025-07-31 00:47:53.160773: W external/xla/xla/tsl/framework/bfc_allocator.cc:512] **__________________________________________________________________________________________________
E0731 00:47:53.160825 3981813 pjrt_stream_executor_client.cc:3026] Execution of replica 0 failed: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 66434031504 bytes. [tf-allocator-allocation-error='']
Running ES algorithms:   0%|          | 0/1 [00:11<?, ?it/s

Failed to load: MNIST RESOURCE_EXHAUSTED: Out of memory while trying to allocate 66434031504 bytes.
Successfully loaded: FashionMNIST
../experiment_results/TorchVisionProblem/FashionMNIST/LearnedES.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/SimpleES.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/PGPE.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/Open_ES.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/SNES.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/Sep_CMA_ES.json
Path exists
../experiment_results/TorchVisionProblem/FashionMNIST/CMA_ES.json
Path does not exist



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A2025-07-31 00:48:07.314604: W external/xla/xla/hlo/transforms/simplifiers/hlo_rematerialization.cc:3021] Can't reduce memory use below -100.75GiB (-108176383735 bytes) by rematerialization; only reduced to 123.75GiB (132870640664 bytes), down from 123.75GiB (132870640664 bytes) originally
2025-07-31 00:48:17.366952: W external/xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 61.87GiB (rounded to 66434031616)requested by op 
2025-07-31 00:48:17.367063: W external/xla/xla/tsl/framework/bfc_allocator.cc:512] ****________________________________________________________________________________________________
E0731 00:48:17.367106 3981813 pjrt_stream_executor_client.cc:3026] Execution of replica 0 failed: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 66434031504 bytes. [tf-allocator-allocation-error='']
Running ES algorithms:   0%|          | 0/1 [00:10<?, ?it/s

Failed to load: FashionMNIST RESOURCE_EXHAUSTED: Out of memory while trying to allocate 66434031504 bytes.
Successfully loaded: CIFAR10
../experiment_results/TorchVisionProblem/CIFAR10/LearnedES.json
Path does not exist



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A2025-07-31 00:48:43.441080: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.55GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-07-31 00:48:43.618876: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 33.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-07-31 00:48:43.992528: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
Running 

Failed to load: CIFAR10 UNKNOWN: Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.3 = (f32[1024,2048,32,32]{3,2,1,0}, u8[0]{0}) custom-call(f32[1024,1024,32,32]{3,2,1,0} %bitcast.6894, f32[2048,8,5,5]{3,2,1,0} %bitcast.6901), window={size=5x5 pad=2_2x2_2}, dim_labels=bf01_oi01->bf01, feature_group_count=128, custom_call_target="__cudnn$convForward", metadata={op_name="jit(eval)/jit(main)/vmap(CNN)/Conv_1/conv_general_dilated" source_file="/home/ronedr/.local/lib/python3.11/site-packages/flax/linen/linear.py" source_line=694}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false}

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 8606711808 bytes. [tf-allocator-allocation-error='']

To ignore this failure and try to use a fallback algorithm (which may have suboptimal performa


Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A2025-07-31 00:49:14.036157: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.02GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
Running ES algorithms:   0%|          | 0/1 [00:08<?, ?it/s]
Loading Problems ..: 100%|██████████| 4/4 [01:51<00:00, 27.90s/it]

Failed to load: SVHN UNKNOWN: Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.3 = (f32[1024,2048,32,32]{3,2,1,0}, u8[0]{0}) custom-call(f32[1024,1024,32,32]{3,2,1,0} %bitcast.6894, f32[2048,8,5,5]{3,2,1,0} %bitcast.6901), window={size=5x5 pad=2_2x2_2}, dim_labels=bf01_oi01->bf01, feature_group_count=128, custom_call_target="__cudnn$convForward", metadata={op_name="jit(eval)/jit(main)/vmap(CNN)/Conv_1/conv_general_dilated" source_file="/home/ronedr/.local/lib/python3.11/site-packages/flax/linen/linear.py" source_line=694}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false}

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 8606711808 bytes. [tf-allocator-allocation-error='']

To ignore this failure and try to use a fallback algorithm (which may have suboptimal performance


