In [1]:
import sys

sys.path.append('/mnt/new_home/ronedr/evolution-strategy-baselines-comparison')

In [2]:
import os
os.environ["XLA_FLAGS"] = "--xla_gpu_strict_conv_algorithm_picker=false"

In [3]:
import jax
from utils.problem_utils import get_problem_name
from evosax.algorithms import algorithms
from evosax.problems import CNN, TorchVisionProblem as Problem, identity_output_fn
from tqdm import tqdm
from experiment.run_experiments import run_experiment_permutations

In [4]:
num_generations = 100
population_size = 64
seeds = list(range(0, 5))
result_dir = "../experiment_results"
problems_torch_vision = ["MNIST", "FashionMNIST", "CIFAR10", "SVHN"]

In [5]:
es_dict = {
    "SimpleES": {},
    "LES": {},
    "DES": {},
    "EvoTF_ES": {},
    "PGPE": {},
    "Open_ES": {},
    "SNES": {},
    "Sep_CMA_ES": {},
    "CMA_ES": {},
}

In [6]:
for task_name in tqdm(problems_torch_vision, desc="Loading Problems .."):
    try:
        problem = Problem(task_name=task_name,
                          network=CNN(
                              num_filters=[8, 16],
                              kernel_sizes=[(5, 5), (5, 5)],
                              strides=[(1, 1), (1, 1)],
                              mlp_layer_sizes=[10],
                              output_fn=identity_output_fn
                          ),
                          batch_size=1024)
        print("Successfully loaded:", task_name)
        for es in es_dict:
            for seed in seeds:
                key = jax.random.key(seed)
                run_experiment_permutations(problems=[problem],
                                            es_dict={es: es_dict[es]},
                                            num_generations=num_generations,
                                            population_size=population_size,
                                            seed=seed,
                                            result_dir=result_dir, 
                                            run_again_if_exist=False)
    except Exception as e:
        print("Failed to load:", task_name, e)
        continue

Loading Problems ..:   0%|          | 0/4 [00:00<?, ?it/s]

Successfully loaded: MNIST



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A

running the experiment ... [../experiment_results/TorchVisionProblem/MNIST/SimpleES/0.json]


2025-08-08 18:44:30.526171: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.27GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:44:30.705196: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.39GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:44:30.705273: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.65GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:44:30.705312: W external/xla/xla/tsl/framework/bfc_allocato

Failed to load: MNIST RESOURCE_EXHAUSTED: Out of memory while trying to allocate 6577493416 bytes.
Successfully loaded: FashionMNIST



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A

running the experiment ... [../experiment_results/TorchVisionProblem/FashionMNIST/SimpleES/0.json]


2025-08-08 18:45:05.337869: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.39GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:45:05.337921: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.65GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:45:05.337949: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.00GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:45:05.817099: W external/xla/xla/tsl/framework/bfc_allocato

Failed to load: FashionMNIST RESOURCE_EXHAUSTED: Out of memory while trying to allocate 6577493416 bytes.
Successfully loaded: CIFAR10



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A

running the experiment ... [../experiment_results/TorchVisionProblem/CIFAR10/SimpleES/0.json]


2025-08-08 18:45:53.511734: W external/xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.30GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-08 18:45:53.512488: W external/xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:1095] Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.3 = (f32[1024,1024,32,32]{3,2,1,0}, u8[0]{0}) custom-call(f32[1024,512,32,32]{3,2,1,0} %bitcast.6932, f32[1024,8,5,5]{3,2,1,0} %bitcast.6936), window={size=5x5 pad=2_2x2_2}, dim_labels=bf01_oi01->bf01, feature_group_count=64, custom_call_target="__cudnn$convForward", metadata={op_name="jit(eval)/jit(main)/vmap(CNN)/Conv_1/conv_general_dilated" source_file="/home/ronedr/.local/lib/python3.11/site-packages/flax/linen/linear.py" source_line=694}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"

Failed to load: CIFAR10 RESOURCE_EXHAUSTED: Out of memory while trying to allocate 8590759336 bytes.
Successfully loaded: SVHN



Running ES algorithms:   0%|          | 0/1 [00:00<?, ?it/s][A

running the experiment ... [../experiment_results/TorchVisionProblem/SVHN/SimpleES/0.json]


2025-08-08 18:46:59.568683: W external/xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:1095] Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.3 = (f32[1024,1024,32,32]{3,2,1,0}, u8[0]{0}) custom-call(f32[1024,512,32,32]{3,2,1,0} %bitcast.6932, f32[1024,8,5,5]{3,2,1,0} %bitcast.6936), window={size=5x5 pad=2_2x2_2}, dim_labels=bf01_oi01->bf01, feature_group_count=64, custom_call_target="__cudnn$convForward", metadata={op_name="jit(eval)/jit(main)/vmap(CNN)/Conv_1/conv_general_dilated" source_file="/home/ronedr/.local/lib/python3.11/site-packages/flax/linen/linear.py" source_line=694}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0},"force_earliest_schedule":false}

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 4311744512 bytes. [tf-allocator-allocation-error='']

As a result, convolution per

Failed to load: SVHN RESOURCE_EXHAUSTED: Out of memory while trying to allocate 8590759344 bytes.



