In [1]:
import os
from utils import set_up_logging, get_device
from training import train_with_ray_factory
from config import LOGS_PATH, RUNS_PATH, TRAIN_DATA, TEST_DATA
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import os
from ray.air import RunConfig

# Search-budget constants consumed by the tuning cell below.
TRIAL_COUNT = 100  # passed to TuneConfig as num_samples
CHUNK_COUNT = 40  # stored as config["chunk_count"]; also used as ASHA's max_t
EPOCH_COUNT = 2  # stored under the "edit_count" config key

set_up_logging(LOGS_PATH)

# The allocator option is placed in the environment before get_device() runs.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
device = get_device()

# Bare last expression: the notebook renders it as this cell's output.
f"Using device {device}"

'Using device cuda:0'

In [2]:
# Candidate values for the "features" hyperparameter; tune.choice draws one
# of these lists per trial.
feature_choices = [
    [16, 32, 64],
    [16, 32, 64, 128],
    [32, 64],
    [32, 128],
    [8, 16, 32],
    [8, 8, 8, 8, 8],
    [8, 8, 8, 8, 8, 8, 8],
    [16, 16, 16],
    [16, 16, 16, 16, 16],
    [32, 32, 32],
    [32, 32, 32, 32],
    [64, 64],
    [64, 64, 64],
]

# Hyperparameter search space. Plain values are fixed for every trial;
# tune.* entries are sampled independently per trial.
config = {
    "batch_size": 32,
    # NOTE(review): an epoch constant is stored under "edit_count" — confirm
    # this is the intended key.
    "edit_count": EPOCH_COUNT,
    "bin_count": 32,
    "learning_rate": tune.loguniform(5e-4, 5e-3),
    "scheduler_gamma": tune.uniform(0.8, 0.95),
    "elu_alpha": tune.uniform(0.5, 2),
    "leaky_relu_slope": tune.uniform(0, 0.03),
    "dropout_prob": tune.uniform(0, 0.1),
    "chunk_count": CHUNK_COUNT,
    "features": tune.choice(feature_choices),
    "use_residual": tune.choice([True, False]),
    "kernel_size": tune.choice([3, 5]),
    "model_type": tune.choice(["HistogramNet"]),
    "use_instance_norm": True,
    "use_elu": tune.choice([True, False]),
    "leaky_relu_alpha": tune.uniform(0, 0.05),
}

# ASHA early-stopping scheduler: max_t comes from the configured chunk count,
# with a grace period of 2 (in the scheduler's default time attribute).
scheduler = ASHAScheduler(max_t=config["chunk_count"], grace_period=2)

# Build the trainable from the project factory, then attach the per-trial
# resource request.
trainable = train_with_ray_factory(
    train_data_paths=TRAIN_DATA,
    test_data_paths=TEST_DATA,
    device=device,
    log_dir=RUNS_PATH / "custom",
)
trainable = tune.with_resources(
    tune.with_parameters(trainable),
    resources={"cpu": 32, "gpu": 1},
)

# Experiment results are stored under RUNS_PATH in an experiment named "tune".
run_config = RunConfig(storage_path=RUNS_PATH, name="tune")

# Minimise "chunk_test_loss" across TRIAL_COUNT sampled configurations.
tune_config = tune.TuneConfig(
    metric="chunk_test_loss",
    mode="min",
    scheduler=scheduler,
    num_samples=TRIAL_COUNT,
)

tuner = tune.Tuner(
    trainable,
    run_config=run_config,
    tune_config=tune_config,
    param_space=config,
)

# Runs the whole search; `results` is read by the reporting cell below.
results = tuner.fit()

0,1
Current time:,2024-09-01 22:06:10
Running for:,00:02:06.64
Memory:,22.4/47.0 GiB

Trial name,status,loc,batch_size,dropout_prob,elu_alpha,features,kernel_size,leaky_relu_alpha,leaky_relu_slope,learning_rate,model_type,scheduler_gamma,use_elu,use_residual,iter,total time (s),chunk_test_loss,chunk_training_loss
train_with_ray_b7d3c_00000,TERMINATED,172.29.235.222:1134109,32,0.00395892,1.35107,"[8, 16, 32]",5,0.0448715,0.00664526,0.0029226,HistogramNet,0.816752,True,True,3,70.652,5.22149,56.5472
train_with_ray_b7d3c_00001,TERMINATED,172.29.235.222:1140440,32,0.0439061,1.74642,"[8, 8, 8, 8, 8,_c140",3,0.00579656,0.0125715,0.00155182,HistogramNet,0.898059,True,False,2,45.4228,5.79311,64.0481


[36m(train_with_ray pid=1134109)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000000)
[36m(train_with_ray pid=1134109)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000001)
[36m(train_with_ray pid=1134109)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00000_0_batch_size=32,dropout_prob=0.0040,elu_alpha=1.3511,features=8_16_32,kernel_size=5,leaky_relu_alpha=0._2024-09-01_22-04-03/checkpoint_000002)
[36m(train_with_ray pid=1140440)

[36m(train_with_ray pid=1140440)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/andras/projects/bipolaroid/runs5/tune/train_with_ray_b7d3c_00001_1_batch_size=32,dropout_prob=0.0439,elu_alpha=1.7464,features=8_8_8_8_8_8_8,kernel_size=3,leaky_relu_al_2024-09-01_22-04-03/checkpoint_000001)


In [3]:
# Select the trial with the lowest "chunk_test_loss" from the finished search.
best_result = results.get_best_result(metric="chunk_test_loss", mode="min")

# Report the winning hyperparameters first, then the loss that trial reported.
best_config = best_result.config
print("Best trial config: {}".format(best_config))

best_loss = best_result.metrics["chunk_test_loss"]
print("Best trial final validation loss: {}".format(best_loss))

Best trial config: {'batch_size': 32, 'edit_count': 3, 'bin_count': 32, 'learning_rate': 0.0029226033808016005, 'scheduler_gamma': 0.8167516482513361, 'elu_alpha': 1.3510723758569865, 'leaky_relu_slope': 0.0066452562138349025, 'dropout_prob': 0.0039589213934103865, 'chunk_count': 4, 'features': [8, 16, 32], 'use_residual': True, 'kernel_size': 5, 'model_type': 'HistogramNet', 'use_instance_norm': True, 'use_elu': True, 'leaky_relu_alpha': 0.04487148648446764}
Best trial final validation loss: 5.2214884757995605


In [4]:
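# NOTE: disabled manual single-run example, kept for reference. Before
# re-enabling: `train` is not imported in the import cell above, and the call
# below passes `hparams` both positionally and again as `**hparams` — confirm
# against `train`'s signature.
# hparams = {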
#     "batch_size": 64,
#     "edit_count": 12,
#     "bin_count": 16,
#     "learning_rate": 0.0006126108207352808,
#     "scheduler_gamma": 0.9382286228762693,
#     "num_epochs": 10,
#     "elu_alpha": 1.3092260477215776,
#     "leaky_relu_slope": 0.029438156325552762,
#     "dropout_prob": 0.06261255195786307,
#     "features": [8, 16, 32],
#     "use_residual": True,
#     "kernel_size": 5,
#     "model_type": "HistogramNet",
#     "use_instance_norm": True,
#     "use_elu": False,
#     "leaky_relu_alpha": 0.03745605986732464,
# }

# train(
#     hparams,
#     train_data_paths=TRAIN_DATA,
#     test_data_paths=TEST_DATA,
#     log_dir=RUNS_PATH,
#     max_duration=None,
#     use_tqdm=True,
#     device=device,
#     **hparams
# )