In [1]:
import os

# Set the PyTorch CUDA allocator option through its environment variable
# before any project module or Ray is imported.
# NOTE(review): presumably this must happen before CUDA is first initialised
# to have any effect — confirm that nothing imported below touches CUDA earlier.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "expandable_segments:True"  # avoid fragmented CUDA memory
)

from config import LOGS_PATH, RUNS_PATH, TRAIN_DATA, TEST_DATA
from utils import set_up_logging

# Logging is configured before the remaining imports run.
# NOTE(review): presumably so that messages emitted while importing
# `training` / `ray` are captured under LOGS_PATH — confirm; if the order does
# not matter, the imports below can be merged with the ones above.
set_up_logging(LOGS_PATH)

from utils import get_device
from training import train_with_ray_factory
from ray import tune
import ray
from ray.tune.schedulers import ASHAScheduler
from ray.air import RunConfig


# --- Search budget ------------------------------------------------------------
TRIAL_COUNT = 50  # passed to Ray Tune as `num_samples`
EPOCH_COUNT = 4
# Previously a bare `40` inside the CHUNK_COUNT expression.
# NOTE(review): the name is inferred from the arithmetic
# (chunks = epochs * 40) — confirm it really means "chunks per epoch".
CHUNKS_PER_EPOCH = 40
CHUNK_COUNT = EPOCH_COUNT * CHUNKS_PER_EPOCH

# --- Ray runtime --------------------------------------------------------------
# SECURITY: "0.0.0.0" binds the Ray dashboard to every network interface of
# this machine. Ray's security guidance is to expose the dashboard only on
# trusted networks; switch to "127.0.0.1" when remote access is not required.
DASHBOARD_HOST = "0.0.0.0"

ray.init(
    include_dashboard=True,
    dashboard_host=DASHBOARD_HOST,
    # Re-running this cell in a live kernel would otherwise raise because Ray
    # is already initialised; with this flag the existing instance is reused.
    ignore_reinit_error=True,
)

device = get_device()
# Bare expression on the last line so the notebook displays it as cell output.
f"Using device {device}"

2024-09-05 22:18:20,653	INFO worker.py:1774 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://172.29.235.222:8265 [39m[22m


'Using device cuda:0'

In [2]:
# Candidate values for the "features" hyperparameter; `tune.choice` below picks
# one of these lists per trial. Order is kept exactly as originally written.
FEATURE_CHOICES = [
    [16, 32, 64],
    [16, 32, 64, 128],
    [16, 32, 64, 128, 256],
    [16, 32, 32, 32, 64],
    [32, 64],
    [32, 128],
    [32, 64, 128],
    [32, 64, 128, 256],
    [16, 16, 16, 16, 16],
    [16, 16, 16, 16, 16, 16, 16, 16],
    [16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
    [32, 32, 32],
    [32, 32, 32, 32],
    [64, 64, 64],
    [64, 64, 64, 64],
    [64, 64, 64, 64, 64],
    [256, 64, 256],
]

# Hyperparameter space handed to the tuner as `param_space`.
# Plain values are fixed for every trial; `tune.*` entries are sampled per trial.
config = {
    # -- data / batching --
    "batch_size": 48,
    # NOTE(review): "edit_count" is fed from EPOCH_COUNT — confirm that these
    # two quantities are really meant to be the same number.
    "edit_count": EPOCH_COUNT,
    "bin_count": 32,
    # -- optimisation --
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    "scheduler_gamma": tune.uniform(0.94, 0.9999),
    # -- activation / regularisation --
    # "elu_alpha": tune.uniform(0.5, 2),  # disabled; "use_elu" is False below
    # NOTE(review): both "leaky_relu_slope" and "leaky_relu_alpha" (last key)
    # are searched — confirm the training code consumes both.
    "leaky_relu_slope": tune.uniform(0, 0.03),
    "dropout_prob": tune.uniform(0, 0.1),
    # -- training length --
    "chunk_count": CHUNK_COUNT,
    # -- architecture --
    "features": tune.choice(FEATURE_CHOICES),
    "use_residual": True,
    "kernel_size": tune.choice([3, 5]),
    "model_type": "HistogramNet",
    "use_instance_norm": True,
    "use_elu": False,
    "leaky_relu_alpha": tune.uniform(0, 0.07),
}

# 1) Trainable: the factory is given the data paths, device and log directory;
#    the result is then wrapped for Ray Tune and given a per-trial resource
#    request of 32 CPUs and 1 GPU.
trainable = train_with_ray_factory(
    train_data_paths=TRAIN_DATA,
    test_data_paths=TEST_DATA,
    device=device,
    log_dir=RUNS_PATH / "custom",
)
trainable = tune.with_parameters(trainable)
trainable = tune.with_resources(trainable, resources={"cpu": 32, "gpu": 1})

# 2) Where Ray stores the experiment: <RUNS_PATH>/tune.
run_config = RunConfig(storage_path=RUNS_PATH, name="tune")

# 3) Search strategy: TRIAL_COUNT samples, minimising "chunk_test_loss", with
#    ASHA measuring progress through the reported "chunk_id" attribute.
asha_scheduler = ASHAScheduler(
    max_t=config["chunk_count"],
    grace_period=2,
    time_attr="chunk_id",
)
tune_config = tune.TuneConfig(
    metric="chunk_test_loss",
    mode="min",
    scheduler=asha_scheduler,
    num_samples=TRIAL_COUNT,
)

# 4) Assemble and launch the search (long-running).
tuner = tune.Tuner(
    trainable,
    run_config=run_config,
    tune_config=tune_config,
    param_space=config,
)
results = tuner.fit()

0,1
Current time:,2024-09-05 22:23:47
Running for:,00:05:24.00
Memory:,22.7/54.9 GiB

Trial name,status,loc,dropout_prob,features,kernel_size,leaky_relu_alpha,leaky_relu_slope,learning_rate,scheduler_gamma
train_with_ray_61b01_00000,RUNNING,172.29.235.222:3274755,0.0240677,"[16, 32, 64, 12_25c0",3,0.0147726,0.0164835,0.00409419,0.977701
train_with_ray_61b01_00001,PENDING,,0.0252525,"[16, 32, 64, 12_9940",3,0.0607274,0.00769014,0.00175452,0.971626
train_with_ray_61b01_00002,PENDING,,0.0492015,"[16, 32, 64, 12_0940",5,0.0522751,0.00532775,0.000102449,0.985839
train_with_ray_61b01_00003,PENDING,,0.0091843,"[16, 32, 64, 12_8e80",5,0.0215584,0.00986109,0.0046811,0.969517
train_with_ray_61b01_00004,PENDING,,0.0200707,"[16, 32, 64, 12_e200",5,0.00839403,0.0236732,0.00456458,0.949592
train_with_ray_61b01_00005,PENDING,,0.071518,"[16, 32, 64, 12_4200",5,0.00955745,0.000338297,0.0012211,0.959114
train_with_ray_61b01_00006,PENDING,,0.00712943,"[16, 32, 64, 12_f8c0",3,0.0128474,0.0214861,0.000319318,0.954027
train_with_ray_61b01_00007,PENDING,,0.0554306,"[16, 32, 64, 12_1900",3,0.0686442,0.0103842,0.00856337,0.966035
train_with_ray_61b01_00008,PENDING,,0.0637642,"[16, 32, 64, 12_8f80",5,0.0125659,0.00386933,0.00135947,0.955525
train_with_ray_61b01_00009,PENDING,,0.0526107,"[16, 32, 64, 12_16c0",5,0.00576232,0.0120612,0.000143394,0.983664




In [None]:
# Select the trial with the lowest "chunk_test_loss" from the finished search.
# NOTE(review): with Ray's default scope this presumably compares each trial's
# last reported value, so early-stopped trials are judged at an earlier chunk
# than completed ones — confirm this is the intended comparison.
best_result = results.get_best_result(metric="chunk_test_loss", mode="min")

best_config = best_result.config
print("Best trial config: {}".format(best_config))

# The label says "validation loss"; the value printed is the
# "chunk_test_loss" metric of the selected trial.
best_loss = best_result.metrics["chunk_test_loss"]
print("Best trial final validation loss: {}".format(best_loss))

Best trial config: {'batch_size': 32, 'edit_count': 3, 'bin_count': 32, 'learning_rate': 0.0029226033808016005, 'scheduler_gamma': 0.8167516482513361, 'elu_alpha': 1.3510723758569865, 'leaky_relu_slope': 0.0066452562138349025, 'dropout_prob': 0.0039589213934103865, 'chunk_count': 4, 'features': [8, 16, 32], 'use_residual': True, 'kernel_size': 5, 'model_type': 'HistogramNet', 'use_instance_norm': True, 'use_elu': True, 'leaky_relu_alpha': 0.04487148648446764}
Best trial final validation loss: 5.2214884757995605


In [None]:
# Disabled cell: every line below is commented out, so running it does nothing.
# It sketches one direct `train(...)` call with a fixed `hparams` dict instead
# of a Ray Tune search.
# NOTE(review): before re-enabling, check that
#   (a) `train` is imported — no visible cell of this notebook imports it, and
#   (b) the call is what the signature expects: `hparams` is passed both as the
#       first positional argument and unpacked again through `**hparams`.
# hparams = {
#     "batch_size": 64,
#     "edit_count": 12,
#     "bin_count": 16,
#     "learning_rate": 0.0006126108207352808,
#     "scheduler_gamma": 0.9382286228762693,
#     "num_epochs": 10,
#     "elu_alpha": 1.3092260477215776,
#     "leaky_relu_slope": 0.029438156325552762,
#     "dropout_prob": 0.06261255195786307,
#     "features": [8, 16, 32],
#     "use_residual": True,
#     "kernel_size": 5,
#     "model_type": "HistogramNet",
#     "use_instance_norm": True,
#     "use_elu": False,
#     "leaky_relu_alpha": 0.03745605986732464,
# }

# train(
#     hparams,
#     train_data_paths=TRAIN_DATA,
#     test_data_paths=TEST_DATA,
#     log_dir=RUNS_PATH,
#     max_duration=None,
#     use_tqdm=True,
#     device=device,
#     **hparams
# )