In [1]:
from pathlib import Path

import pandas as pd
from lightning import pytorch as pl
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler

from chemprop import data, featurizers, models, nn

  from .autonotebook import tqdm as notebook_tqdm
2025-03-12 00:18:14,722	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-12 00:18:15,460	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-12 00:18:16,578	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
chemprop_dir = Path.cwd().parent
train_path = "train_data_5ht1b.csv"
test_path = "test_data_5ht1b.csv"
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'smiles' # name of the column containing SMILES strings
target_columns = ['value'] # list of names of the columns containing targets

hpopt_save_dir = Path.cwd() / "hpopt_5ht1b_zinc" # directory to save hyperopt results
hpopt_save_dir.mkdir(exist_ok=True)

In [3]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [4]:
# Extract SMILES and target values
train_smis = df_train[smiles_column].values
train_ys = df_train[target_columns].values

test_smis = df_test[smiles_column].values
test_ys = df_test[target_columns].values

# Convert data to MoleculeDatapoint format
train_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(train_smis, train_ys)]
test_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(test_smis, test_ys)]


In [5]:
# Initialize featurizer
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Create training dataset
train_dset = data.MoleculeDataset(train_data, featurizer)
scaler = train_dset.normalize_targets()

# Split validation set from training data (80% train, 20% validation)
split_idx = int(len(train_data) * 0.8)
val_data = train_data[split_idx:]
train_data = train_data[:split_idx]

# Create validation and test datasets
val_dset = data.MoleculeDataset(val_data, featurizer)
val_dset.normalize_targets(scaler)

test_dset = data.MoleculeDataset(test_data, featurizer)

In [6]:
def train_model(config, train_dset, val_dset, num_workers, scaler):

    # config is a dictionary containing hyperparameters used for the trial
    depth = int(config["depth"])
    ffn_hidden_dim = int(config["ffn_hidden_dim"])
    ffn_num_layers = int(config["ffn_num_layers"])
    message_hidden_dim = int(config["message_hidden_dim"])

    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    mp = nn.BondMessagePassing(d_h=message_hidden_dim, depth=depth)
    agg = nn.MeanAggregation()
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(output_transform=output_transform, input_dim=message_hidden_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers)
    batch_norm = True
    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
    model = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=20, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)

In [7]:
search_space = {
    "depth": tune.qrandint(lower=2, upper=6, q=1),
    "ffn_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
    "ffn_num_layers": tune.qrandint(lower=1, upper=3, q=1),
    "message_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
}

In [8]:
ray.init()

scheduler = FIFOScheduler()

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    use_gpu=False, # change to True if you want to use GPU
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=hpopt_save_dir / "ray_results", # directory to save the results
)

ray_trainer = TorchTrainer(
    lambda config: train_model(
        config, train_dset, val_dset, num_workers, scaler
    ),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=1, # number of random evaluations before tree parzen estimators
    random_state_seed=42,
)

# OptunaSearch is another search algorithm that can be used
# search_alg = OptunaSearch()

tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    num_samples=2, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id), # shorten filepaths

)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()

0,1
Current time:,2025-03-12 00:23:49
Running for:,00:04:22.43
Memory:,58.6/755.0 GiB

Trial name,status,loc,train_loop_config/de pth,train_loop_config/ff n_hidden_dim,train_loop_config/ff n_num_layers,train_loop_config/me ssage_hidden_dim,iter,total time (s),train_loss,train_loss_step,val/rmse
TorchTrainer_6f0157cc,TERMINATED,10.233.0.55:2724912,2,2000,2,500,20,250.919,0.43881,0.376247,0.616044
TorchTrainer_6ccfe64d,TERMINATED,10.233.0.55:2725149,2,2200,2,400,20,225.523,0.352592,0.305612,0.538872


[36m(TorchTrainer pid=2724912)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2724912)[0m - (node_id=0194c22a06ea2b7a23c8f0f411a980ab368c58c49aab1c08f33a3da5, ip=10.233.0.55, pid=2725150) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=2725150)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=2725150)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=2725150)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2725150)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2725150)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=2725150)[0m /home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_wor

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=2725150)[0m /home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/core/saving.py:363: Skipping 'metrics' parameter because it is not possible to safely dump to YAML.
[36m(RayTrainWorker pid=2725150)[0m /home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.91it/s]
Training: |          | 0/? [00:00<?, ?it/s]                                
Epoch 0:   0%|          | 0/28 [00:00<?, ?it/s] 
Epoch 0:   4%|▎         | 1/28 [00:00<00:11,  2.38it/s, v_num=3.41e+7, train_loss_step=0.999]
Epoch 0:   7%|▋         | 2/28 [00:00<00:09,  2.62it/s, v_num=3.41e+7, train_loss_step=0.720]
Epoch 0:  11%|█         | 3/28 [00:01<00:09,  2.63it/s, v_num=3.41e+7, train_loss_step=0.837]
Epoch 0:  14%|█▍        | 4/28 [00:01<00:08,  2.71it/s, v_num=3.41e+7, train_loss_step=0.980]
Epoch 0:  18%|█▊        | 5/28 [00:01<00:08,  2.73it/s, v_num=3.41e+7, train_loss_step=0.763]
Epoch 0:  21%|██▏       | 6/28 [00:02<00:08,  2.73it/s, v_num=3.41e+7, train_loss_step=0.773]
Epoch 0:  25%|██▌       | 7/28 [00:02<00:07,  2.72it/s, v_num=3.41e+7, train_loss_step=0.738]
Epoch 0:  29%|██▊       | 8/28 [00:02<00:07,  2.74it/s, v_num=3.41e+7, train_loss_step=1.010]
Epoch 0:  32%|███▏      | 9/28 [00:03<00:06,  2

[36m(TorchTrainer pid=2725149)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2725149)[0m - (node_id=0194c22a06ea2b7a23c8f0f411a980ab368c58c49aab1c08f33a3da5, ip=10.233.0.55, pid=2725548) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=2725548)[0m Setting up process group for: env:// [rank=0, world_size=1]


Epoch 0:  68%|██████▊   | 19/28 [00:06<00:03,  2.89it/s, v_num=3.41e+7, train_loss_step=0.596]
Epoch 0:  71%|███████▏  | 20/28 [00:06<00:02,  2.89it/s, v_num=3.41e+7, train_loss_step=0.883]
Epoch 0:  75%|███████▌  | 21/28 [00:07<00:02,  2.89it/s, v_num=3.41e+7, train_loss_step=0.782]
Epoch 0:  79%|███████▊  | 22/28 [00:07<00:02,  2.89it/s, v_num=3.41e+7, train_loss_step=0.514]
Epoch 0:  82%|████████▏ | 23/28 [00:07<00:01,  2.89it/s, v_num=3.41e+7, train_loss_step=0.512]
Epoch 0:  86%|████████▌ | 24/28 [00:08<00:01,  2.89it/s, v_num=3.41e+7, train_loss_step=0.975]
Epoch 0:  89%|████████▉ | 25/28 [00:08<00:01,  2.90it/s, v_num=3.41e+7, train_loss_step=0.704]
Epoch 0:  93%|█████████▎| 26/28 [00:08<00:00,  2.90it/s, v_num=3.41e+7, train_loss_step=0.807]
Epoch 0:  96%|█████████▋| 27/28 [00:09<00:00,  2.90it/s, v_num=3.41e+7, train_loss_step=0.717]
Epoch 0: 100%|██████████| 28/28 [00:09<00:00,  2.92it/s, v_num=3.41e+7, train_loss_step=0.682]
Validation: |          | 0/? [00:00<?, ?it/s][A
V

[36m(RayTrainWorker pid=2725548)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=2725548)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=2725548)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=2725548)[0m Loading `train_dataloader` to estimate number of stepping batches.
[36m(RayTrainWorker pid=2725548)[0m /home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(RayTrainWorker pid=2725548)[0m /home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (28) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_st

[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.97it/s][A
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.24it/s][A
Epoch 0: 100%|██████████| 28/28 [00:11<00:00,  2.53it/s, v_num=3.41e+7, train_loss_step=0.682, val_loss=0.841]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  4.75it/s]


[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000000)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 0: 100%|██████████| 28/28 [00:11<00:00,  2.48it/s, v_num=3.41e+7, train_loss_step=0.682, val_loss=0.841, train_loss_epoch=0.772]
Epoch 0:   0%|          | 0/28 [00:00<?, ?it/s]                            
Epoch 1:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.682, val_loss=0.841, train_loss_epoch=0.772]         
Epoch 0:   4%|▎         | 1/28 [00:00<00:10,  2.68it/s, v_num=3.41e+7, train_loss_step=0.988]
Epoch 0:  57%|█████▋    | 16/28 [00:05<00:04,  2.97it/s, v_num=3.41e+7, train_loss_step=0.813][32m [repeated 28x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
Epoch 0:  93%|█████████▎| 26/28 [00:08<00:00,  3.02it/s, v_num=3.41e+7, train_loss_step=0.788]
Epoch 0:  96%|█████████▋| 27/28 [00:08<00:00,  3.02it/s, v_num=3.41e+7, train_loss_step=0.637]
Epoch 0: 100%|█████████

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.76it/s][A
Epoch 0: 100%|██████████| 28/28 [00:10<00:00,  2.66it/s, v_num=3.41e+7, train_loss_step=0.623, val_loss=0.846]


[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000000)


[36m(RayTrainWorker pid=2725150)[0m 
Epoch 0: 100%|██████████| 28/28 [00:10<00:00,  2.60it/s, v_num=3.41e+7, train_loss_step=0.623, val_loss=0.846, train_loss_epoch=0.772]
Epoch 1:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.623, val_loss=0.846, train_loss_epoch=0.772]         
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 1: 100%|██████████| 28/28 [00:10<00:00,  2.64it/s, v_num=3.41e+7, train_loss_step=0.625, val_loss=0.841, train_loss_epoch=0.772][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.01it/s][A[32m [repeated 5x across cluster][0m
Epoch 1:  50%|█████     | 14/28 [00:04<00:04,  3.05it/s, v_num=3.41e+7, train_loss_step=0.792, val_loss=0.846, train_loss_epoch=0.772][32m [repeated 21x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.27it/s][A
Epoch 1: 100%|██████████| 28/28 [00:12<00:00,  2.32it/s, v_num=3.41e+7, train_loss_step=0.625, val_loss=0.787, train_loss_epoch=0.772]
Epoch 1: 100%|██████████| 28/28 [00:12<00:00,  2.28it/s, v_num=3.41e+7, train_loss_step=0.625, val_loss=0.787, train_loss_epoch=0.719]
Epoch 2:   0%|          | 0/2

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000001)


Epoch 2:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.620, val_loss=0.811, train_loss_epoch=0.709]         
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 2: 100%|██████████| 28/28 [00:10<00:00,  2.71it/s, v_num=3.41e+7, train_loss_step=0.683, val_loss=0.787, train_loss_epoch=0.719][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000002)


Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.76it/s][A[32m [repeated 5x across cluster][0m
Epoch 3:   7%|▋         | 2/28 [00:00<00:09,  2.65it/s, v_num=3.41e+7, train_loss_step=1.050, val_loss=0.600, train_loss_epoch=0.714][32m [repeated 18x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.06it/s][A
Epoch 2: 100%|██████████| 28/28 [00:11<00:00,  2.36it/s, v_num=3.41e+7, train_loss_step=0.683, val_loss=0.600, train_loss_epoch=0.719]
Epoch 2: 100%|██████████| 28/28 [00:12<00:00,  2.31it/s, v_num=3.41e+7, train_loss_step=0.683, val_loss=0.600, train_loss_epoch=0.714]
Epoch 3:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.683, val_loss=0.600, train_loss_epoch=0.714]         
Epoch 2:  93%|█████████▎| 26/28 [00:08<00:00,  3.13it/s, v_num=3.41e

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000002)


Epoch 3:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.710, val_loss=0.603, train_loss_epoch=0.705]         
Epoch 3:  96%|█████████▋| 27/28 [00:10<00:00,  2.68it/s, v_num=3.41e+7, train_loss_step=0.664, val_loss=0.600, train_loss_epoch=0.714][32m [repeated 2x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Epoch 3:  39%|███▉      | 11/28 [00:03<00:05,  3.14it/s, v_num=3.41e+7, train_loss_step=0.804, val_loss=0.603, train_loss_epoch=0.705][32m [repeated 21x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  4.18it/s][A[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.35it/s][A
Epoch 3: 100%|██████████| 28/28 [00:11<00:00,  2.37it/s, v_num=3.41e+7, train_loss_step=0.789, val_loss=0.593, train_loss_epoch=0.714]


[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000003)


Epoch 3: 100%|██████████| 28/28 [00:12<00:00,  2.32it/s, v_num=3.41e+7, train_loss_step=0.789, val_loss=0.593, train_loss_epoch=0.666]
Epoch 4:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.789, val_loss=0.593, train_loss_epoch=0.666]         
Epoch 3: 100%|██████████| 28/28 [00:10<00:00,  2.69it/s, v_num=3.41e+7, train_loss_step=0.789, val_loss=0.600, train_loss_epoch=0.714]
Epoch 3:  93%|█████████▎| 26/28 [00:08<00:00,  3.19it/s, v_num=3.41e+7, train_loss_step=0.541, val_loss=0.603, train_loss_epoch=0.705]
Epoch 4:  36%|███▌      | 10/28 [00:03<00:06,  2.77it/s, v_num=3.41e+7, train_loss_step=0.532, val_loss=0.593, train_loss_epoch=0.666][32m [repeated 24x across cluster][0m
Epoch 3:  96%|█████████▋| 27/28 [00:08<00:00,  3.18it/s, v_num=3.41e+7, train_loss_step=0.656, val_loss=0.603, train_loss_epoch=0.705]
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.08it/s][A[32m [repeated 3x across cluster][0m
Epoch 3: 100%|██████████| 28/28 [00:08<

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000003)


Epoch 4:  32%|███▏      | 9/28 [00:03<00:06,  3.00it/s, v_num=3.41e+7, train_loss_step=0.565, val_loss=0.592, train_loss_epoch=0.649][32m [repeated 22x across cluster][0m
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.96it/s][A[32m [repeated 5x across cluster][0m
Epoch 4:  93%|█████████▎| 26/28 [00:09<00:00,  2.69it/s, v_num=3.41e+7, train_loss_step=0.646, val_loss=0.593, train_loss_epoch=0.666]
Epoch 4:  96%|█████████▋| 27/28 [00:10<00:00,  2.69it/s, v_num=3.41e+7, train_loss_step=0.739, val_loss=0.593, train_loss_epoch=0.666]
Epoch 4: 100%|██████████| 28/28 [00:10<00:00,  2.72it/s, v_num=3.41e+7, train_loss_step=0.514, val_loss=0.593, train_loss_epoch=0.666]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000004)


Epoch 5:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.514, val_loss=1.920, train_loss_epoch=0.662]         
Epoch 5:  14%|█▍        | 4/28 [00:01<00:08,  2.77it/s, v_num=3.41e+7, train_loss_step=0.700, val_loss=1.920, train_loss_epoch=0.662][32m [repeated 21x across cluster][0m
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.43it/s][A[32m [repeated 5x across cluster][0m
Epoch 4:  93%|█████████▎| 26/28 [00:08<00:00,  3.06it/s, v_num=3.41e+7, train_loss_step=0.631, val_loss=0.592, train_loss_epoch=0.649]
Epoch 4:  96%|█████████▋| 27/28 [00:08<00:00,  3.04it/s, v_num=3.41e+7, train_loss_step=0.669, val_loss=0.592, train_loss_epoch=0.649]
Epoch 4: 100%|██████████| 28/28 [00:09<00:00,  3.06it/s, v_num=3.41e+7, train_loss_step=0.526, val_loss=0.592, train_loss_epoch=0.649]
[36m(RayTrainWorker pid=2725548)[0m 
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725548)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  5.25it/s][A
Epoch 4: 100%|██████████| 28/28 [00:10<00:00,  2.71it/s, v_num=3.41e+7, train_loss_step=0.526, val_loss=1.220, train_loss_epoch=0.649]
Epoch 4: 100%|██████████| 28/28 [00:10<00:00,  2.64it/s, v_num=3.41e+7, train_loss_step=0.526, val_loss=1.220, train_loss_epoch=0.648]
Epoch 5:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.526, val_loss=1.220, train_loss_epoch=0.648]         
Epoch 5:  64%|██████▍   | 18/28 [00:06<00:03,  2.72it/s, v_num=3.41e+7, train_loss_step=0.778, val_loss=1.920, train_loss_epoch=0.662][32m [repeated 22x across cluster][0m
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  5.00it/s][A[32m [repeated 5x across cluster][0m
Epoch 5:  93%|█████████▎| 26/28 [00:09<00:00,  2.69it/s, v_num=3.41e+7, train_loss_step=0.454, val_loss=1.920, train_loss_epoch=0.662]
Epoch 5:  96%|█████████▋| 27/28 [00:10<00:00,  2.68it/s, v_nu

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000005)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 5: 100%|██████████| 28/28 [00:09<00:00,  3.11it/s, v_num=3.41e+7, train_loss_step=0.566, val_loss=1.220, train_loss_epoch=0.648][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Epoch 6:  21%|██▏       | 6/28 [00:01<00:06,  3.37it/s, v_num=3.41e+7, train_loss_step=0.655, val_loss=0.532, train_loss_epoch=0.635][32m [repeated 21x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.80it/s][A
Epoch 5: 100%|██████████| 28/28 [00:10<00:00,  2.72it/s, v_num=3.41e+7, train_loss_step=0.566, val_loss=0.532, train_loss_epoch=0.648]
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.46it/s][A[32m [repeated 5x across cluster][0m
Epoch 6:  29%|██▊       | 8/28 [00:02<00:05,  3.36it/s, v_num=3.41e+7, train_loss_step=0.455, val_loss=0.532, train_loss_epoch=0.635]
Epoch 6:  29%|██▊       | 8/28 

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000006)[32m [repeated 2x across cluster][0m


Epoch 7:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.800, val_loss=0.526, train_loss_epoch=0.620]         
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 6: 100%|██████████| 28/28 [00:08<00:00,  3.23it/s, v_num=3.41e+7, train_loss_step=0.736, val_loss=0.532, train_loss_epoch=0.635][32m [repeated 3x across cluster][0m
Epoch 7:  29%|██▊       | 8/28 [00:02<00:07,  2.81it/s, v_num=3.41e+7, train_loss_step=0.606, val_loss=0.526, train_loss_epoch=0.620][32m [repeated 16x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.77it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  5.08it/s][A
Epoch 6: 100%|██████████| 28/28 [00:09<00:00,  2.83it/s, v_num=3.41e+7, train_loss_step=0.736, val_loss=0.522, train_loss_epoch=0.635]
Epoch 6: 100%|██████████| 28/28 [00:10<00:00,  2.76it/s, v_num=3.41e+7, train_loss_step=0.736, val_loss=0.522, train_loss_epoch=0.612]
Epoch 7:   0%|          | 0/28

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000007)[32m [repeated 2x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 7: 100%|██████████| 28/28 [00:10<00:00,  2.61it/s, v_num=3.41e+7, train_loss_step=0.410, val_loss=0.857, train_loss_epoch=0.586]
Epoch 8:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.410, val_loss=0.857, train_loss_epoch=0.586]         
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 8:   7%|▋         | 2/28 [00:00<00:09,  2.62it/s, v_num=3.41e+7, train_loss_step=0.777, val_loss=0.717, train_loss_epoch=0.600][32m [repeated 14x across cluster][0m
Epoch 8:  14%|█▍        | 4/28 [00:01<00:09,  2.54it/s, v_num=3.41e+7, train_loss_step=0.496, val_loss=0.717, train_loss_epoch=0.600]
Epoch 8:  14%|█▍        | 4/28 [00:01<00:09,  2.54it/s, v_num=3.41e+7, train_loss_step=0.810, val_loss=0.717, train_loss_epoch=0.600]
Epoch 7: 100%|██████████| 28/28 [00:10<00:00,  2.66it/s, v_num=3.41e+7, train_loss_step=0.413, val_loss=0.526, train_loss_epoch=0.620][32m [repeated 5x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.87it/s][A[32m [repeated 8x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.06it/s][A
Epoch 7: 100%|██████████| 28/28 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000008)[32m [repeated 2x across cluster][0m


Epoch 8: 100%|██████████| 28/28 [00:10<00:00,  2.59it/s, v_num=3.41e+7, train_loss_step=0.707, val_loss=0.632, train_loss_epoch=0.592]
Epoch 9:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.707, val_loss=0.632, train_loss_epoch=0.592]         
Epoch 9:   4%|▎         | 1/28 [00:00<00:10,  2.64it/s, v_num=3.41e+7, train_loss_step=0.604, val_loss=0.632, train_loss_epoch=0.592][32m [repeated 18x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 8: 100%|██████████| 28/28 [00:10<00:00,  2.61it/s, v_num=3.41e+7, train_loss_step=0.783, val_loss=0.717, train_loss_epoch=0.600][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.35it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  3.57it/s][A
Epoch 8: 100%|██████████| 28/28 [00:12<00:00,  2.24it/s, v_num=3.41e+7, train_loss_step=0.783, val_loss=0.695, train_loss_epoch=0.600]
Epoch 8: 100%|██████████| 28/28 [00:12<00:00,  2.20it/s, v_num=3.41e+7, train_loss_step=0.783, val_loss=0.695, train_loss_epoch=0.616]
Epoch 9:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.783, val_loss=0.695, train_loss_epoch=0.616]         
Epoch 9:  32%|███▏      | 9/28 [00:03<00:06,  2.85it/s, v_num=3.41e+

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000009)[32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 9: 100%|██████████| 28/28 [00:10<00:00,  2.76it/s, v_num=3.41e+7, train_loss_step=0.480, val_loss=0.695, train_loss_epoch=0.616][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.99it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.17it/s][A
Epoch 9: 100%|██████████| 28/28 [00:11<00:00,  2.40it/s, v_num=3.41e+7, train_loss_step=0.480, val_loss=0.808, train_loss_epoch=0.616]
Epoch 10:  57%|█████▋    | 16/28 [00:04<00:03,  3.27it/s, v_num=3.41e+7, train_loss_step=0.511, val_loss=0.578, train_loss_epoch=0.560][32m [repeated 22x across cluster][0m
Epoch 9: 100%|██████████| 28/28 [00:11<00:00,  2.36it/s, v_num=3.41e+7, train_loss_step=0.480, val_loss=0.808, train_loss_epoch=0.589]
Epoch 10:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.480, val_loss=0.808, train_loss_epoch=0.589]        
Epoch 10:  82%|████████▏ | 23/28 [00:07<00:01,  3.27it/s, v_num=3.

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000010)[32m [repeated 2x across cluster][0m


Epoch 11:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.544, val_loss=0.457, train_loss_epoch=0.522]         
Epoch 11:  14%|█▍        | 4/28 [00:01<00:07,  3.12it/s, v_num=3.41e+7, train_loss_step=0.583, val_loss=0.457, train_loss_epoch=0.522]
Epoch 11:  14%|█▍        | 4/28 [00:01<00:07,  3.12it/s, v_num=3.41e+7, train_loss_step=0.563, val_loss=0.457, train_loss_epoch=0.522]
Epoch 10:  96%|█████████▋| 27/28 [00:09<00:00,  2.85it/s, v_num=3.41e+7, train_loss_step=0.502, val_loss=0.808, train_loss_epoch=0.589][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  4.18it/s][A[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pi

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.34it/s][A
Epoch 10: 100%|██████████| 28/28 [00:11<00:00,  2.50it/s, v_num=3.41e+7, train_loss_step=0.596, val_loss=0.529, train_loss_epoch=0.589]
Epoch 11:  54%|█████▎    | 15/28 [00:04<00:03,  3.34it/s, v_num=3.41e+7, train_loss_step=0.334, val_loss=0.457, train_loss_epoch=0.522][32m [repeated 21x across cluster][0m
Epoch 10: 100%|██████████| 28/28 [00:11<00:00,  2.45it/s, v_num=3.41e+7, train_loss_step=0.596, val_loss=0.529, train_loss_epoch=0.561]
Epoch 11:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.596, val_loss=0.529, train_loss_epoch=0.561]         
Epoch 10: 100%|██████████| 28/28 [00:09<00:00,  2.86it/s, v_num=3.41e+7, train_loss_step=0.596, val_loss=0.808, train_loss_epoch=0.589]
Epoch 11:  93%|█████████▎| 26/28 [00:08<00:00,  3.15it/s, v_num=3.41e+7, train_loss_step=0.646, val_loss=0.457, train_loss_epoch=0.522]
Epoch 11:  96%|█████████▋| 27/28 [00:08<00:00,  3.14it/s, v_num=3.41e+7

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000011)[32m [repeated 2x across cluster][0m


Epoch 12:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.629, val_loss=0.435, train_loss_epoch=0.510]         
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.09it/s][A[32m [repeated 5x across cluster][0m
Epoch 12:  46%|████▋     | 13/28 [00:04<00:04,  3.14it/s, v_num=3.41e+7, train_loss_step=0.591, val_loss=0.435, train_loss_epoch=0.510][32m [repeated 26x across cluster][0m
Epoch 11:  93%|█████████▎| 26/28 [00:10<00:00,  2.59it/s, v_num=3.41e+7, train_loss_step=0.722, val_loss=0.529, train_loss_epoch=0.561]
Epoch 11:  96%|█████████▋| 27/28 [00:10<00:00,  2.59it/s, v_num=3.41e+7, train_loss_step=0.502, val_loss=0.529, train_loss_epoch=0.561]
Epoch 11: 100%|██████████| 28/28 [00:10<00:00,  2.62it/s, v_num=3.41e+7, train_loss_step=0.671, val_loss=0.529, train_loss_epoch=0.561]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000011)


Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.50it/s][A[32m [repeated 5x across cluster][0m
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
Epoch 12:  21%|██▏       | 6/28 [00:02<00:08,  2.67it/s, v_num=3.41e+7, train_loss_step=0.460, val_loss=0.488, train_loss_epoch=0.557][32m [repeated 18x across cluster][0m
[36m(RayTrainWorker pid=2725548)[0m 
Epoch 12: 100%|██████████| 28/28 [00:08<00:00,  3.18it/s, v_num=3.41e+7, train_loss_step=0.493, val_loss=0.435, train_loss_epoch=0.510][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725548)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725548)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000012)


Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.45it/s][A
Epoch 12: 100%|██████████| 28/28 [00:10<00:00,  2.74it/s, v_num=3.41e+7, train_loss_step=0.493, val_loss=0.613, train_loss_epoch=0.510]
Epoch 12: 100%|██████████| 28/28 [00:10<00:00,  2.68it/s, v_num=3.41e+7, train_loss_step=0.493, val_loss=0.613, train_loss_epoch=0.508]
Epoch 13:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.493, val_loss=0.613, train_loss_epoch=0.508]         
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.12it/s][A[32m [repeated 5x across cluster][0m
Epoch 12:  71%|███████▏  | 20/28 [00:07<00:02,  2.73it/s, v_num=3.41e+7, train_loss_step=0.466, val_loss=0.488, train_loss_epoch=0.557][32m [repeated 25x across cluster][0m
Epoch 12:  93%|█████████▎| 26/28 [00:09<00:00,  2.76it/s, v_num=3.41e+7, train_loss_step=0.568, val_loss=0.488, train_loss_epoch=0.557]
Epoch 12:  96%|█████████▋| 27/28 [00:09<00:00,  2.77it/s, v_num=3.41e+7, train_loss_step=0.565,

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000012)


Epoch 13:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.483, val_loss=0.773, train_loss_epoch=0.551]         
Epoch 13:   4%|▎         | 1/28 [00:00<00:10,  2.54it/s, v_num=3.41e+7, train_loss_step=0.502, val_loss=0.773, train_loss_epoch=0.551][32m [repeated 20x across cluster][0m
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 


[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000013)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 13: 100%|██████████| 28/28 [00:08<00:00,  3.15it/s, v_num=3.41e+7, train_loss_step=0.578, val_loss=0.613, train_loss_epoch=0.508][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.06it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.33it/s][A
Epoch 13: 100%|██████████| 28/28 [00:10<00:00,  2.71it/s, v_num=3.41e+7, train_loss_step=0.578, val_loss=0.605, train_loss_epoch=0.508]
Epoch 13: 100%|██████████| 28/28 [00:10<00:00,  2.65it/s, v_num=3.41e+7, train_loss_step=0.578, val_loss=0.605, train_loss_epoch=0.480]
Epoch 14:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.578, val_loss=0.605, train_loss_epoch=0.480]         
Epoch 13:  54%|█████▎    | 15/28 [00:05<00:04,  2.77it/s, v_num=

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000013)


[36m(RayTrainWorker pid=2725548)[0m 
[36m(RayTrainWorker pid=2725548)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000014)


Epoch 14: 100%|██████████| 28/28 [00:08<00:00,  3.21it/s, v_num=3.41e+7, train_loss_step=0.465, val_loss=0.605, train_loss_epoch=0.480][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  4.17it/s][A[32m [repeated 6x across cluster][0m
Epoch 14:  32%|███▏      | 9/28 [00:03<00:07,  2.54it/s, v_num=3.41e+7, train_loss_step=0.498, val_loss=0.681, train_loss_epoch=0.529][32m [repeated 17x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.52it/s][A
Epoch 14: 100%|██████████| 28/28 [00:10<00:00,  2.77it/s, v_num=3.41e+7, train_loss_step=0.465, val_loss=0.382, train_loss_epoch=0.480]
Epoch 14: 100%|██████████| 28/28 [00:10<00:00,  2.70it/s, v_num=3.41e+7, train_loss_step=0.465, val_loss=0.382, train_loss_epoch=0.457]
Epoch 15:   0%|          |

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.26it/s][A
Epoch 15: 100%|██████████| 28/28 [00:10<00:00,  2.58it/s, v_num=3.41e+7, train_loss_step=0.408, val_loss=0.444, train_loss_epoch=0.457]
[36m(RayTrainWorker pid=2725150)[0m 


[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000015)


Epoch 15: 100%|██████████| 28/28 [00:11<00:00,  2.51it/s, v_num=3.41e+7, train_loss_step=0.408, val_loss=0.444, train_loss_epoch=0.432]
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 16:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.408, val_loss=0.444, train_loss_epoch=0.432]         
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 16:   4%|▎         | 1/28 [00:00<00:07,  3.38it/s, v_num=3.41e+7, train_loss_step=0.408, val_loss=0.444, train_loss_epoch=0.432]
Epoch 16:   4%|▎         | 1/28 [00:00<00:08,  3.37it/s, v_num=3.41e+7, train_loss_step=0.382, val_loss=0.444, train_loss_epoch=0.432]
[36m(RayTrainWorker pid=2725150)[0m 


[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000014)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Epoch 15:  11%|█         | 3/28 [00:01<00:09,  2.74it/s, v_num=3.41e+7, train_loss_step=0.400, val_loss=0.420, train_loss_epoch=0.506][32m [repeated 11x across cluster][0m
Epoch 14: 100%|██████████| 28/28 [00:10<00:00,  2.59it/s, v_num=3.41e+7, train_loss_step=0.439, val_loss=0.681, train_loss_epoch=0.529][32m [repeated 4x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.62it/s][A[32m [repeated 7x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  3.92it/s][A
Epoch 14: 100%|██████████| 28/28 [00:12<00:00,  2.26it/s, v_num=3.41e+7, train_loss_step=0.439, val_loss=0.420, train_loss_epoch=0.529]
Epoch 14: 100%|██████████| 28/28 [00:12<00:00,  2.22it/s, v_num=3.41e+7, train_loss_step=0.439, val_loss=0.420, train_loss_epoch=0.506]
Epoch 15:   0%|          |

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000016)


Epoch 16: 100%|██████████| 28/28 [00:11<00:00,  2.52it/s, v_num=3.41e+7, train_loss_step=0.310, val_loss=0.359, train_loss_epoch=0.415]
Epoch 17:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.310, val_loss=0.359, train_loss_epoch=0.415]         
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 17:   7%|▋         | 2/28 [00:00<00:07,  3.41it/s, v_num=3.41e+7, train_loss_step=0.423, val_loss=0.359, train_loss_epoch=0.415][32m [repeated 15x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000015)


Epoch 15: 100%|██████████| 28/28 [00:10<00:00,  2.59it/s, v_num=3.41e+7, train_loss_step=0.477, val_loss=0.420, train_loss_epoch=0.506][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.98it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  4.27it/s][A
Epoch 15: 100%|██████████| 28/28 [00:12<00:00,  2.28it/s, v_num=3.41e+7, train_loss_step=0.477, val_loss=0.490, train_loss_epoch=0.506]
Epoch 15: 100%|██████████| 28/28 [00:12<00:00,  2.25it/s, v_num=3.41e+7, train_loss_step=0.477, val_loss=0.490, train_loss_epoch=0.493]
Epoch 16:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.477, val_loss=0.490, train_loss_epoch=0.493]         
Epoch 17:  64%|██████▍   | 18/28 [00:05<00:03,  3.08it/s, v_num=

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000017)


Epoch 16:  82%|████████▏ | 23/28 [00:08<00:01,  2.60it/s, v_num=3.41e+7, train_loss_step=0.596, val_loss=0.490, train_loss_epoch=0.493][32m [repeated 21x across cluster][0m
Epoch 18:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.433, val_loss=0.347, train_loss_epoch=0.392]         
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
Epoch 16: 100%|██████████| 28/28 [00:10<00:00,  2.67it/s, v_num=3.41e+7, train_loss_step=0.370, val_loss=0.490, train_loss_epoch=0.493][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 
[36m(RayTrainWorker pid=2725150)[0m 


[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000016)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:01<00:00,  3.73it/s][A[32m [repeated 5x across cluster][0m
Validation DataLoader 0: 100%|██████████| 6/6 [00:01<00:00,  3.96it/s][A
Epoch 16: 100%|██████████| 28/28 [00:12<00:00,  2.33it/s, v_num=3.41e+7, train_loss_step=0.370, val_loss=0.465, train_loss_epoch=0.493]
Epoch 16: 100%|██████████| 28/28 [00:12<00:00,  2.29it/s, v_num=3.41e+7, train_loss_step=0.370, val_loss=0.465, train_loss_epoch=0.496]
Epoch 18:  61%|██████    | 17/28 [00:05<00:03,  3.35it/s, v_num=3.41e+7, train_loss_step=0.291, val_loss=0.347, train_loss_epoch=0.392][32m [repeated 23x across cluster][0m
Epoch 17:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.370, val_loss=0.465, train_loss_epoch=0.496]         
Epoch 18:  93%|█████████▎| 26/28 [00:07<00:00,  3.32it/s, v_num

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000018)


Epoch 19:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.245, val_loss=0.342, train_loss_epoch=0.361]         
Epoch 17:  93%|█████████▎| 26/28 [00:09<00:00,  2.74it/s, v_num=3.41e+7, train_loss_step=0.554, val_loss=0.465, train_loss_epoch=0.496]
Epoch 17:  96%|█████████▋| 27/28 [00:09<00:00,  2.73it/s, v_num=3.41e+7, train_loss_step=0.463, val_loss=0.465, train_loss_epoch=0.496]
Epoch 17: 100%|██████████| 28/28 [00:10<00:00,  2.74it/s, v_num=3.41e+7, train_loss_step=0.603, val_loss=0.465, train_loss_epoch=0.496]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  17%|█▋        | 1/6 [00:00<00:01,  4.49it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  4.12it/s][A
[36m(RayTrainWorker pi

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000017)


Epoch 19:  93%|█████████▎| 26/28 [00:08<00:00,  3.09it/s, v_num=3.41e+7, train_loss_step=0.412, val_loss=0.342, train_loss_epoch=0.361]
Epoch 19:  96%|█████████▋| 27/28 [00:08<00:00,  3.08it/s, v_num=3.41e+7, train_loss_step=0.296, val_loss=0.342, train_loss_epoch=0.361]
Epoch 19: 100%|██████████| 28/28 [00:09<00:00,  3.10it/s, v_num=3.41e+7, train_loss_step=0.306, val_loss=0.342, train_loss_epoch=0.361]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0:  17%|█▋        | 1/6 [00:00<00:00,  6.12it/s][A
[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  5.49it/s][A
[36m(RayTrainWorker pid=2725548)[0m 
Validation DataLoader 0:  50%|█████     | 3/6 [00:00<00:00,  5.20it/s][A
Epoch 18:  43%|████▎     | 12/28 [00:04<00:06,  2.67it/s, v_num=3.41e+7, train_loss_s

[36m(RayTrainWorker pid=2725548)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000019)
[36m(RayTrainWorker pid=2725548)[0m `Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 18:  93%|█████████▎| 26/28 [00:08<00:00,  2.98it/s, v_num=3.41e+7, train_loss_step=0.541, val_loss=0.393, train_loss_epoch=0.472]
Epoch 18:  96%|█████████▋| 27/28 [00:09<00:00,  3.00it/s, v_num=3.41e+7, train_loss_step=0.441, val_loss=0.393, train_loss_epoch=0.472]
Epoch 18: 100%|██████████| 28/28 [00:09<00:00,  3.03it/s, v_num=3.41e+7, train_loss_step=0.374, val_loss=0.393, train_loss_epoch=0.472]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  17%|█▋        | 1/6 [00:00<00:00,  5.38it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  4.99it/s][A
Epoch 18:  89%|████████▉ | 25/28 [00:08<00:01,  2.96it/s, v_num=3.41e+7, train_loss_step=0.562, val_loss=0.393, train_loss_epoch=0.472][32m [repeated 13x acro

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000018)


Epoch 19:  54%|█████▎    | 15/28 [00:04<00:03,  3.63it/s, v_num=3.41e+7, train_loss_step=0.529, val_loss=0.402, train_loss_epoch=0.455][32m [repeated 15x across cluster][0m
Epoch 19:  93%|█████████▎| 26/28 [00:07<00:00,  3.61it/s, v_num=3.41e+7, train_loss_step=0.480, val_loss=0.402, train_loss_epoch=0.455]
Epoch 19:  96%|█████████▋| 27/28 [00:07<00:00,  3.61it/s, v_num=3.41e+7, train_loss_step=0.386, val_loss=0.402, train_loss_epoch=0.455]
Epoch 19: 100%|██████████| 28/28 [00:07<00:00,  3.63it/s, v_num=3.41e+7, train_loss_step=0.376, val_loss=0.402, train_loss_epoch=0.455]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  17%|█▋        | 1/6 [00:00<00:00,  5.42it/s][A
[36m(RayTrainWorker pid=2725150)[0m 
Validation DataLoader 0:  33%|███▎      | 2/6 [00:00<00:00,  4.92it/s][A
[36m(RayTrainWorker pi

[36m(RayTrainWorker pid=2725150)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000019)
[36m(RayTrainWorker pid=2725150)[0m `Trainer.fit` stopped: `max_epochs=20` reached.
2025-03-12 00:23:49,043	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26' in 0.0063s.
2025-03-12 00:23:49,048	INFO tune.py:1041 -- Total run time: 262.56 seconds (262.42 seconds for the tuning loop).


In [9]:
results

ResultGrid<[
  Result(
    metrics={'train_loss': 0.4388100206851959, 'train_loss_step': 0.3762468099594116, 'val/rmse': 0.6160438656806946, 'val/mae': 0.45948243141174316, 'val_loss': 0.3795100152492523, 'train_loss_epoch': 0.4388100206851959, 'epoch': 19, 'step': 560},
    path='/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6f0157cc/checkpoint_000019)
  ),
  Result(
    metrics={'train_loss': 0.3525916337966919, 'train_loss_step': 0.30561158061027527, 'val/rmse': 0.538872241973877, 'val/mae': 0.42712679505348206, 'val_loss': 0.29038330912590027, 'train_loss_epoch': 0.3525916337966919, 'epoch': 19, 'step': 560},
    path='/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_202

In [10]:
result_df = results.get_dataframe()
result_df

Unnamed: 0,train_loss,train_loss_step,val/rmse,val/mae,val_loss,train_loss_epoch,epoch,step,timestamp,checkpoint_dir_name,...,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/depth,config/train_loop_config/ffn_hidden_dim,config/train_loop_config/ffn_num_layers,config/train_loop_config/message_hidden_dim,logdir
0,0.43881,0.376247,0.616044,0.459482,0.37951,0.43881,19,560,1741735427,checkpoint_000019,...,2724912,cpusrv45.scidom.de,10.233.0.55,250.919009,20,2,2000,2,500,6f0157cc
1,0.352592,0.305612,0.538872,0.427127,0.290383,0.352592,19,560,1741735412,checkpoint_000019,...,2725149,cpusrv45.scidom.de,10.233.0.55,225.522838,20,2,2200,2,400,6ccfe64d


In [11]:
# best configuration
best_result = results.get_best_result()
best_config = best_result.config
best_config['train_loop_config']

{'depth': 2,
 'ffn_hidden_dim': 2200,
 'ffn_num_layers': 2,
 'message_hidden_dim': 400}

In [12]:
# best model checkpoint path
best_result = results.get_best_result()
best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
print(f"Best model checkpoint path: {best_checkpoint_path}")

Best model checkpoint path: /ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_5ht1b_zinc/ray_results/TorchTrainer_2025-03-12_00-19-26/6ccfe64d/checkpoint_000019/checkpoint.ckpt


In [13]:
mpnn = models.MPNN.load_from_checkpoint(best_checkpoint_path)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=400, bias=False)
    (W_h): Linear(in_features=400, out_features=400, bias=False)
    (W_o): Linear(in_features=472, out_features=400, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=400, out_features=2200, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=2200, out_features=2200, bias=True)
      )
      (2): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=2200, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]

In [14]:
import torch
test_loader = data.build_dataloader(test_dset, shuffle=False)
with torch.inference_mode():
    trainer = pl.Trainer(
        logger=None,
        enable_progress_bar=True,
        accelerator="cpu",
        devices=1
    )
    test_preds = trainer.predict(mpnn, test_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
SLURM auto-requeueing enabled. Setting signal handlers.
/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 4/4 [00:00<00:00, 11.48it/s]


In [15]:
import numpy as np
test_preds = np.concatenate(test_preds, axis=0)
df_test['preds'] = test_preds
df_test

Unnamed: 0,smiles,value,preds
0,COc1ccc(C(=O)Nc2ccccc2C(=O)Nc2ccccn2)cc1,9.8,9.118670
1,CCc1noc(CC)c1CNC(=O)N[C@H]1CCc2c(F)cccc21,9.5,7.903823
2,CC(C)C[C@@H](C[NH2+]C1CC1)C[C@@H]1COc2ccccc21,7.3,8.024309
3,CC[NH+]1CCc2c(cccc2-n2nnc(C(=O)NC3CC3)c2C)C1,6.5,8.062380
4,Cc1ccc(N2C[C@H](C(=O)N3CCC[C@@H](C(=O)[O-])C3)...,9.7,10.343315
...,...,...,...
193,Cc1[nH]ncc1CCCNc1ccc(Br)cc1[N+](=O)[O-],6.9,7.192903
194,C[C@@H](NC(=O)c1ccccc1NC(=O)Cn1ccc([N+](=O)[O-...,10.4,8.987608
195,COc1cccc(-c2cc(C(=O)N3CCC[C@@H]3c3cc(C(C)C)no3...,10.3,8.561325
196,CC(C)c1ccc2c(c1)[C@@H]([NH2+][C@@H]1CCO[C@]3(C...,9.4,8.666800


In [16]:
from sklearn.metrics import r2_score, mean_squared_error

# Get true values (ground truth) and predictions
y_true = df_test['value'].values  # True target values
y_pred = df_test['preds'].values  # Predicted values

# Calculate metrics
r2 = r2_score(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")

R² Score: 0.1920
MSE: 1.3791
