In [1]:
from pathlib import Path

import pandas as pd
from lightning import pytorch as pl
import ray
from ray import tune
from ray.train import CheckpointConfig, RunConfig, ScalingConfig
from ray.train.lightning import (RayDDPStrategy, RayLightningEnvironment,
                                 RayTrainReportCallback, prepare_trainer)
from ray.train.torch import TorchTrainer
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import FIFOScheduler

from chemprop import data, featurizers, models, nn

  from .autonotebook import tqdm as notebook_tqdm
2025-03-11 23:37:40,280	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-11 23:37:40,551	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-11 23:37:40,916	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
chemprop_dir = Path.cwd().parent
train_path = "/home/aih/serra.korkmaz/projects/saturn/SurogateModel/training_data_4sm_zinc/train_data_braf.csv"
test_path = "/home/aih/serra.korkmaz/projects/saturn/SurogateModel/training_data_4sm_zinc/test_data_braf.csv"
num_workers = 0 # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'smiles' # name of the column containing SMILES strings
target_columns = ['value'] # list of names of the columns containing targets

hpopt_save_dir = Path.cwd() / "hpopt_braf_zinc" # directory to save hyperopt results
hpopt_save_dir.mkdir(exist_ok=True)

In [3]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [4]:
# Extract SMILES and target values
train_smis = df_train[smiles_column].values
train_ys = df_train[target_columns].values

test_smis = df_test[smiles_column].values
test_ys = df_test[target_columns].values

# Convert data to MoleculeDatapoint format
train_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(train_smis, train_ys)]
test_data = [data.MoleculeDatapoint.from_smi(smi, y) for smi, y in zip(test_smis, test_ys)]


In [5]:
# Initialize featurizer
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

# Create training dataset
train_dset = data.MoleculeDataset(train_data, featurizer)
scaler = train_dset.normalize_targets()

# Split validation set from training data (80% train, 20% validation)
split_idx = int(len(train_data) * 0.8)
val_data = train_data[split_idx:]
train_data = train_data[:split_idx]

# Create validation and test datasets
val_dset = data.MoleculeDataset(val_data, featurizer)
val_dset.normalize_targets(scaler)

test_dset = data.MoleculeDataset(test_data, featurizer)

In [6]:
def train_model(config, train_dset, val_dset, num_workers, scaler):

    # config is a dictionary containing hyperparameters used for the trial
    depth = int(config["depth"])
    ffn_hidden_dim = int(config["ffn_hidden_dim"])
    ffn_num_layers = int(config["ffn_num_layers"])
    message_hidden_dim = int(config["message_hidden_dim"])

    train_loader = data.build_dataloader(train_dset, num_workers=num_workers, shuffle=True)
    val_loader = data.build_dataloader(val_dset, num_workers=num_workers, shuffle=False)

    mp = nn.BondMessagePassing(d_h=message_hidden_dim, depth=depth)
    agg = nn.MeanAggregation()
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(output_transform=output_transform, input_dim=message_hidden_dim, hidden_dim=ffn_hidden_dim, n_layers=ffn_num_layers)
    batch_norm = True
    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]
    model = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=20, # number of epochs to train for
        # below are needed for Ray and Lightning integration
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
    )

    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)

In [7]:
search_space = {
    "depth": tune.qrandint(lower=2, upper=6, q=1),
    "ffn_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
    "ffn_num_layers": tune.qrandint(lower=1, upper=3, q=1),
    "message_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
}

In [8]:
ray.init()

scheduler = FIFOScheduler()

# Scaling config controls the resources used by Ray
scaling_config = ScalingConfig(
    num_workers=1,
    use_gpu=False, # change to True if you want to use GPU
)

# Checkpoint config controls the checkpointing behavior of Ray
checkpoint_config = CheckpointConfig(
    num_to_keep=1, # number of checkpoints to keep
    checkpoint_score_attribute="val_loss", # Save the checkpoint based on this metric
    checkpoint_score_order="min", # Save the checkpoint with the lowest metric value
)

run_config = RunConfig(
    checkpoint_config=checkpoint_config,
    storage_path=hpopt_save_dir / "ray_results", # directory to save the results
)

ray_trainer = TorchTrainer(
    lambda config: train_model(
        config, train_dset, val_dset, num_workers, scaler
    ),
    scaling_config=scaling_config,
    run_config=run_config,
)

search_alg = HyperOptSearch(
    n_initial_points=1, # number of random evaluations before tree parzen estimators
    random_state_seed=42,
)

# OptunaSearch is another search algorithm that can be used
# search_alg = OptunaSearch()

tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    num_samples=2, # number of trials to run
    scheduler=scheduler,
    search_alg=search_alg,
    trial_dirname_creator=lambda trial: str(trial.trial_id), # shorten filepaths

)

tuner = tune.Tuner(
    ray_trainer,
    param_space={
        "train_loop_config": search_space,
    },
    tune_config=tune_config,
)

# Start the hyperparameter search
results = tuner.fit()

0,1
Current time:,2025-03-11 23:43:30
Running for:,00:04:52.78
Memory:,66.3/753.9 GiB

Trial name,# failures,error file
TorchTrainer_62f3f280,1,/tmp/ray/session_2025-03-11_23-38-09_028142_22001/artifacts/2025-03-11_23-38-37/TorchTrainer_2025-03-11_23-38-37/driver_artifacts/62f3f280/error.txt

Trial name,status,loc,train_loop_config/de pth,train_loop_config/ff n_hidden_dim,train_loop_config/ff n_num_layers,train_loop_config/me ssage_hidden_dim,iter,total time (s),train_loss,train_loss_step,val/rmse
TorchTrainer_1f3ddaf5,TERMINATED,10.233.0.20:28562,2,2200,2,400,20.0,255.221,0.143614,0.147378,0.380723
TorchTrainer_62f3f280,ERROR,10.233.0.20:28295,2,2000,2,500,,,,,


[36m(RayTrainWorker pid=28563)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=28295)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=28295)[0m - (node_id=59ac841c515528e1193ca800fd265e608c3742206adb41f7716efabd, ip=10.233.0.20, pid=28563) world_rank=0, local_rank=0, node_rank=0
[36m(TrainTrainable pid=28562)[0m Trainable.setup took 16.305 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[36m(RayTrainWorker pid=28563)[0m GPU available: False, used: False
[36m(RayTrainWorker pid=28563)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=28563)[0m HPU available: False, using: 0 HPUs


[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffdff9fb9e8a2e7281adf3748201000000 Worker ID: 5ab7ca9fadc39bf72057d058b322a06b85cfe1b81046c6814d21818a Node ID: 59ac841c515528e1193ca800fd265e608c3742206adb41f7716efabd Worker IP address: 10.233.0.20 Worker port: 44471 Worker PID: 28563 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


[36m(TorchTrainer pid=28295)[0m Worker 0 has failed.
2025-03-11 23:39:17,570	ERROR tune_controller.py:1331 -- Trial task failed for trial TorchTrainer_62f3f280
Traceback (most recent call last):
  File "/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/ray/_private/worker.py", line 2771, in get
    values, debugger_breakpoint = worker.get

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  3.39it/s]
Epoch 0:   0%|          | 0/28 [00:00<?, ?it/s]                            
Epoch 0:   4%|▎         | 1/28 [00:00<00:12,  2.23it/s, v_num=3.41e+7, train_loss_step=1.070]
Epoch 0:   7%|▋         | 2/28 [00:00<00:09,  2.83it/s, v_num=3.41e+7, train_loss_step=0.885]
Epoch 0:  11%|█         | 3/28 [00:00<00:08,  3.11it/s, v_num=3.41e+7, train_loss_step=0.574]
Epoch 0:  14%|█▍        | 4/28 [00:01<00:07,  3.20it/s, v_num=3.41e+7, train_loss_step=0.773]
Epoch 0:  18%|█▊        | 5/28 [00:01<00:06,  3.36it/s, v_num=3.41e+7, train_loss_step=0.634]
Epoch 0:  21%|██▏       | 6/28 [00:01<00:06,  3.46it/s, v_num=3.41e+7, train_loss_step=0.339]
Epoch 0:  25%|██▌       | 7/28 [00:02<00:06,  3.48it/s, v_num=3.41e+7, train_loss_step=0.707]
Epoch 0:  29%|██▊       | 8/28 [00:02<00:05,  3.57it/s, v_num=3.41e+7, 

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000000)


Epoch 1:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=1.090, val_loss=0.847, train_loss_epoch=0.598]         
Epoch 1:   4%|▎         | 1/28 [00:00<00:06,  3.99it/s, v_num=3.41e+7, train_loss_step=0.519, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:   7%|▋         | 2/28 [00:00<00:06,  4.14it/s, v_num=3.41e+7, train_loss_step=0.583, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:  11%|█         | 3/28 [00:00<00:06,  4.06it/s, v_num=3.41e+7, train_loss_step=0.785, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:  14%|█▍        | 4/28 [00:00<00:05,  4.11it/s, v_num=3.41e+7, train_loss_step=0.467, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:  18%|█▊        | 5/28 [00:01<00:05,  4.14it/s, v_num=3.41e+7, train_loss_step=0.540, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:  21%|██▏       | 6/28 [00:01<00:05,  4.14it/s, v_num=3.41e+7, train_loss_step=0.333, val_loss=0.847, train_loss_epoch=0.598]
Epoch 1:  25%|██▌       | 7/28 [00:01<00:05,  4.09it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000001)


Epoch 2:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.716, val_loss=0.638, train_loss_epoch=0.471]         
Epoch 2:   4%|▎         | 1/28 [00:00<00:07,  3.65it/s, v_num=3.41e+7, train_loss_step=0.454, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:   7%|▋         | 2/28 [00:00<00:06,  3.94it/s, v_num=3.41e+7, train_loss_step=0.445, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:  11%|█         | 3/28 [00:00<00:06,  4.08it/s, v_num=3.41e+7, train_loss_step=0.372, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:  14%|█▍        | 4/28 [00:00<00:05,  4.09it/s, v_num=3.41e+7, train_loss_step=0.434, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:  18%|█▊        | 5/28 [00:01<00:05,  4.01it/s, v_num=3.41e+7, train_loss_step=0.467, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:  21%|██▏       | 6/28 [00:01<00:05,  4.06it/s, v_num=3.41e+7, train_loss_step=0.517, val_loss=0.638, train_loss_epoch=0.471]
Epoch 2:  25%|██▌       | 7/28 [00:01<00:05,  4.10it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000002)


Epoch 3:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.309, val_loss=0.399, train_loss_epoch=0.418]         
Epoch 3:   4%|▎         | 1/28 [00:00<00:06,  4.35it/s, v_num=3.41e+7, train_loss_step=0.345, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:   7%|▋         | 2/28 [00:00<00:06,  4.29it/s, v_num=3.41e+7, train_loss_step=0.350, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:  11%|█         | 3/28 [00:00<00:05,  4.25it/s, v_num=3.41e+7, train_loss_step=0.250, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:  14%|█▍        | 4/28 [00:00<00:05,  4.17it/s, v_num=3.41e+7, train_loss_step=0.617, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:  18%|█▊        | 5/28 [00:01<00:05,  4.19it/s, v_num=3.41e+7, train_loss_step=0.405, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:  21%|██▏       | 6/28 [00:01<00:05,  4.23it/s, v_num=3.41e+7, train_loss_step=0.363, val_loss=0.399, train_loss_epoch=0.418]
Epoch 3:  25%|██▌       | 7/28 [00:01<00:04,  4.22it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000003)


Epoch 4:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.382, val_loss=0.351, train_loss_epoch=0.367]         
Epoch 4:   4%|▎         | 1/28 [00:00<00:07,  3.62it/s, v_num=3.41e+7, train_loss_step=0.299, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:   7%|▋         | 2/28 [00:00<00:06,  3.93it/s, v_num=3.41e+7, train_loss_step=0.390, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:  11%|█         | 3/28 [00:00<00:06,  4.06it/s, v_num=3.41e+7, train_loss_step=0.434, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:  14%|█▍        | 4/28 [00:00<00:05,  4.06it/s, v_num=3.41e+7, train_loss_step=0.462, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:  18%|█▊        | 5/28 [00:01<00:05,  4.00it/s, v_num=3.41e+7, train_loss_step=0.275, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:  21%|██▏       | 6/28 [00:01<00:05,  4.03it/s, v_num=3.41e+7, train_loss_step=0.306, val_loss=0.351, train_loss_epoch=0.367]
Epoch 4:  25%|██▌       | 7/28 [00:01<00:05,  4.07it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000004)


Epoch 4: 100%|██████████| 28/28 [00:08<00:00,  3.43it/s, v_num=3.41e+7, train_loss_step=0.289, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.289, val_loss=0.635, train_loss_epoch=0.377]         
Epoch 5:   4%|▎         | 1/28 [00:00<00:06,  4.16it/s, v_num=3.41e+7, train_loss_step=0.263, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:   7%|▋         | 2/28 [00:00<00:06,  4.16it/s, v_num=3.41e+7, train_loss_step=0.297, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:  11%|█         | 3/28 [00:00<00:06,  4.10it/s, v_num=3.41e+7, train_loss_step=0.321, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:  14%|█▍        | 4/28 [00:00<00:05,  4.15it/s, v_num=3.41e+7, train_loss_step=0.262, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:  18%|█▊        | 5/28 [00:01<00:05,  4.21it/s, v_num=3.41e+7, train_loss_step=0.315, val_loss=0.635, train_loss_epoch=0.377]
Epoch 5:  21%|██▏       | 6/28 [00:01<00:05,  4.23it/s, v_nu

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000005)


Epoch 6:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.327, val_loss=0.418, train_loss_epoch=0.317]         
Epoch 6:   4%|▎         | 1/28 [00:00<00:06,  3.98it/s, v_num=3.41e+7, train_loss_step=0.192, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:   7%|▋         | 2/28 [00:00<00:06,  3.93it/s, v_num=3.41e+7, train_loss_step=0.473, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:  11%|█         | 3/28 [00:00<00:06,  4.10it/s, v_num=3.41e+7, train_loss_step=0.331, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:  14%|█▍        | 4/28 [00:00<00:05,  4.11it/s, v_num=3.41e+7, train_loss_step=0.399, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:  18%|█▊        | 5/28 [00:01<00:05,  4.15it/s, v_num=3.41e+7, train_loss_step=0.185, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:  21%|██▏       | 6/28 [00:01<00:05,  4.09it/s, v_num=3.41e+7, train_loss_step=0.355, val_loss=0.418, train_loss_epoch=0.317]
Epoch 6:  25%|██▌       | 7/28 [00:01<00:05,  4.12it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000006)


Epoch 7:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.244, val_loss=0.465, train_loss_epoch=0.314]         
Epoch 7:   4%|▎         | 1/28 [00:00<00:06,  4.27it/s, v_num=3.41e+7, train_loss_step=0.357, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:   7%|▋         | 2/28 [00:00<00:06,  4.27it/s, v_num=3.41e+7, train_loss_step=0.383, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:  11%|█         | 3/28 [00:00<00:05,  4.25it/s, v_num=3.41e+7, train_loss_step=0.210, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:  14%|█▍        | 4/28 [00:00<00:05,  4.14it/s, v_num=3.41e+7, train_loss_step=0.364, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:  18%|█▊        | 5/28 [00:01<00:05,  4.15it/s, v_num=3.41e+7, train_loss_step=0.208, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:  21%|██▏       | 6/28 [00:01<00:05,  4.15it/s, v_num=3.41e+7, train_loss_step=0.395, val_loss=0.465, train_loss_epoch=0.314]
Epoch 7:  25%|██▌       | 7/28 [00:01<00:05,  4.13it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000007)


Epoch 8:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.175, val_loss=0.279, train_loss_epoch=0.305]         
Epoch 8:   4%|▎         | 1/28 [00:00<00:07,  3.45it/s, v_num=3.41e+7, train_loss_step=0.204, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:   7%|▋         | 2/28 [00:00<00:07,  3.65it/s, v_num=3.41e+7, train_loss_step=0.205, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:  11%|█         | 3/28 [00:00<00:06,  3.83it/s, v_num=3.41e+7, train_loss_step=0.324, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:  14%|█▍        | 4/28 [00:01<00:06,  3.92it/s, v_num=3.41e+7, train_loss_step=0.221, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:  18%|█▊        | 5/28 [00:01<00:05,  3.97it/s, v_num=3.41e+7, train_loss_step=0.274, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:  21%|██▏       | 6/28 [00:01<00:05,  3.93it/s, v_num=3.41e+7, train_loss_step=0.387, val_loss=0.279, train_loss_epoch=0.305]
Epoch 8:  25%|██▌       | 7/28 [00:01<00:05,  3.98it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000008)


Epoch 9:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.363, val_loss=0.269, train_loss_epoch=0.264]         
Epoch 9:   4%|▎         | 1/28 [00:00<00:06,  4.05it/s, v_num=3.41e+7, train_loss_step=0.209, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:   7%|▋         | 2/28 [00:00<00:06,  4.23it/s, v_num=3.41e+7, train_loss_step=0.274, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:  11%|█         | 3/28 [00:00<00:05,  4.28it/s, v_num=3.41e+7, train_loss_step=0.282, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:  14%|█▍        | 4/28 [00:00<00:05,  4.25it/s, v_num=3.41e+7, train_loss_step=0.370, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:  18%|█▊        | 5/28 [00:01<00:05,  4.17it/s, v_num=3.41e+7, train_loss_step=0.224, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:  21%|██▏       | 6/28 [00:01<00:05,  4.18it/s, v_num=3.41e+7, train_loss_step=0.212, val_loss=0.269, train_loss_epoch=0.264]
Epoch 9:  25%|██▌       | 7/28 [00:01<00:05,  4.17it/s, v_num

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000009)


Epoch 10:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.299, val_loss=0.284, train_loss_epoch=0.270]        
Epoch 10:   4%|▎         | 1/28 [00:00<00:06,  4.25it/s, v_num=3.41e+7, train_loss_step=0.183, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:   7%|▋         | 2/28 [00:00<00:06,  4.28it/s, v_num=3.41e+7, train_loss_step=0.247, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:  11%|█         | 3/28 [00:00<00:06,  4.07it/s, v_num=3.41e+7, train_loss_step=0.161, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:  14%|█▍        | 4/28 [00:00<00:05,  4.16it/s, v_num=3.41e+7, train_loss_step=0.135, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:  18%|█▊        | 5/28 [00:01<00:05,  4.23it/s, v_num=3.41e+7, train_loss_step=0.224, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:  21%|██▏       | 6/28 [00:01<00:05,  4.20it/s, v_num=3.41e+7, train_loss_step=0.227, val_loss=0.284, train_loss_epoch=0.270]
Epoch 10:  25%|██▌       | 7/28 [00:01<00:05,  4.15it/s

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000010)


Epoch 11:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.227, val_loss=0.228, train_loss_epoch=0.246]         
Epoch 11:   4%|▎         | 1/28 [00:00<00:09,  2.91it/s, v_num=3.41e+7, train_loss_step=0.197, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:   7%|▋         | 2/28 [00:00<00:07,  3.40it/s, v_num=3.41e+7, train_loss_step=0.262, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:  11%|█         | 3/28 [00:00<00:07,  3.53it/s, v_num=3.41e+7, train_loss_step=0.207, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:  14%|█▍        | 4/28 [00:01<00:06,  3.72it/s, v_num=3.41e+7, train_loss_step=0.140, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:  18%|█▊        | 5/28 [00:01<00:06,  3.83it/s, v_num=3.41e+7, train_loss_step=0.157, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:  21%|██▏       | 6/28 [00:01<00:05,  3.85it/s, v_num=3.41e+7, train_loss_step=0.322, val_loss=0.228, train_loss_epoch=0.246]
Epoch 11:  25%|██▌       | 7/28 [00:01<00:05,  3.84it/

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000011)


Epoch 12:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.263, val_loss=0.327, train_loss_epoch=0.237]         
Epoch 12:   4%|▎         | 1/28 [00:00<00:06,  4.10it/s, v_num=3.41e+7, train_loss_step=0.137, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:   7%|▋         | 2/28 [00:00<00:06,  4.06it/s, v_num=3.41e+7, train_loss_step=0.182, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:  11%|█         | 3/28 [00:00<00:06,  3.97it/s, v_num=3.41e+7, train_loss_step=0.199, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:  14%|█▍        | 4/28 [00:00<00:05,  4.02it/s, v_num=3.41e+7, train_loss_step=0.223, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:  18%|█▊        | 5/28 [00:01<00:05,  4.07it/s, v_num=3.41e+7, train_loss_step=0.258, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:  21%|██▏       | 6/28 [00:01<00:05,  4.11it/s, v_num=3.41e+7, train_loss_step=0.208, val_loss=0.327, train_loss_epoch=0.237]
Epoch 12:  25%|██▌       | 7/28 [00:01<00:05,  4.08it/

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000012)


Epoch 12: 100%|██████████| 28/28 [00:09<00:00,  2.85it/s, v_num=3.41e+7, train_loss_step=0.191, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.191, val_loss=0.228, train_loss_epoch=0.218]         
Epoch 13:   4%|▎         | 1/28 [00:00<00:08,  3.36it/s, v_num=3.41e+7, train_loss_step=0.143, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:   7%|▋         | 2/28 [00:00<00:07,  3.62it/s, v_num=3.41e+7, train_loss_step=0.226, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:  11%|█         | 3/28 [00:00<00:06,  3.79it/s, v_num=3.41e+7, train_loss_step=0.150, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:  14%|█▍        | 4/28 [00:01<00:06,  3.87it/s, v_num=3.41e+7, train_loss_step=0.171, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:  18%|█▊        | 5/28 [00:01<00:05,  3.92it/s, v_num=3.41e+7, train_loss_step=0.207, val_loss=0.228, train_loss_epoch=0.218]
Epoch 13:  21%|██▏       | 6/28 [00:01<00:05,  3.94it

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000013)


Epoch 14:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.268, val_loss=0.229, train_loss_epoch=0.213]         
Epoch 14:   4%|▎         | 1/28 [00:00<00:07,  3.76it/s, v_num=3.41e+7, train_loss_step=0.268, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:   7%|▋         | 2/28 [00:00<00:06,  3.88it/s, v_num=3.41e+7, train_loss_step=0.150, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:  11%|█         | 3/28 [00:00<00:06,  3.98it/s, v_num=3.41e+7, train_loss_step=0.142, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:  14%|█▍        | 4/28 [00:00<00:05,  4.05it/s, v_num=3.41e+7, train_loss_step=0.175, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:  18%|█▊        | 5/28 [00:01<00:05,  4.08it/s, v_num=3.41e+7, train_loss_step=0.260, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:  21%|██▏       | 6/28 [00:01<00:05,  4.05it/s, v_num=3.41e+7, train_loss_step=0.141, val_loss=0.229, train_loss_epoch=0.213]
Epoch 14:  25%|██▌       | 7/28 [00:01<00:05,  4.10it/

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000014)


Epoch 15:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.205, val_loss=0.176, train_loss_epoch=0.191]         
Epoch 15:   4%|▎         | 1/28 [00:00<00:08,  3.31it/s, v_num=3.41e+7, train_loss_step=0.162, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:   7%|▋         | 2/28 [00:00<00:06,  3.78it/s, v_num=3.41e+7, train_loss_step=0.199, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:  11%|█         | 3/28 [00:00<00:06,  3.94it/s, v_num=3.41e+7, train_loss_step=0.146, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:  14%|█▍        | 4/28 [00:01<00:06,  4.00it/s, v_num=3.41e+7, train_loss_step=0.180, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:  18%|█▊        | 5/28 [00:01<00:05,  3.97it/s, v_num=3.41e+7, train_loss_step=0.155, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:  21%|██▏       | 6/28 [00:01<00:05,  4.01it/s, v_num=3.41e+7, train_loss_step=0.196, val_loss=0.176, train_loss_epoch=0.191]
Epoch 15:  25%|██▌       | 7/28 [00:01<00:05,  4.06it/

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000015)


Epoch 16:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.302, val_loss=0.231, train_loss_epoch=0.185]         
Epoch 16:   4%|▎         | 1/28 [00:00<00:07,  3.55it/s, v_num=3.41e+7, train_loss_step=0.137, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:   7%|▋         | 2/28 [00:00<00:06,  3.82it/s, v_num=3.41e+7, train_loss_step=0.174, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:  11%|█         | 3/28 [00:00<00:06,  3.88it/s, v_num=3.41e+7, train_loss_step=0.198, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:  14%|█▍        | 4/28 [00:01<00:06,  3.86it/s, v_num=3.41e+7, train_loss_step=0.173, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:  18%|█▊        | 5/28 [00:01<00:05,  3.95it/s, v_num=3.41e+7, train_loss_step=0.169, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:  21%|██▏       | 6/28 [00:01<00:05,  4.04it/s, v_num=3.41e+7, train_loss_step=0.186, val_loss=0.231, train_loss_epoch=0.185]
Epoch 16:  25%|██▌       | 7/28 [00:01<00:05,  4.07it/

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000016)


Epoch 17:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.150, val_loss=0.158, train_loss_epoch=0.169]         
Epoch 17:   4%|▎         | 1/28 [00:00<00:06,  4.36it/s, v_num=3.41e+7, train_loss_step=0.180, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:   7%|▋         | 2/28 [00:00<00:06,  4.23it/s, v_num=3.41e+7, train_loss_step=0.141, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:  11%|█         | 3/28 [00:00<00:05,  4.18it/s, v_num=3.41e+7, train_loss_step=0.137, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:  14%|█▍        | 4/28 [00:00<00:05,  4.14it/s, v_num=3.41e+7, train_loss_step=0.167, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:  18%|█▊        | 5/28 [00:01<00:05,  4.19it/s, v_num=3.41e+7, train_loss_step=0.124, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:  21%|██▏       | 6/28 [00:01<00:05,  4.18it/s, v_num=3.41e+7, train_loss_step=0.189, val_loss=0.158, train_loss_epoch=0.169]
Epoch 17:  25%|██▌       | 7/28 [00:01<00:05,  4.17it/

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000017)


Epoch 18:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.096, val_loss=0.158, train_loss_epoch=0.168]         
Epoch 18:   4%|▎         | 1/28 [00:00<00:06,  3.96it/s, v_num=3.41e+7, train_loss_step=0.145, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:   7%|▋         | 2/28 [00:00<00:06,  4.12it/s, v_num=3.41e+7, train_loss_step=0.120, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:  11%|█         | 3/28 [00:00<00:05,  4.17it/s, v_num=3.41e+7, train_loss_step=0.183, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:  14%|█▍        | 4/28 [00:00<00:05,  4.08it/s, v_num=3.41e+7, train_loss_step=0.117, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:  18%|█▊        | 5/28 [00:01<00:05,  4.13it/s, v_num=3.41e+7, train_loss_step=0.144, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:  21%|██▏       | 6/28 [00:01<00:05,  4.16it/s, v_num=3.41e+7, train_loss_step=0.0952, val_loss=0.158, train_loss_epoch=0.168]
Epoch 18:  25%|██▌       | 7/28 [00:01<00:05,  4.15it

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000018)


Epoch 19:   0%|          | 0/28 [00:00<?, ?it/s, v_num=3.41e+7, train_loss_step=0.160, val_loss=0.145, train_loss_epoch=0.155]         
Epoch 19:   4%|▎         | 1/28 [00:00<00:07,  3.83it/s, v_num=3.41e+7, train_loss_step=0.149, val_loss=0.145, train_loss_epoch=0.155]
Epoch 19:   7%|▋         | 2/28 [00:00<00:06,  3.98it/s, v_num=3.41e+7, train_loss_step=0.123, val_loss=0.145, train_loss_epoch=0.155]
Epoch 19:  11%|█         | 3/28 [00:00<00:06,  3.94it/s, v_num=3.41e+7, train_loss_step=0.0965, val_loss=0.145, train_loss_epoch=0.155]
Epoch 19:  14%|█▍        | 4/28 [00:00<00:05,  4.04it/s, v_num=3.41e+7, train_loss_step=0.125, val_loss=0.145, train_loss_epoch=0.155] 
Epoch 19:  18%|█▊        | 5/28 [00:01<00:05,  4.10it/s, v_num=3.41e+7, train_loss_step=0.186, val_loss=0.145, train_loss_epoch=0.155]
Epoch 19:  21%|██▏       | 6/28 [00:01<00:05,  4.12it/s, v_num=3.41e+7, train_loss_step=0.149, val_loss=0.145, train_loss_epoch=0.155]
Epoch 19:  25%|██▌       | 7/28 [00:01<00:05,  4.07i

[36m(RayTrainWorker pid=29191)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000019)


Epoch 19: 100%|██████████| 28/28 [00:09<00:00,  2.92it/s, v_num=3.41e+7, train_loss_step=0.147, val_loss=0.145, train_loss_epoch=0.144]


[36m(RayTrainWorker pid=29191)[0m `Trainer.fit` stopped: `max_epochs=20` reached.
2025-03-11 23:43:30,298	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37' in 0.0067s.
2025-03-11 23:43:30,311	ERROR tune.py:1037 -- Trials did not complete: [TorchTrainer_62f3f280]
2025-03-11 23:43:30,311	INFO tune.py:1041 -- Total run time: 292.85 seconds (292.77 seconds for the tuning loop).


In [9]:
results

ResultGrid<[
  Result(
    error='RayTaskError(ActorDiedError)',
    metrics={},
    path='/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/62f3f280',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'train_loss': 0.14361390471458435, 'train_loss_step': 0.147378072142601, 'val/rmse': 0.38072335720062256, 'val/mae': 0.2952577471733093, 'val_loss': 0.1449502855539322, 'train_loss_epoch': 0.14361390471458435, 'epoch': 19, 'step': 560},
    path='/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5',
    filesystem='local',
    checkpoint=Checkpoint(filesystem=local, path=/ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000019)
  )
]>

In [10]:
result_df = results.get_dataframe()
result_df

Unnamed: 0,train_loss,train_loss_step,val/rmse,val/mae,val_loss,train_loss_epoch,epoch,step,timestamp,checkpoint_dir_name,...,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/train_loop_config/depth,config/train_loop_config/ffn_hidden_dim,config/train_loop_config/ffn_num_layers,config/train_loop_config/message_hidden_dim,logdir
0,0.143614,0.147378,0.380723,0.295258,0.14495,0.143614,19,560,1741733007,checkpoint_000019,...,28562,cpusrv10.scidom.de,10.233.0.20,255.221482,20,2,2200,2,400,1f3ddaf5


In [11]:
# best configuration
best_result = results.get_best_result()
best_config = best_result.config
best_config['train_loop_config']

{'depth': 2,
 'ffn_hidden_dim': 2200,
 'ffn_num_layers': 2,
 'message_hidden_dim': 400}

In [12]:
# best model checkpoint path
best_result = results.get_best_result()
best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
print(f"Best model checkpoint path: {best_checkpoint_path}")

Best model checkpoint path: /ictstr01/home/aih/serra.korkmaz/projects/saturn/hopt/hpopt_braf_zinc/ray_results/TorchTrainer_2025-03-11_23-38-37/1f3ddaf5/checkpoint_000019/checkpoint.ckpt


In [13]:
mpnn = models.MPNN.load_from_checkpoint(best_checkpoint_path)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=400, bias=False)
    (W_h): Linear(in_features=400, out_features=400, bias=False)
    (W_o): Linear(in_features=472, out_features=400, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=400, out_features=2200, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=2200, out_features=2200, bias=True)
      )
      (2): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=2200, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]

In [14]:
import torch
test_loader = data.build_dataloader(test_dset, shuffle=False)
with torch.inference_mode():
    trainer = pl.Trainer(
        logger=None,
        enable_progress_bar=True,
        accelerator="cpu",
        devices=1
    )
    test_preds = trainer.predict(mpnn, test_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
SLURM auto-requeueing enabled. Setting signal handlers.
/home/aih/serra.korkmaz/miniconda3/envs/saturn/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 4/4 [00:00<00:00, 12.52it/s]


In [15]:
import numpy as np
test_preds = np.concatenate(test_preds, axis=0)
df_test['preds'] = test_preds
df_test

Unnamed: 0,smiles,value,preds
0,O=C(/C=C/c1cccnc1)Nc1ccc(C(=O)Nc2nccs2)cc1,9.2,8.836904
1,Cc1ccc(-n2nnnc2N2CCN(c3ccc(O)cc3)CC2)cc1,9.7,8.810091
2,CCCN(CCC)C(=O)c1ccc2c(c1)[C@H]1C=CC[C@H]1[C@H]...,8.7,7.913101
3,O=C(OC[C@@H](O)CO)c1ccccc1Nc1cc[nH+]c2cc(Cl)ccc12,8.3,8.283792
4,Cn1cc(CCC(=O)N2CCC[C@H](n3cncn3)C2)cn1,7.1,7.189588
...,...,...,...
191,CN1CCO[C@H](CN(C)c2cccc(/C(N)=N/O)c2)C1,8.3,7.679236
192,Cc1ccc(C(=O)CCC(=O)Nc2cc(Cl)c(O)c(Cl)c2)cc1,8.9,8.762691
193,O=C1CCCN1CC[NH+]1CCC[C@H](c2ccccc2)CC1,7.7,8.115525
194,COC(=O)c1c(N)c2cc(F)ccc2n1C,6.9,6.939610


In [16]:
from sklearn.metrics import r2_score, mean_squared_error

# Get true values (ground truth) and predictions
y_true = df_test['value'].values  # True target values
y_pred = df_test['preds'].values  # Predicted values

# Calculate metrics
r2 = r2_score(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")

R² Score: 0.6602
MSE: 0.4056
