In [106]:
import ray
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig,RunConfig,CheckpointConfig
import torch.nn as nn
import torch
from torchvision.datasets import CIFAR10
from torchvision.models import resnet18 , ResNet18_Weights
from torchvision.models import VisionTransformer
from torchvision.transforms import ToTensor, Compose,Normalize
from torch.utils.data import Subset,DataLoader
import matplotlib.pyplot as plt
from torchmetrics.classification import Accuracy
from PIL import Image
from filelock import FileLock
from pathlib import Path

import tempfile
import os
import uuid

In [6]:
data = CIFAR10(root="../marimo_notebooks/data",download=True,train=False)
data

Dataset CIFAR10
    Number of datapoints: 10000
    Root location: ../marimo_notebooks/data
    Split: Test

In [7]:
# the data contains of a PIL image and the label
next(iter(data))

(<PIL.Image.Image image mode=RGB size=32x32>, 3)

In [8]:
class_to_idx = data.class_to_idx
class_to_idx

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [9]:
weights = ResNet18_Weights.IMAGENET1K_V1
weights.transforms()

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [114]:
# model = resnet18(weights=ResNet18_Weights)
# model

In [107]:
def get_cifar_dataloader(batch_size):
    #imagenet_transforms = ResNet18_Weights.IMAGENET1K_V1.transforms
    #full_transform = imagenet_transforms()
    full_transform = Compose([ToTensor(), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    with FileLock(os.path.expanduser("~/cifar_data.lock")):
        train = CIFAR10(
            root="~/cifar_data",
            train=True,
            download=True,
            transform=full_transform,
        )
        valid = CIFAR10(
            root="~/cifar_data",
            train=False,
            download=True,
            transform=full_transform,
        )
    train_sub = Subset(train,indices=range(1000))
    valid_sub = Subset(valid,indices=range(1000))
    # dataloaders to get data in batches
    train_dataloader = DataLoader(train_sub, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_sub, batch_size=batch_size)

    return train_dataloader, valid_dataloader



In [108]:
sample_dataloader = next(iter(get_cifar_dataloader(3)))
single_batch =  next(iter(sample_dataloader))

In [109]:
single_batch[1].shape

torch.Size([3])

In [112]:
single_batch[0].shape

torch.Size([3, 3, 32, 32])

In [119]:
def train_func(config):

    epochs = config["epochs"]
    batch_size = config["batch_size"]
    lr = config["lr"]
    
    
    # use detected device
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

    # metrics
    accuracy = Accuracy(task="multiclass", num_classes=config["num_classes"]).to(device)
    
    device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #weights = ResNet18_Weights.IMAGENET1K_V1
    # model = resnet18(weights=weights)
    # model = VisionTransformer(
    #     image_size=32,   # CIFAR-10 image size is 32x32
    #     patch_size=4,    # Patch size is 4x4
    #     num_layers=3,   # Number of transformer layers
    #     num_heads=2,     # Number of attention heads
    #     hidden_dim=128,  # Hidden size (can be adjusted)
    #     mlp_dim=64,     # MLP dimension (can be adjusted)
    #     num_classes=10   # CIFAR-10 has 10 classes
    # )
    model = VisionTransformer(
        image_size=32,   # CIFAR-10 image size is 32x32
        patch_size=4,    # Patch size is 4x4
        num_layers=12,   # Number of transformer layers
        num_heads=8,     # Number of attention heads
        hidden_dim=384,  # Hidden size (can be adjusted)
        mlp_dim=768,     # MLP dimension (can be adjusted)
        num_classes=10   # CIFAR-10 has 10 classes
    )
    #model = SimpleModel(in_channels=3,hidden_features=128,out_features=10)
    # for parameter in model.parameters():
    #     parameter.requires_grad = False
    # model.fc = nn.Linear(512,config["num_classes"],bias=True)
    
    model = ray.train.torch.prepare_model(model)
 
    loss_fn = nn.CrossEntropyLoss()
    #optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])

    train_dataloader, valid_dataloader = get_cifar_dataloader(batch_size=batch_size)
    train_dataloader = ray.train.torch.prepare_data_loader(train_dataloader)
    valid_dataloader = ray.train.torch.prepare_data_loader(valid_dataloader)

    for epoch in range(epochs):
        # checking if training is scheduled in a distributed setting or not.
        if ray.train.get_context().get_world_size() > 1:
            train_dataloader.sampler.set_epoch(epoch)
        train_loss = 0.0
        train_acc = 0.0
        num_total = 0.0
        num_correct = 0.0
        #num_batches = 0.0
        model.train()
        for idx, batch in enumerate(train_dataloader):
            x, y = batch[0], batch[1]
            y_preds = model(x)
            y_labels = y_preds.argmax(dim=1)
            loss = loss_fn(y_preds,y)
            acc = accuracy(y_labels,y)
            train_loss +=  loss.item()
            train_acc += acc.item()
            num_total += y.shape[0]
            num_correct += (y_labels == y).sum().item()
        train_loss /=len(train_dataloader)
        train_acc /=len(train_dataloader)
        check_acc = num_correct / num_total
        metrics = {"epoch":epoch,"train_loss":train_loss, "train_acc":train_acc,"check_acc": check_acc}

        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            torch.save(
                model.module.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pt")
            )
            ray.train.report(
                metrics,
                checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir),
            )

        if ray.train.get_context().get_world_rank() == 0:
            print(metrics)
        


In [120]:
global_batch_size = 300
num_workers = 4
use_gpu = True

train_config = {
    "lr": 0.01,
    "epochs": 50,
    "num_classes": 10,
    "batch_size": global_batch_size // num_workers,
    "weight_decay": 0.02
}
scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
run_config = RunConfig(
    #storage_path=str(Path("../marimo_notebooks/data/storage_path").resolve()), 
    storage_path="/mnt/cluster_storage",
    name=f"ray_train_torch_run-{uuid.uuid4().hex}",
    checkpoint_config = CheckpointConfig(num_to_keep=1,
    checkpoint_score_attribute="train_acc",
    checkpoint_score_order="max",) 
)

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=train_config,
    scaling_config=scaling_config,
    run_config=run_config,
)

In [121]:
result = trainer.fit()
print(f"Training result: {result}")

2025-08-02 14:50:06,334	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2025-08-02 14:50:06 (running for 00:00:00.11)
Using FIFO scheduling algorithm.
Logical resource usage: 0/0 CPUs, 0/0 GPUs (0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/region:us-east-2, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/provider:aws)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-08-02 14:50:11 (running for 00:00:05.13)
Using FIFO scheduling algorithm.
Logical resource usage: 0/0 CPUs, 0/0 GPUs (0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/region:us-east-2, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/provider:aws)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 PENDING)


[36m(autoscaler +1h33m1s)[0m [auto



== Status ==
Current time: 2025-08-02 14:51:06 (running for 00:01:00.37)
Using FIFO scheduling algorithm.
Logical resource usage: 0/16 CPUs, 0/2 GPUs (0.0/2.0 anyscale/accelerator_shape:1xT4, 0.0/3.0 anyscale/region:us-east-2, 0.0/3.0 anyscale/provider:aws, 0.0/2.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/2.0 accelerator_type:T4, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-08-02 14:51:11 (running for 00:01:05.39)
Using FIFO scheduling algorithm.
Logical resource usage: 0/16 CPUs, 0/2 GPUs (0.0/2.0 anyscale/accelerator_shape:1xT4, 0.0/3.0 anyscale/region:us-east-2, 0.0/3.0 anyscale/provider:aws, 0.0/2.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/2.0 accelerator_type:T4, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result



== Status ==
Current time: 2025-08-02 14:52:06 (running for 00:02:00.63)
Using FIFO scheduling algorithm.
Logical resource usage: 0/32 CPUs, 0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-08-02 14:52:11 (running for 00:02:05.65)
Using FIFO scheduling algorithm.
Logical resource usage: 0/32 CPUs, 0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result

[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m Setting up process group for: env:// [rank=0, world_size=4]
[36m(TorchTrainer pid=2955, ip=100.86.214.127)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=2955, ip=100.86.214.127)[0m - (node_id=167ec187d72cd6790239e350d5242986d9423df372cf8e81a1fb1e2d, ip=100.86.214.127, pid=3036) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=2955, ip=100.86.214.127)[0m - (node_id=0801d83110f8c752cf99adf9e60508702179960a8c89c679bd3e0c22, ip=100.103.56.113, pid=2924) world_rank=1, local_rank=0, node_rank=1
[36m(TorchTrainer pid=2955, ip=100.86.214.127)[0m - (node_id=ce9e615b8df1abbe9ae8a87dac30e3f3e594896615f15a80e8e2513e, ip=100.95.108.125, pid=2980) world_rank=2, local_rank=0, node_rank=2
[36m(TorchTrainer pid=2955, ip=100.86.214.127)[0m - (node_id=b73bae5408d674329b4850702b54c452279597f1bc867f3e02780564, ip=100.102.153.87, pid=3037) world_rank=3, local_rank=0, node_rank=3


== Status ==
Current time: 2025-08-02 14:52:47 (running for 00:02:40.86)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/provider:aws, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m Moving model to device: cuda:0
[36m(RayTrainWorker pid=2924, ip=100.103.56.113)[0m Wrapping provided model in DistributedDataParallel.
  0%|          | 0.00/170M [00:00<?, ?B/s]56.113)[0m 
  0%|          | 328k/170M [00:00<00:57, 2.99MB/s][0m 


== Status ==
Current time: 2025-08-02 14:52:52 (running for 00:02:45.88)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/provider:aws, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=3037, ip=100.102.153.87)[0m Moving model to device: cuda:0[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=2980, ip=100.95.108.125)[0m Wrapping provided model in DistributedDataParallel.[32m [repeated 3x across cluster][0m
  0%|          | 0.00/170M [00:00<?, ?B/s][32m [repeated 3x across cluster][0m
 52%|█████▏    | 89.4M/170M [00:05<00:04, 16.3MB/s][32m [repeated 197x across cluster][0m


== Status ==
Current time: 2025-08-02 14:52:57 (running for 00:02:50.90)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




 92%|█████████▏| 157M/170M [00:08<00:00, 18.3MB/s][0m 
 92%|█████████▏| 157M/170M [00:08<00:00, 18.4MB/s][0m 
 92%|█████████▏| 157M/170M [00:08<00:00, 18.5MB/s][0m 
 92%|█████████▏| 157M/170M [00:08<00:00, 18.5MB/s][0m 
 93%|█████████▎| 159M/170M [00:08<00:00, 18.6MB/s][0m 
 93%|█████████▎| 159M/170M [00:08<00:00, 18.7MB/s][0m 
 93%|█████████▎| 159M/170M [00:08<00:00, 18.7MB/s][0m 
 93%|█████████▎| 159M/170M [00:08<00:00, 18.7MB/s][0m 
 94%|█████████▍| 161M/170M [00:08<00:00, 18.7MB/s][0m 
 94%|█████████▍| 161M/170M [00:08<00:00, 18.7MB/s][0m 
 94%|█████████▍| 161M/170M [00:08<00:00, 18.7MB/s][0m 
 94%|█████████▍| 161M/170M [00:08<00:00, 18.8MB/s][0m 
 96%|█████████▌| 163M/170M [00:08<00:00, 18.8MB/s][0m 
 95%|█████████▌| 163M/170M [00:09<00:00, 18.8MB/s][0m 
 96%|█████████▌| 163M/170M [00:09<00:00, 18.9MB/s][0m 
 95%|█████████▌| 163M/170M [00:09<00:00, 18.9MB/s][0m 
 97%|█████████▋| 165M/170M [00:09<00:00, 18.9MB/s][0m 
 97%|█████████▋| 165M/170M [00:09<00:00, 19.0MB/s][0m 
 97%|█████

== Status ==
Current time: 2025-08-02 14:53:02 (running for 00:02:55.91)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=2980, ip=100.95.108.125)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000000)
 91%|█████████ | 155M/170M [00:08<00:00, 18.2MB/s][32m [repeated 138x across cluster][0m


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 0, 'train_loss': 2.302584648132324, 'train_acc': 0.08000000193715096, 'check_acc': 0.088}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 1, 'train_loss': 2.302584648132324, 'train_acc': 0.08666666597127914, 'check_acc': 0.096}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 2, 'train_loss': 2.302584648132324, 'train_acc': 0.08000000193715096, 'check_acc': 0.088}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 3, 'train_loss': 2.302584648132324, 'train_acc': 0.10333333350718021, 'check_acc': 0.1}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 4, 'train_loss': 2.302584648132324, 'train_acc': 0.0800000000745058, 'check_acc': 0.08}
== Status ==
Current time: 2025-08-02 14:53:07 (running for 00:03:00.92)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/provider:aws, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 5, 'train_loss': 2.302584648132324, 'train_acc': 0.09666666574776173, 'check_acc': 0.1}


[36m(RayTrainWorker pid=3037, ip=100.102.153.87)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000006)[32m [repeated 24x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 6, 'train_loss': 2.302584648132324, 'train_acc': 0.09999999776482582, 'check_acc': 0.112}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 7, 'train_loss': 2.302584648132324, 'train_acc': 0.08333333395421505, 'check_acc': 0.092}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 8, 'train_loss': 2.302584648132324, 'train_acc': 0.08999999985098839, 'check_acc': 0.1}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 9, 'train_loss': 2.302584648132324, 'train_acc': 0.10333333536982536, 'check_acc': 0.1}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2025-08-02 14:53:12 (running for 00:03:05.93)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/provider:aws, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 10, 'train_loss': 2.302584648132324, 'train_acc': 0.09999999776482582, 'check_acc': 0.096}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 11, 'train_loss': 2.302584648132324, 'train_acc': 0.1133333332836628, 'check_acc': 0.112}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000012)[32m [repeated 24x across cluster][0m


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 12, 'train_loss': 2.302584648132324, 'train_acc': 0.07999999821186066, 'check_acc': 0.088}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 13, 'train_loss': 2.302584648132324, 'train_acc': 0.10666666738688946, 'check_acc': 0.112}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 14, 'train_loss': 2.302584648132324, 'train_acc': 0.07666666805744171, 'check_acc': 0.084}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 15, 'train_loss': 2.302584648132324, 'train_acc': 0.08999999985098839, 'check_acc': 0.084}
== Status ==
Current time: 2025-08-02 14:53:17 (running for 00:03:11.00)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 16, 'train_loss': 2.302584648132324, 'train_acc': 0.13333333283662796, 'check_acc': 0.12}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 17, 'train_loss': 2.302584648132324, 'train_acc': 0.09333333373069763, 'check_acc': 0.096}


[36m(RayTrainWorker pid=2924, ip=100.103.56.113)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000018)[32m [repeated 24x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 18, 'train_loss': 2.302584648132324, 'train_acc': 0.1133333332836628, 'check_acc': 0.112}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 19, 'train_loss': 2.302584648132324, 'train_acc': 0.086666664108634, 'check_acc': 0.096}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 20, 'train_loss': 2.302584648132324, 'train_acc': 0.11999999731779099, 'check_acc': 0.12}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2025-08-02 14:53:22 (running for 00:03:16.12)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 21, 'train_loss': 2.302584648132324, 'train_acc': 0.12333333119750023, 'check_acc': 0.116}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 22, 'train_loss': 2.302584648132324, 'train_acc': 0.10666666552424431, 'check_acc': 0.104}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 23, 'train_loss': 2.302584648132324, 'train_acc': 0.1333333309739828, 'check_acc': 0.112}


[36m(RayTrainWorker pid=2924, ip=100.103.56.113)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000023)[32m [repeated 23x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 24, 'train_loss': 2.302584648132324, 'train_acc': 0.12666666507720947, 'check_acc': 0.128}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 25, 'train_loss': 2.302584648132324, 'train_acc': 0.07333333417773247, 'check_acc': 0.08}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


== Status ==
Current time: 2025-08-02 14:53:27 (running for 00:03:21.15)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 26, 'train_loss': 2.302584648132324, 'train_acc': 0.10000000149011612, 'check_acc': 0.096}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 27, 'train_loss': 2.302584648132324, 'train_acc': 0.13333333283662796, 'check_acc': 0.128}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 28, 'train_loss': 2.302584648132324, 'train_acc': 0.06333333440124989, 'check_acc': 0.068}


[36m(RayTrainWorker pid=2980, ip=100.95.108.125)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000029)[32m [repeated 21x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 29, 'train_loss': 2.302584648132324, 'train_acc': 0.09333333186805248, 'check_acc': 0.096}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 30, 'train_loss': 2.302584648132324, 'train_acc': 0.12666666507720947, 'check_acc': 0.12}
== Status ==
Current time: 2025-08-02 14:53:32 (running for 00:03:26.16)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/provider:aws, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 31, 'train_loss': 2.302584648132324, 'train_acc': 0.11999999731779099, 'check_acc': 0.12}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 32, 'train_loss': 2.302584648132324, 'train_acc': 0.12333333119750023, 'check_acc': 0.124}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 33, 'train_loss': 2.302584648132324, 'train_acc': 0.07666666619479656, 'check_acc': 0.076}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 34, 'train_loss': 2.302584648132324, 'train_acc': 0.07666666619479656, 'check_acc': 0.076}


[36m(RayTrainWorker pid=2924, ip=100.103.56.113)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000035)[32m [repeated 24x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 35, 'train_loss': 2.302584648132324, 'train_acc': 0.08666666597127914, 'check_acc': 0.096}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 36, 'train_loss': 2.302584648132324, 'train_acc': 0.10000000149011612, 'check_acc': 0.096}
== Status ==
Current time: 2025-08-02 14:53:37 (running for 00:03:31.18)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/provider:aws, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 37, 'train_loss': 2.302584648132324, 'train_acc': 0.1133333332836628, 'check_acc': 0.12}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 38, 'train_loss': 2.302584648132324, 'train_acc': 0.09666666761040688, 'check_acc': 0.092}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 39, 'train_loss': 2.302584648132324, 'train_acc': 0.09999999776482582, 'check_acc': 0.096}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 40, 'train_loss': 2.302584648132324, 'train_acc': 0.08333333395421505, 'check_acc': 0.092}


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000041)[32m [repeated 24x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 41, 'train_loss': 2.302584648132324, 'train_acc': 0.12000000104308128, 'check_acc': 0.12}
== Status ==
Current time: 2025-08-02 14:53:42 (running for 00:03:36.25)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/provider:aws, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/4.0 accelerator_type:T4, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/1.0 anyscale/cpu_only:true, 0.0/1.0 anyscale/node-group:head)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 42, 'train_loss': 2.302584648132324, 'train_acc': 0.08666666597127914, 'check_acc': 0.088}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 43, 'train_loss': 2.302584648132324, 'train_acc': 0.06666666641831398, 'check_acc': 0.072}
[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 44, 'train_loss': 2.302584648132324, 'train_acc': 0.09333333745598793, 'check_acc': 0.104}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 45, 'train_loss': 2.302584648132324, 'train_acc': 0.12333333119750023, 'check_acc': 0.132}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 46, 'train_loss': 2.302584648132324, 'train_acc': 0.07000000029802322, 'check_acc': 0.076}
== Status ==
Current time: 2025-08-02 14:53:47 (running for 00:03:41.31)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/provider:aws, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 47, 'train_loss': 2.302584648132324, 'train_acc': 0.06333333346992731, 'check_acc': 0.068}


[36m(RayTrainWorker pid=3037, ip=100.102.153.87)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06/checkpoint_000047)[32m [repeated 24x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 48, 'train_loss': 2.302584648132324, 'train_acc': 0.08333333395421505, 'check_acc': 0.076}


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.


[36m(RayTrainWorker pid=3036, ip=100.86.214.127)[0m {'epoch': 49, 'train_loss': 2.302584648132324, 'train_acc': 0.07333333324640989, 'check_acc': 0.072}


2025-08-02 14:53:51,377	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c' in 0.1863s.
2025-08-02 14:53:51,379	INFO tune.py:1041 -- Total run time: 225.05 seconds (224.84 seconds for the tuning loop).


== Status ==
Current time: 2025-08-02 14:53:51 (running for 00:03:45.03)
Using FIFO scheduling algorithm.
Logical resource usage: 1.0/32 CPUs, 4.0/4 GPUs (0.0/5.0 anyscale/provider:aws, 0.0/4.0 accelerator_type:T4, 0.0/4.0 anyscale/node-group:1xT4:8CPU-32GB, 0.0/5.0 anyscale/region:us-east-2, 0.0/4.0 anyscale/accelerator_shape:1xT4, 0.0/1.0 anyscale/node-group:head, 0.0/1.0 anyscale/cpu_only:true)
Result logdir: /tmp/ray/session_2025-08-02_12-18-15_816680_2464/artifacts/2025-08-02_14-50-06/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)


Training result: Result(
  metrics={'epoch': 49, 'train_loss': 2.302584648132324, 'train_acc': 0.07333333324640989, 'check_acc': 0.072},
  path='/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87f1ff86cb1a720c/TorchTrainer_faa35_00000_0_2025-08-02_14-50-06',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/mnt/cluster_storage/ray_train_torch_run-e68db446183641bd87

In [None]:
ray.shutdown()