In [None]:
# Copyright  2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## PyTorch training with Ray Train on a local Ray cluster

In [14]:
# Print the version of the installed CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [15]:
# Print the NVIDIA driver status table (driver/CUDA version and per-GPU usage).
!nvidia-smi

Mon Feb 17 00:58:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      On  |   00000000:00:03.0 Off |                    0 |
| N/A   76C    P0             34W /   72W |     301MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA L4                      On  |   00

### Configuration

In [1]:
! pip install --user -q "google-cloud-aiplatform[ray]>=1.56.0" \
                        "ray[data,train,tune,serve]>=2.9.3"

In [2]:
import ray
import ray.train.torch

import pandas as pd

# __torch_setup_begin__
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Last expression of the cell: display the installed Ray and pandas versions.
ray.__version__, pd.__version__


('2.9.3', '2.1.4')

In [3]:
def get_dataset():
    """Build the FashionMNIST training dataset used by both training loops.

    The dataset is rooted at ``/tmp/data``, requested with ``train=True`` and
    ``download=True``, and every sample is passed through ``ToTensor()``.
    """
    dataset_options = {
        "root": "/tmp/data",
        "train": True,
        "download": True,
        "transform": ToTensor(),
    }
    return datasets.FashionMNIST(**dataset_options)

In [4]:
class NeuralNetwork(nn.Module):
    """Small fully connected classifier.

    Inputs are flattened, then passed through two 512-unit ReLU layers and a
    final linear layer that emits 10 logits. The first linear layer expects
    28 * 28 = 784 features per sample.
    """

    def __init__(self):
        super().__init__()
        in_features, hidden_units, num_classes = 28 * 28, 512, 10

        self.flatten = nn.Flatten()
        # Layers are created in the same order as they are applied.
        layers = [
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, num_classes),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, inputs):
        """Flatten ``inputs`` and return the un-normalized class logits."""
        return self.linear_relu_stack(self.flatten(inputs))

In [5]:
# __torch_single_begin__
def train_func(num_epochs=3, batch_size=64, lr=0.01):
    """Train ``NeuralNetwork`` on the ``get_dataset()`` data in this process.

    This is the single-process baseline for ``train_func_distributed``.

    Args:
        num_epochs: Number of passes over the dataset.
        batch_size: Number of samples per mini-batch.
        lr: Learning rate given to the SGD optimizer.

    After every epoch the loss of that epoch's last mini-batch is printed.
    """
    dataset = get_dataset()
    # Shuffle every epoch, consistent with train_func_distributed; without it
    # SGD would see the samples in the same fixed order on every pass.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = NeuralNetwork()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            pred = model(inputs)
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
        # `loss` is the value from the final mini-batch of this epoch.
        print(f"epoch: {epoch}, loss: {loss.item()}")
# __torch_single_end__

In [6]:
# __torch_distributed_begin__
import ray.train.torch

def train_func_distributed():
    """Training loop handed to ``TorchTrainer`` in the driver cell below.

    Mirrors ``train_func`` but passes the DataLoader and the model through
    ``ray.train.torch.prepare_data_loader`` / ``prepare_model`` before use.
    Runs 4 epochs with a batch size of 64 and prints the last mini-batch loss
    after each epoch.
    """
    num_epochs, batch_size = 4, 64

    loader = ray.train.torch.prepare_data_loader(
        DataLoader(get_dataset(), batch_size=batch_size, shuffle=True)
    )
    net = ray.train.torch.prepare_model(NeuralNetwork())

    loss_fn = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        # With more than one worker, tell the sampler which epoch this is.
        # NOTE(review): presumably the sampler is a DistributedSampler set up
        # by prepare_data_loader, which reshuffles per epoch -- confirm.
        multiple_workers = ray.train.get_context().get_world_size() > 1
        if multiple_workers:
            loader.sampler.set_epoch(epoch)

        for features, targets in loader:
            sgd.zero_grad()
            batch_loss = loss_fn(net(features), targets)
            batch_loss.backward()
            sgd.step()
        # Reports the loss of the epoch's final mini-batch only.
        print(f"epoch: {epoch}, loss: {batch_loss.item()}")
# __torch_distributed_end__




In [7]:
if __name__ == "__main__":
    # __torch_single_run_begin__
    # Baseline: run the plain single-process training loop first.
    train_func()
    # __torch_single_run_end__

    # __torch_trainer_begin__
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    # Launch train_func_distributed through Ray Train with 4 workers.
    trainer = TorchTrainer(
        train_loop_per_worker=train_func_distributed,
        scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
    )

    # fit() runs the training and returns its result object.
    results = trainer.fit()

    print("---- end training ---")
    # __torch_trainer_end__

0,1
Current time:,2025-02-17 01:15:00
Running for:,00:00:24.72
Memory:,24.3/188.7 GiB

Trial name,status,loc
TorchTrainer_8d095_00000,TERMINATED,10.128.0.4:133117


2025-02-17 01:14:35,579	INFO data_parallel_trainer.py:344 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
[36m(TrainTrainable pid=133117)[0m GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
[36m(TorchTrainer pid=133117)[0m GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
[36m(RayTrainWorker pid=133965)[0m Setting up process group for: env:// [rank=0, world_size=4]
[36m(TorchTrainer pid=133117)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=133117)[0m - (ip=10.128.0.4, pid=133965) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=133117)[0m - (ip=10.128.0.4, pid=133966) world_rank

[36m(RayTrainWorker pid=133968)[0m epoch: 0, loss: 1.6010398864746094
[36m(RayTrainWorker pid=133968)[0m epoch: 2, loss: 0.7470226287841797[32m [repeated 8x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
Trial TorchTrainer_8d095_00000 completed. Last result: 


2025-02-17 01:15:00,296	INFO tune.py:1042 -- Total run time: 24.76 seconds (24.72 seconds for the tuning loop).


---- end training ---
