In [1]:
import torch

In [2]:
torch.cuda.device_count()

2

In [3]:
def sample(device):
    model = lambda x: x**2 + 3
    return model(torch.randint(0, 1000, (10,), device=device))

In [4]:
sample(torch.device('cuda:0'))

tensor([335244, 302503,  19884, 384403, 810003, 839059, 693892, 233292, 952579,
        574567], device='cuda:0')

In [5]:
from src.nn_handler import parallelize_on_gpus

In [6]:
@parallelize_on_gpus()
def sample_gpu(device):
    """
    This function will be executed on a specific GPU.
    The `device` argument (e.g., 'cuda:0', 'cuda:1') is provided by the decorator.
    """
    # Create the model on the correct device (if it were a real nn.Module)
    model = lambda x: x**2 + 3

    # Create the input tensor directly on the target device
    input_tensor = torch.randint(0, 1000, (10,), device=device)

    print(f"Running on device: {torch.get_device(input_tensor)}")

    return model(input_tensor).cpu().numpy()

In [7]:
sample_gpu()

Running on device: 0
Running on device: 1


[array([990028, 868627,   8467,  84684, 549084,  21612, 429028, 480252,
         31332, 234259]),
 array([255028,  37252, 192724,  16132,  84103,    444, 683932, 174727,
        376999, 279844])]

In [8]:
# Similarly, if you use only .cuda() and .cpu() placements, it is handled automatically and no device argument is needed.
@parallelize_on_gpus(pass_device=False)
def sample_gpu():
    """
    This function will be executed on a specific GPU.
    The `device` argument (e.g., 'cuda:0', 'cuda:1') is provided by the decorator.
    """
    # Create the model on the correct device (if it were a real nn.Module)
    model = lambda x: x**2 + 3

    # Create the input tensor and pass it to the target device
    input_tensor = torch.randint(0, 1000, (2000,)).cuda()

    print(f"Running on device: {torch.get_device(input_tensor)}")

    return model(input_tensor).cpu().numpy()

sample_gpu()

Running on device: 0
Running on device: 1


[array([525628,   5932, 237172, ..., 220903, 153667, 418612]),
 array([978124, 698899,   6564, ...,  36867, 715719, 760387])]

In [9]:
@parallelize_on_gpus()
def heavy_gpu_work(device: torch.device, *, n: int = 4096, steps: int = 20) -> torch.Tensor:
    """
    A computationally intensive task: repeatedly multiply two large matrices
    of shape (n, n) on a given device.

    Parameters
    ----------
    device : torch.device
        Target CUDA device.
    n : int
        Size of the square matrices. 4096 → ~128 MB per matrix in FP32.
    steps : int
        Number of multiply–assign iterations.

    Returns
    -------
    torch.Tensor
        Final matrix produced by the loop (still on `device`).
    """
    # Two random matrices on the selected GPU
    a = torch.randn(n, n, device=device, dtype=torch.float32)
    b = torch.randn(n, n, device=device, dtype=torch.float32)

    # Repeated multiplications to ensure heavy load
    for _ in range(steps):
        c = a @ b
        # Re-use the result in the next iteration to keep the memory footprint stable
        a, b = b, c

    return c.cpu().numpy()


heavy_gpu_work(n=64**2, steps=1000)

[array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32),
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)]

In [10]:
def heavy_gpu_work(device: torch.device, *, n: int = 4096, steps: int = 20) -> torch.Tensor:
    """
    A computationally intensive task: repeatedly multiply two large matrices
    of shape (n, n) on a given device.

    Parameters
    ----------
    device : torch.device
        Target CUDA device.
    n : int
        Size of the square matrices. 4096 → ~128 MB per matrix in FP32.
    steps : int
        Number of multiply–assign iterations.

    Returns
    -------
    torch.Tensor
        Final matrix produced by the loop (still on `device`).
    """
    # Two random matrices on the selected GPU
    a = torch.randn(n, n, device=device, dtype=torch.float32)
    b = torch.randn(n, n, device=device, dtype=torch.float32)

    # Repeated multiplications to ensure heavy load
    for _ in range(steps):
        c = a @ b
        # Re-use the result in the next iteration to keep the memory footprint stable
        a, b = b, c

    return c.cpu().numpy()


heavy_gpu_work(torch.device("cuda:1"), n=64**2, steps=1000 * 2)

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)