In [1]:
import os
# NCCL reads these settings from the process environment, so they are applied
# here, in the first cell, before torch is imported in the next cell.
os.environ.update({
    "NCCL_DEBUG": "INFO",        # verbose NCCL logging (the "NCCL INFO" lines in the output)
    "NCCL_P2P_DISABLE": "1",     # turn off the direct GPU-to-GPU (peer-to-peer) transport
    "NCCL_IB_DISABLE": "1",      # turn off the InfiniBand/RoCE transport; sockets are used instead
})


In [2]:
#!/usr/bin/env python3
import time
import subprocess
import torch
import torch.nn as nn
import torch.optim as optim

def print_gpu_utilization():
    """
    Print one utilization line per GPU, as reported by ``nvidia-smi``.

    Runs ``nvidia-smi`` asking for index, compute utilization and memory
    utilization in header-less, unit-less CSV, then prints each row in a
    readable form. Rows are printed as they are parsed.

    Nothing is returned and nothing is raised: any exception (the command
    failing to run, a non-zero exit status, a row that does not have exactly
    three comma-separated fields, ...) is caught and reported as a single
    "Error querying nvidia-smi" line.
    """
    try:
        raw_bytes = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=index,utilization.gpu,utilization.memory",
                "--format=csv,noheader,nounits",
            ]
        )
        report = raw_bytes.decode().strip()
        for row in report.splitlines():
            # Each row looks like "0, 35, 12": index, GPU %, memory %.
            fields = [field.strip() for field in row.split(",")]
            gpu_index, compute_pct, memory_pct = fields
            print(f"  GPU {gpu_index}: {compute_pct}% GPU util, {memory_pct}% memory util")
    except Exception as err:
        # Best-effort reporting: a monitoring failure must not abort the caller.
        print(f"Error querying nvidia-smi: {err}")

class DummyModel(nn.Module):
    """
    Minimal MLP used only to put some work on the GPU.

    Maps 1024 features to 2048 and back to 1024, with an in-place ReLU after
    each linear layer (so outputs are non-negative).
    """

    def __init__(self):
        super().__init__()
        width, hidden = 1024, 2048
        # Layer order fixes both the state_dict keys (net.0, net.2) and the
        # order in which weights are randomly initialised.
        stack = [
            nn.Linear(width, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, width),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` (last dimension 1024) through the layer stack."""
        activations = self.net(x)
        return activations

def main():
    """
    Run a short multi-GPU training loop and print GPU utilization after each step.

    Returns immediately (after printing a message) when CUDA is unavailable.
    Otherwise wraps a ``DummyModel`` in ``nn.DataParallel``, runs five SGD
    steps on one fixed random batch, and after every step prints the loss and
    the output of ``print_gpu_utilization()``, then sleeps for one second.
    """
    # Guard: nothing to demonstrate without a CUDA device.
    if not torch.cuda.is_available():
        print("No CUDA devices found.")
        return

    print(f"Found {torch.cuda.device_count()} CUDA device(s).")

    # DataParallel splits each input batch across the visible GPUs.
    model = nn.DataParallel(DummyModel().cuda())

    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-2)

    # One fixed random batch, created on the GPU and reused every iteration.
    batch_size, input_dim = 64, 1024
    data = torch.randn(batch_size, input_dim, device="cuda")
    target = torch.randn(batch_size, input_dim, device="cuda")

    for epoch in range(5):
        # Standard step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()

        print(f"\nEpoch {epoch+1} — loss: {loss.item():.4f}")
        print("GPU utilization:")
        print_gpu_utilization()

        # Short pause so the per-step numbers can be followed by eye.
        time.sleep(1)

# Entry point. Inside a notebook kernel __name__ is "__main__", so executing
# this cell runs main() immediately (its output follows the cell).
if __name__ == "__main__":
    main()


Found 4 CUDA device(s).
6fdef14527a4:115:115 [0] NCCL INFO cudaDriverVersion 12000
6fdef14527a4:115:115 [0] NCCL INFO Bootstrap: Using eth0:172.20.0.11<0>
6fdef14527a4:115:115 [0] NCCL INFO NCCL version 2.26.2+cuda12.2
6fdef14527a4:115:195 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin.
6fdef14527a4:115:195 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
6fdef14527a4:115:195 [0] NCCL INFO NET/Socket : Using [0]eth0:172.20.0.11<0>
6fdef14527a4:115:195 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
6fdef14527a4:115:195 [0] NCCL INFO Using network Socket
6fdef14527a4:115:196 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
6fdef14527a4:115:196 [1] NCCL INFO Using network Socket
6fdef14527a4:115:197 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
6fdef14527a4:115:197 [2] NCCL INFO Using network Socket
6fdef14527a4:115:198 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler

RuntimeError: NCCL Error 2: unhandled system error (run with NCCL_DEBUG=INFO for details)