Distribute GPUs in round robin mode for distributed_test #46389

6 changes: 2 additions & 4 deletions torch/testing/_internal/distributed/distributed_test.py
@@ -362,16 +362,14 @@ def _init_multigpu_helper(self):
         """
         nGPUs = torch.cuda.device_count()
         world_size = dist.get_world_size()
-        visible_devices = range(nGPUs)

         if BACKEND == "nccl":
             apply_hack_for_nccl()

         nGPUs_per_process = nGPUs // world_size
         rank_to_GPU = {
-            i: list(
-                visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process]
-            )
+            # Each rank has to get the GPU with the index equal to its rank
+            i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)]
Contributor:
Hey @Flamefire

Would it be correct to assume that, with world_size=2 and 4 GPUs in total, we have the following? Before this change, we have:

rank 0 -> gpu [0, 1]
rank 1 -> gpu [2, 3]

After this change, we have:

rank 0 -> gpu [0, 2]
rank 1 -> gpu [1, 3]
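
As a side note, a minimal pure-Python sketch (assuming world_size=2 and 4 GPUs, and copying the two expressions from the diff) reproduces both mappings:

world_size = 2           # number of processes
nGPUs = 4                # torch.cuda.device_count() in the real helper
nGPUs_per_process = nGPUs // world_size

# Old mapping: each rank gets a contiguous block of GPUs
old = {
    i: list(range(nGPUs)[i * nGPUs_per_process:(i + 1) * nGPUs_per_process])
    for i in range(world_size)
}

# New mapping: round robin, so rank i's first GPU is GPU i
new = {
    i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)]
    for i in range(world_size)
}

print(old)  # {0: [0, 1], 1: [2, 3]}
print(new)  # {0: [0, 2], 1: [1, 3]}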

Contributor:
Could you please help me understand why the above change fixes the problem described below? Thx.

Due to NCCL communicator reuse, this then leads to rank 0 using the (kinda) temporary communicator while the other processes might use other GPUs, leading to them trying to create a new communicator and waiting for rank 0 until that creates a new (potentially unrelated) one.

Collaborator (Author):

Yes, your assumption is correct. For an in-depth analysis, see the issue where I posted many details; here is only the summary:

During the barrier that happens very early (on creation of the process group), each process creates a communicator with the GPU index equal to its rank:

int16_t deviceIdx = static_cast<int16_t>(rank_ % numGPUs);

The problem with the old distribution is that rank 1 (in your example) wants to use GPU 2 afterwards and hence needs a new communicator, while rank 0 wants to (continue to) use GPU 0 and hence does not need a new one. Because creating a communicator is a collective operation, this fails: rank 1 waits for rank 0, which never joins.

Later, rank 0 might want to create a communicator for GPUs 0 and 1 and joins the still-waiting rank 1 in creating one, but now there is a mismatch: rank 0 is already further along and expects 4 total GPU ranks (2 per process) while rank 1 expects only 2. This leads to a (correct) system error in the NCCL code, but the real problem occurs earlier.
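
To make this concrete, here is a small sketch (same world_size=2 / 4-GPU assumption as above) checking that under the new round-robin mapping every rank's first GPU index equals its rank, i.e. exactly the device the early barrier already created a communicator on (rank_ % numGPUs), so the existing communicator is reused and no rank is left waiting alone:

world_size, nGPUs = 2, 4
nGPUs_per_process = nGPUs // world_size

rank_to_GPU = {
    i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)]
    for i in range(world_size)
}

for rank, gpus in rank_to_GPU.items():
    # The barrier created a communicator on GPU (rank % nGPUs); with the
    # round-robin mapping that is also the first GPU this rank uses later.
    assert gpus[0] == rank % nGPUs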

             for i in range(world_size)
         }
         return rank_to_GPU