fix the device type for with_comms decorator (#125798)

Found by @yifuwang: it looks like we are wrongly using self.device_type="cuda" for the gloo backend, which is triggering some flakiness (see #125366). Pull Request resolved: #125798 Approved by: https://github.com/yifuwang
pytorch · May 19, 2024 · c73f90c · c73f90c
1 parent 8dced59
commit c73f90c
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 9 deletions.
diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py
@@ -23,10 +23,10 @@
     is_nccl_available,
     ProcessGroup,
 )
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
-    skip_unless_torch_gpu,
     with_comms,
 )
 from torch.testing._internal.distributed.fake_pg import FakeStore
@@ -66,7 +66,7 @@ def test_init_process_group(self):
         self.destroy_pg()
 
     @with_comms
-    @skip_unless_torch_gpu
+    @skip_if_lt_x_gpu(4)
     def test_assert_invalid_mesh_tensor(self):
         mesh = torch.arange(self.world_size).to(self.rank)
         with self.assertRaises(ValueError):

diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -35,7 +35,6 @@
 DEVICE_TYPE = (
     "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
 )
-PG_BACKEND = "nccl" if DEVICE_TYPE == "cuda" else "gloo"
 
 NUM_DEVICES = 4
 
@@ -298,10 +297,11 @@ def world_size(self) -> int:
 
     @property
     def backend(self) -> str:
-        return PG_BACKEND
+        backend = "nccl" if self.device_type == "cuda" else "gloo"
+        return backend
 
     def build_device_mesh(self) -> DeviceMesh:
-        return DeviceMesh(DEVICE_TYPE, list(range(self.world_size)))
+        return DeviceMesh(self.device_type, list(range(self.world_size)))
 
     def init_pg(self) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
@@ -359,11 +359,11 @@ def with_comms(func: TestFunc) -> TestFunc:
     def wrapper(
         self, *args: Tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
     ) -> None:
-        # if backend not specified, and cuda available, then use nccl, else gloo
-        if torch.cuda.is_available() and torch.cuda.device_count() >= self.world_size:
-            self.device_type = "cuda"
-        else:
+        # if enough GPU we can use GPU, otherwise we fallback to CPU
+        if not torch.cuda.is_available() or torch.cuda.device_count() < self.world_size:
             self.device_type = "cpu"
+        else:
+            self.device_type = DEVICE_TYPE
 
         self.init_pg()
         func(self, *args, **kwargs)  # type: ignore[misc]