Fix the device type for the with_comms decorator

Found by yifuwang: it looks like we were wrongly using self.device_type="cuda" for the gloo backend, which was triggering some flakiness (see #125366). ghstack-source-id: a845ed283106e463ef87e4576e67573f514bb487 Pull Request resolved: #125798
pytorch · May 14, 2024 · 12ba7f4 · 12ba7f4
1 parent bd3cbdb
commit 12ba7f4
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 11 deletions.
diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py
@@ -52,10 +52,6 @@ def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0):
 
 
 class DeviceMeshTest(DTensorTestBase):
-    @property
-    def world_size(self):
-        return 4
-
     def test_init_process_group(self):
         device_type = _get_device_type(self.world_size)
         mesh_tensor = torch.arange(4).reshape(2, 2)

diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -35,7 +35,6 @@
 DEVICE_TYPE = (
     "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
 )
-PG_BACKEND = "nccl" if DEVICE_TYPE == "cuda" else "gloo"
 
 NUM_DEVICES = 4
 
@@ -298,10 +297,11 @@ def world_size(self) -> int:
 
     @property
     def backend(self) -> str:
-        return PG_BACKEND
+        backend = "nccl" if self.device_type == "cuda" else "gloo"
+        return backend
 
     def build_device_mesh(self) -> DeviceMesh:
-        return DeviceMesh(DEVICE_TYPE, list(range(self.world_size)))
+        return DeviceMesh(self.device_type, list(range(self.world_size)))
 
     def init_pg(self) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
@@ -359,11 +359,11 @@ def with_comms(func: TestFunc) -> TestFunc:
     def wrapper(
         self, *args: Tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
     ) -> None:
-        # if backend not specified, and cuda available, then use nccl, else gloo
-        if torch.cuda.is_available() and torch.cuda.device_count() >= self.world_size:
-            self.device_type = "cuda"
-        else:
+        # if enough GPU we can use GPU, otherwise we fallback to CPU
+        if not torch.cuda.is_available() or torch.cuda.device_count() < self.world_size:
             self.device_type = "cpu"
+        else:
+            self.device_type = DEVICE_TYPE
 
         self.init_pg()
         func(self, *args, **kwargs)  # type: ignore[misc]