Merged
8 changes: 0 additions & 8 deletions docs/pjrt.md
@@ -98,14 +98,6 @@ PJRT_DEVICE=GPU GPU_NUM_DEVICES=4 python3 xla/test/test_train_mp_imagenet.py --f
 Currently, only a single host is supported, and multi-host GPU cluster support
 will be added in a future release.
 
-#### Known Issues
-
-The GPU integration has issues with replica groups in collectives (i.e. the
-`group` parameter of the XLA collective ops). If the replica groups are
-changed, there is a chance that the process will hang. For now, the
-recommendation is to use a single replica group containing all devices, as is
-the case in data parallel training.
-
 ## Key differences from XRT
 
 Although in most cases we expect PjRt and XRT to work mostly interchangeably
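For context, the "Known Issues" section removed above concerned the replica-group argument of the XLA collective ops, and its workaround was a single group spanning every device (the data-parallel case). A minimal sketch of what "a single replica group containing all devices" means, using a hypothetical helper (plain Python, not a torch_xla API):

```python
def single_replica_group(num_devices: int) -> list[list[int]]:
    """Build one replica group containing every device ordinal.

    Hypothetical helper for illustration: XLA collective ops take a list
    of groups, each group being a list of device ordinals. The workaround
    was to pass exactly one group covering all devices.
    """
    return [list(range(num_devices))]

# With 4 devices, every collective runs over the single group [[0, 1, 2, 3]].
groups = single_replica_group(4)
```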
2 changes: 2 additions & 0 deletions test/pjrt/test_ddp.py
@@ -31,6 +31,8 @@ def _ddp_init(index: int = ...):
   def test_ddp_init(self):
     pjrt._run_multiprocess(self._ddp_init)
 
+  @absltest.skipIf(pjrt.device_type() == 'GPU',
+                   "GPU device is not supported by pjrt.spawn_threads")
   def test_ddp_init_threaded(self):
     pjrt.spawn_threads(self._ddp_init)
 
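`absltest.skipIf` mirrors the standard-library `unittest.skipIf`: the condition is evaluated once, at class-definition time, and skipped tests are reported rather than failed. A self-contained sketch of the same pattern with stdlib `unittest` (`device_type()` here is a stand-in for `pjrt.device_type()`):

```python
import unittest


def device_type() -> str:
    # Stand-in for pjrt.device_type(); assume a CPU-only environment here.
    return 'CPU'


class SpawnThreadsTest(unittest.TestCase):

  # The skip condition is evaluated when the class is defined, not per run.
  @unittest.skipIf(device_type() == 'GPU',
                   'GPU device is not supported by pjrt.spawn_threads')
  def test_runs_when_not_gpu(self):
    self.assertTrue(True)
```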
3 changes: 1 addition & 2 deletions test/run_tests.sh
@@ -97,8 +97,7 @@ function run_xla_backend_mp {
 function run_pjrt {
   echo "Running in PjRt runtime: $@"
   if [ -x "$(command -v nvidia-smi)" ]; then
-    # TODO(jonbolin): Only run GPU tests with a single device due to collective failures.
-    PJRT_DEVICE=GPU GPU_NUM_DEVICES=1 run_test "$@"
+    PJRT_DEVICE=GPU run_test "$@"
   else
     # TODO(darisoy): run these tests with multiple CPU devices, this fails due to TF issue.
     PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_test "$@"
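The device-detection branch above relies on `command -v`, which prints the path of an executable if it is found, and `-x`, which checks the result is executable. A minimal standalone sketch of the same check (variable names are illustrative, not from the script):

```shell
#!/bin/sh
# Pick a PJRT device by probing for the NVIDIA driver tooling:
# `command -v nvidia-smi` prints its path if present, and `[ -x ... ]`
# confirms it is executable. Fall back to CPU otherwise.
if [ -x "$(command -v nvidia-smi)" ]; then
  device=GPU
else
  device=CPU
fi
echo "PJRT_DEVICE=$device"
```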
3 changes: 1 addition & 2 deletions test/utils/run_test_coverage.sh
@@ -53,8 +53,7 @@ function run_xla_backend_mp {
 function run_pjrt {
   echo "Running in PjRt runtime: $@"
   if [ -x "$(command -v nvidia-smi)" ]; then
-    # TODO(jonbolin): Only run GPU tests with a single device due to collective failures.
-    PJRT_DEVICE=GPU GPU_NUM_DEVICES=1 run_test "$@"
+    PJRT_DEVICE=GPU run_test "$@"
   else
     # TODO(darisoy): run these tests with multiple CPU devices, this fails due to TF issue.
     PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_test "$@"
1 change: 1 addition & 0 deletions torch_xla/experimental/pjrt.py
@@ -373,6 +373,7 @@ def _initialize_single_process(local_rank: int, local_world_size: int):
 
 def spawn_threads(fn: Callable, args: Tuple = ()) -> None:
   """Run function in one process with one thread per addressable device."""
+  assert device_type() != 'GPU', "spawn_threads does not support GPU device"
   spawn_fn = _SpawnFn(fn, *args)
   _run_thread_per_device(
       local_rank=0,
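The added `assert` is a fail-fast guard: an unsupported backend raises immediately with a clear message instead of hanging or failing later inside the thread pool. A self-contained sketch of the pattern (hypothetical names; `device_type()` stands in for `pjrt.device_type()`, and the body is stubbed out):

```python
def device_type() -> str:
    # Stand-in for pjrt.device_type(); assume a CPU environment here.
    return 'CPU'


def spawn_threads_guarded(fn, args=()):
    # Fail fast before doing any work if the backend is unsupported.
    assert device_type() != 'GPU', (
        'spawn_threads does not support GPU device')
    # ... the real function would run fn on one thread per device;
    # here we just invoke it once for illustration ...
    return fn(*args)


result = spawn_threads_guarded(lambda x: x + 1, (41,))
```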