Commit ee72332

Cherry pick pjrt:// init method rename and doc updates (#5562)
* Change `pjrt://` init method to `xla://` (#5560)
* Update PJRT documentation for the 2.1 release (#5557)
  * Update PJRT documentation for the 2.1 release
  * clarify plugins
  * clarify PJRT doc
  * Update `pjrt://` to `xla://`
1 parent 2c07df9 commit ee72332

11 files changed: +101, -85 lines changed

README.md

Lines changed: 6 additions & 2 deletions
@@ -3,6 +3,10 @@
 <b>Current CI status:</b> ![GitHub Actions
 status](https://github.com/pytorch/xla/actions/workflows/build_and_test.yml/badge.svg)
 
+Note: PyTorch/XLA r2.1 will be the last release with XRT available as a legacy
+runtime. Our main release build will not include XRT, but it will be available
+in a separate package.
+
 PyTorch/XLA is a Python package that uses the [XLA deep learning
 compiler](https://www.tensorflow.org/xla) to connect the [PyTorch deep learning
 framework](https://pytorch.org/) and [Cloud
@@ -70,6 +74,7 @@ If you're using `DistributedDataParallel`, make the following changes:
 +import torch_xla.core.xla_model as xm
 +import torch_xla.distributed.parallel_loader as pl
 +import torch_xla.distributed.xla_multiprocessing as xmp
++import torch_xla.distributed.xla_backend
 
  def _mp_fn(rank, world_size):
    ...
@@ -78,7 +83,7 @@ If you're using `DistributedDataParallel`, make the following changes:
 -  os.environ['MASTER_PORT'] = '12355'
 -  dist.init_process_group("gloo", rank=rank, world_size=world_size)
 +  # Rank and world size are inferred from the XLA device runtime
-+  dist.init_process_group("xla", init_method='pjrt://')
++  dist.init_process_group("xla", init_method='xla://')
 +
 +  model.to(xm.xla_device())
 +  # `gradient_as_bucket_view=tpu` required for XLA
@@ -101,7 +106,6 @@ If you're using `DistributedDataParallel`, make the following changes:
 +  xmp.spawn(_mp_fn, args=())
 ```
 
-
 Additional information on PyTorch/XLA, including a description of its semantics
 and functions, is available at [PyTorch.org](http://pytorch.org/xla/). See the
 [API Guide](API_GUIDE.md) for best practices when writing networks that run on
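
For orientation, here is a minimal end-to-end sketch of the DDP flow the README diff above describes, assuming a PyTorch/XLA 2.1 install with `PJRT_DEVICE` set (for example `PJRT_DEVICE=TPU`). The toy model, random data, and step count are illustrative and not part of the commit.

```python
# Hedged sketch only: assembles the README's DDP snippet into one runnable file.
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.xla_backend  # registers the `xla` backend and `xla://` init method


def _mp_fn(index):
  # Rank and world size are inferred from the XLA device runtime.
  dist.init_process_group('xla', init_method='xla://')

  device = xm.xla_device()
  model = nn.Linear(128, 10).to(device)
  # The docs pass gradient_as_bucket_view=True when wrapping models for XLA.
  ddp_model = DDP(model, gradient_as_bucket_view=True)

  optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)
  loss_fn = nn.MSELoss()

  for _ in range(10):
    optimizer.zero_grad()
    data = torch.randn(16, 128, device=device)
    target = torch.randn(16, 10, device=device)
    loss = loss_fn(ddp_model(data), target)
    loss.backward()
    optimizer.step()
    xm.mark_step()  # flush the lazily traced graph to the device


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
```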

docs/pjrt.md

Lines changed: 34 additions & 26 deletions
@@ -1,18 +1,23 @@
-# PJRT Runtime (Beta)
+# PJRT Runtime
 
-_This document reflects the current state of PJRT support in current nightly
-builds_. See the [same document on the r2.0 branch](https://github.com/pytorch/xla/blob/r2.0/docs/pjrt.md)
-for the status in the latest stable release.
-
-The PyTorch/XLA team is currently migrating from the currently-supported XRT
-runtime to the [PJRT
+PyTorch/XLA has migrated from the TensorFlow-based XRT runtime to the [PJRT
 runtime](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla/pjrt)
 used by [JAX](https://github.com/google/jax).
 
-PJRT is available for preview in PyTorch/XLA 2.0. **We are planning to make
-PJRT our officially supported runtime**, so we encourage all users to experiment
-with it. We aim to make PJRT stable in release 2.1, so if you encounter a bug
-with PJRT, please file an issue on GitHub with the `runtime` tag.
+If you encounter a bug with PJRT, please file an issue on GitHub with the
+`runtime` tag.
+
+_New features in PyTorch/XLA r2.1_:
+
+* PJRT is stable in PyTorch/XLA r2.1!
+* Public runtime APIs have moved from `torch_xla.experimental.pjrt` to
+  `torch_xla.runtime`.
+* The `pjrt://` init method has been renamed to `xla://`, and it is registered
+  by `torch_xla.distributed.xla_backend`.
+* The previous `torch_xla.experimental.*` names are still available in this
+  release for compatibility.
+* `torchrun` is now supported when using `init_method='xla://'`.
+* New plugins for XPU and Neuron via the PJRT C API.
 
 _New features in PyTorch/XLA r2.0_:
 
@@ -29,7 +34,7 @@ _New features in PyTorch/XLA r2.0_:
 ## TL;DR
 
 * To use the PJRT preview runtime, set the `PJRT_DEVICE` environment variable to
-  `CPU`, `TPU, or `GPU`
+  `CPU`, `TPU`, or `GPU`
 * In XRT, all distributed workloads are multiprocess, with one process per
   device. On TPU v2 and v3 in PJRT, workloads are multiprocess and multithreaded
   (4 processes with 2 threads each), so your workload should be thread-safe. See
@@ -45,7 +50,7 @@ _New features in PyTorch/XLA r2.0_:
   The global `torch` RNG is _not_ thread-safe, even if you set the same
   `torch.manual_seed` across replicas.
 * To use `torch.distributed`, import `torch_xla.experimental.pjrt_backend` and
-  use the `pjrt://` `init_method`.
+  use the `xla://` `init_method`.
 * These steps are optional for GPU and TPU v4.
 
 Sample diff from XRT to PJRT:
@@ -62,20 +67,19 @@ Sample diff from XRT to PJRT:
  import torch_xla.distributed.parallel_loader as pl
  import torch_xla.distributed.xla_backend
  import torch_xla.distributed.xla_multiprocessing as xmp
-+import torch_xla.experimental.pjrt_backend
-+import torch_xla.experimental.pjrt as pjrt
++import torch_xla.runtime as xr
 
 
  def _mp_fn(index):
    device = xm.xla_device()
 -  dist.init_process_group('xla', rank=xm.get_ordinal(), world_size=xm.xrt_world_size())
-+  dist.init_process_group('xla', init_method='pjrt://')
++  dist.init_process_group('xla', init_method='xla://')
 
    torch.manual_seed(42)
    model = nn.Linear(128, 10).to(device)
 
 +  # Optional for TPU v4 and GPU
-+  pjrt.broadcast_master_param(model)
++  xm.broadcast_master_param(model)
    model = DDP(model, gradient_as_bucket_view=True)
 
    loss_fn = nn.MSELoss()
@@ -286,15 +290,15 @@ without altering the XLA graph and/or synchronizing a subset of workers),
 consider using
 [`torch.distributed.barrier`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.barrier)
 or
-`[torch.distributed.all_gather_object](https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_object)`
+[`torch.distributed.all_gather_object`](https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_object)
 with a `gloo` process group. If you are also using the `xla` `torch.distributed`
 backend, you can use `torch.new_group` to create a `gloo` subgroup. See [this
 example](https://pytorch.org/docs/stable/distributed.html#monitored-barrier)
 from the PyTorch documentation. Keep in mind these constraints:
 
 * `torch.distributed` is not fully supported on TPU v2/v3. Only a subset of
   operations with the `xla` backend are implemented, and `gloo` will likely not
-  work as expected in a multiprocessing context.
+  work as expected in a multithreaded context.
 * In our experiments, `gloo` does not scale well to thousands of TPU chips, so
   expect this alternative to be less reliable than using `xm.rendezvous` with
   PJRT at large scales.
@@ -305,7 +309,7 @@ _New in PyTorch/XLA r2.0_
 
 When using PJRT with `torch.distributed` and
 `[torch.nn.parallel.DistributedDataParallel](https://github.com/pytorch/xla/blob/master/docs/ddp.md)`
-we strongly recommend using the new `pjrt://` `init_method`, which automatically
+we strongly recommend using the new `xla://` `init_method`, which automatically
 finds the replica IDs, world size, and master IP by querying the runtime. For
 example:
 
@@ -316,12 +320,12 @@ import torch_xla.core.xla_model as xm
 import torch_xla.distributed.xla_multiprocessing as xmp
 from torch_xla.experimental import pjrt
 
-# Required for `pjrt://` init_method
-import torch_xla.experimental.pjrt_backend
+# Required for `xla://` init_method and `xla` backend
+import torch_xla.distributed.xla_backend
 
 def _all_gather(index: int):
   # No need to pass in `rank` or `world_size`
-  dist.init_process_group('xla', init_method='pjrt://')
+  dist.init_process_group('xla', init_method='xla://')
 
   t = torch.tensor([index], dtype=torch.int32, device=xm.xla_device())
   output = [torch.zeros_like(t) for _ in range(dist.get_world_size())]
@@ -334,10 +338,14 @@ if __name__ == '__main__':
   xmp.spawn(_all_gather)
 ```
 
-Note: Although the `pjrt://` init_method is not required on TPU v4, it is still
+Note: Although the `xla://` init_method is not required on TPU v4, it is still
 recommended. If you use `env://`, `MASTER_ADDR` must be set to IP host that has
-device 0, which is _not_ always worker 0. The `pjrt://` init_method finds this
-IP automatically and supports TPU v2/v3.
+device 0, which is _not_ always worker 0. The `xla://` init_method finds this
+IP automatically.
+
+Note: For TPU v2/v3, you still need to import
+`torch_xla.experimental.pjrt_backend`, as TPU v2/v3 support in
+`torch.distributed` is still experimental.
 
 For more information about using `DistributedDataParallel` on PyTorch/XLA, see
 [`ddp.md`](./ddp.md) on TPU V4. For an example that uses DDP and PJRT together,
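
The gloo-subgroup guidance in the pjrt.md hunk above is easier to follow with a concrete sketch. The example below is illustrative and not part of the commit; it assumes `dist.new_group` can reuse the store established by the `xla://` rendezvous, as the document suggests, and the gathered dictionary is purely a placeholder.

```python
# Illustrative sketch of the gloo-subgroup pattern described in docs/pjrt.md.
import torch.distributed as dist
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.xla_backend  # registers the `xla` backend and `xla://`


def _mp_fn(index):
  # Device collectives go through the `xla` backend.
  dist.init_process_group('xla', init_method='xla://')

  # A separate gloo subgroup for host-side coordination (CPU tensors / objects).
  gloo_group = dist.new_group(ranks=list(range(dist.get_world_size())),
                              backend='gloo')

  # Exchange arbitrary Python objects without touching the XLA graph.
  outputs = [None] * dist.get_world_size()
  dist.all_gather_object(outputs, {'rank': dist.get_rank()}, group=gloo_group)

  if dist.get_rank() == 0:
    print(outputs)


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
```

As the diff itself cautions, gloo is not expected to behave well in TPU v2/v3's multithreaded workers and scales poorly to very large slices, so `xm.rendezvous` remains the default recommendation.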

test/pjrt/test_ddp.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel as DDP
 import torch_xla.core.xla_model as xm
-import torch_xla.experimental.pjrt_backend
+import torch_xla.distributed.xla_backend
 from torch_xla import runtime as xr
 from torch_xla._internal import pjrt, tpu
 
@@ -24,7 +24,7 @@ class TestPjRtDistributedDataParallel(parameterized.TestCase):
 
   @staticmethod
   def _ddp_init(index: int = ...):
-    dist.init_process_group('xla', init_method='pjrt://')
+    dist.init_process_group('xla', init_method='xla://')
     device = xm.xla_device()
     model = nn.Linear(10, 10).to(device)
     ddp_model = DDP(model)
@@ -41,7 +41,7 @@ def test_ddp_init_threaded(self):
   def test_ddp_correctness(self, use_large_net: bool):
     pjrt.run_multiprocess(
         util.ddp_correctness,
-        init_method='pjrt://',
+        init_method='xla://',
         use_large_net=use_large_net,
         debug=FLAGS.debug)

test/pjrt/test_torchrun.py

Lines changed: 2 additions & 2 deletions
@@ -3,15 +3,15 @@
 import torch
 import torch.distributed as dist
 import torch_xla.core.xla_model as xm
-import torch_xla.experimental.pjrt_backend
+import torch_xla.distributed.xla_backend
 import torch_xla.runtime as xr
 import torch_xla.utils.utils as xu
 
 
 class TestTorchrun(absltest.TestCase):
 
   def test_all_gather(self):
-    dist.init_process_group('xla', init_method='pjrt://')
+    dist.init_process_group('xla', init_method='xla://')
 
     dist_world_size = xu.getenv_as('WORLD_SIZE', int)
     devices_per_thread = xr.addressable_device_count()
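
Since this test exercises the newly supported `torchrun` path, a hedged sketch of a standalone script launched by `torchrun` (rather than `xmp.spawn`) may help; the file name, launch command, and process count in the comments are assumptions, not taken from the commit.

```python
# torchrun_allgather.py -- hypothetical example, launched along the lines of:
#   PJRT_DEVICE=TPU torchrun --nnodes=1 --nproc_per_node=4 torchrun_allgather.py
import torch
import torch.distributed as dist
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend  # registers the `xla` backend and `xla://`
import torch_xla.runtime as xr

# torchrun sets LOCAL_RANK/LOCAL_WORLD_SIZE; the `xla://` rendezvous handler
# added in this commit reads them, so no explicit rank or world_size is passed.
dist.init_process_group('xla', init_method='xla://')

t = torch.tensor([xr.global_ordinal()], dtype=torch.int32, device=xm.xla_device())
output = [torch.zeros_like(t) for _ in range(dist.get_world_size())]
dist.all_gather(output, t)
print([x.item() for x in output])
```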

test/test_torch_distributed_xla_backend.py

Lines changed: 1 addition & 2 deletions
@@ -10,7 +10,6 @@
 import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.xla_backend
-import torch_xla.experimental.pjrt_backend
 from torch_xla import runtime as xr
 
 
@@ -42,7 +41,7 @@ class XlaBackendTest(parameterized.TestCase):
   def setUpClass(cls):
     # Add no-op all-reduce ops to HLO
     os.environ['XLA_ALWAYS_ALLREDUCE'] = '1'
-    dist.init_process_group('xla', init_method='pjrt://')
+    dist.init_process_group('xla', init_method='xla://')
 
   def tearDown(self) -> None:
     # Purge all computations attached the device.

test/test_train_mp_imagenet.py

Lines changed: 2 additions & 3 deletions
@@ -31,7 +31,7 @@
     '--ddp': {
         'action': 'store_true',
     },
-    # Use pjrt:// init_method instead of env:// for `torch.distributed`.
+    # Use xla:// init_method instead of env:// for `torch.distributed`.
     # Required for DDP on TPU v2/v3 when using PJRT.
     '--pjrt_distributed': {
         'action': 'store_true',
@@ -180,8 +180,7 @@ def _train_update(device, step, loss, tracker, epoch, writer):
 
 def train_imagenet():
   if FLAGS.pjrt_distributed:
-    import torch_xla.experimental.pjrt_backend
-    dist.init_process_group('xla', init_method='pjrt://')
+    dist.init_process_group('xla', init_method='xla://')
   elif FLAGS.ddp:
     dist.init_process_group(
         'xla', world_size=xm.xrt_world_size(), rank=xm.get_ordinal())

test/test_train_mp_mnist.py

Lines changed: 1 addition & 2 deletions
@@ -77,8 +77,7 @@ def _train_update(device, step, loss, tracker, epoch, writer):
 
 def train_mnist(flags, **kwargs):
   if flags.pjrt_distributed:
-    import torch_xla.experimental.pjrt_backend
-    dist.init_process_group('xla', init_method='pjrt://')
+    dist.init_process_group('xla', init_method='xla://')
   elif flags.ddp:
     dist.init_process_group(
         'xla', world_size=xm.xrt_world_size(), rank=xm.get_ordinal())

torch_xla/_internal/rendezvous.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+import datetime
+import logging
+import threading
+
+import torch.distributed as dist
+from torch_xla.distributed import xla_backend
+from torch_xla import runtime as xr
+from torch_xla._internal import pjrt
+from torch_xla._internal import tpu
+import torch_xla.utils.utils as xu
+
+_store = None
+_store_lock = threading.Lock()
+
+
+def pjrt_rendezvous_handler(url: str,
+                            timeout: datetime.timedelta = ...,
+                            **kwargs):
+  # Assume `xmp.spawn` has not been called when using torchrun
+  if dist.is_torchelastic_launched():
+    local_world_size = xu.getenv_as('LOCAL_WORLD_SIZE', int)
+    local_rank = xu.getenv_as('LOCAL_RANK', int)
+    pjrt.initialize_multiprocess(local_rank, local_world_size)
+
+  master_ip = xu.getenv_as('MASTER_ADDR', str)
+  if not master_ip:
+    master_ip = tpu.discover_master_worker_ip() if xr.device_type(
+    ) == 'TPU' else 'localhost'
+
+  master_port = xu.getenv_as('MASTER_PORT', int, 12355)
+  world_size = xr.world_size()
+  with _store_lock:
+    global _store
+    if not _store:
+      if xu.getenv_as('TORCHELASTIC_USE_AGENT_STORE', str) == 'True':
+        attempt = xu.getenv_as('TORCHELASTIC_RESTART_COUNT', int, defval=0)
+        tcp_store = dist.TCPStore(
+            master_ip, master_port, xr.process_count(), is_master=False)
+        _store = dist.PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
+      else:
+        _store = dist.TCPStore(
+            master_ip,
+            master_port,
+            xr.process_count(),
+            is_master=xr.process_index() == 0)
+
+  yield (_store, xr.global_ordinal(), world_size)
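
The new module above is a `torch.distributed` rendezvous handler: a generator that yields a `(store, rank, world_size)` triple for a given URL scheme. The sketch below illustrates only that contract, using a made-up `demo://` scheme and single-process values that are not part of the commit.

```python
# Minimal sketch of the rendezvous-handler protocol that pjrt_rendezvous_handler
# implements; the 'demo' scheme and single-process values are hypothetical.
import torch.distributed as dist


def demo_rendezvous_handler(url: str, **kwargs):
  # A handler yields (store, rank, world_size). pjrt_rendezvous_handler does the
  # same, but builds a TCPStore and queries torch_xla.runtime for rank/world size.
  store = dist.HashStore()
  yield (store, 0, 1)


dist.register_rendezvous_handler('demo', demo_rendezvous_handler)
# After registration, dist.init_process_group(..., init_method='demo://') would
# route through demo_rendezvous_handler, just as 'xla://' routes through
# pjrt_rendezvous_handler once torch_xla.distributed.xla_backend registers it.
```

The `xla_backend.py` change in the next section performs this registration for the `xla` scheme, and the final diff below keeps the legacy `pjrt` scheme pointing at the same handler for backward compatibility.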

torch_xla/distributed/xla_backend.py

Lines changed: 3 additions & 0 deletions
@@ -1,6 +1,7 @@
 import torch
 import torch.distributed as dist
 import torch_xla.core.xla_model as xm
+from torch_xla._internal import rendezvous
 import logging
 import os
 from torch._C._distributed_c10d import ProcessGroup
@@ -16,6 +17,8 @@ def _register_xla_backend():
 
 _register_xla_backend()
 
+dist.register_rendezvous_handler('xla', rendezvous.pjrt_rendezvous_handler)
+
 
 def _ret_work(ret):
   fut = torch.futures.Future()
Lines changed: 2 additions & 44 deletions
@@ -1,51 +1,9 @@
-import datetime
 import logging
-import threading
 
 import torch.distributed as dist
 from torch_xla.distributed import xla_backend
-from torch_xla import runtime as xr
-from torch_xla._internal import pjrt
+from torch_xla._internal import rendezvous
 from torch_xla._internal import tpu
-import torch_xla.utils.utils as xu
-
-_store = None
-_store_lock = threading.Lock()
-
-
-def _pjrt_rendezvous_handler(url: str,
-                             timeout: datetime.timedelta = ...,
-                             **kwargs):
-  # Assume `xmp.spawn` has not been called when using torchrun
-  if dist.is_torchelastic_launched():
-    local_world_size = xu.getenv_as('LOCAL_WORLD_SIZE', int)
-    local_rank = xu.getenv_as('LOCAL_RANK', int)
-    pjrt.initialize_multiprocess(local_rank, local_world_size)
-
-  master_ip = xu.getenv_as('MASTER_ADDR', str)
-  if not master_ip:
-    master_ip = tpu.discover_master_worker_ip() if xr.device_type(
-    ) == 'TPU' else 'localhost'
-
-  master_port = xu.getenv_as('MASTER_PORT', int, 12355)
-  world_size = xr.world_size()
-  with _store_lock:
-    global _store
-    if not _store:
-      if xu.getenv_as('TORCHELASTIC_USE_AGENT_STORE', str) == 'True':
-        attempt = xu.getenv_as('TORCHELASTIC_RESTART_COUNT', int, defval=0)
-        tcp_store = dist.TCPStore(
-            master_ip, master_port, xr.process_count(), is_master=False)
-        _store = dist.PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
-      else:
-        _store = dist.TCPStore(
-            master_ip,
-            master_port,
-            xr.process_count(),
-            is_master=xr.process_index() == 0)
-
-  yield (_store, xr.global_ordinal(), world_size)
-
 
 if tpu.num_available_chips() > 0 and tpu.version() <= 3:
   from torch.testing._internal.distributed import multi_threaded_pg
@@ -54,4 +12,4 @@ def _pjrt_rendezvous_handler(url: str,
                   'and does not support torchrun.')
   multi_threaded_pg._install_threaded_pg()
 
-dist.register_rendezvous_handler('pjrt', _pjrt_rendezvous_handler)
+dist.register_rendezvous_handler('pjrt', rendezvous.pjrt_rendezvous_handler)
