From 25b2bbc905e299703013684d65727486000af39b Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 29 Jan 2024 15:21:26 -0800 Subject: [PATCH 1/5] [train] remove DEFAULT_NCCL_SOCKET_IFNAME Signed-off-by: Matthew Deng --- python/ray/air/util/torch_dist.py | 3 --- python/ray/train/constants.py | 3 --- python/ray/train/tests/test_gpu.py | 3 +-- python/ray/train/torch/config.py | 18 ------------------ .../air_benchmarks/workloads/benchmark_util.py | 6 +----- .../workloads/torch_benchmark.py | 18 ------------------ .../workloads/tune_torch_benchmark.py | 1 - release/release_tests.yaml | 2 -- release/xgboost_tests/app_config_gpu.yaml | 6 ------ 9 files changed, 2 insertions(+), 58 deletions(-) diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py index de55efdeef7a2..47ac496460c4f 100644 --- a/python/ray/air/util/torch_dist.py +++ b/python/ray/air/util/torch_dist.py @@ -16,7 +16,6 @@ import ray from ray.actor import ActorHandle from ray.train._internal.utils import get_address_and_port -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.air._internal.torch_utils import get_device @@ -69,8 +68,6 @@ def _init_torch_distributed( # All workers on a same node should share the same set of # visible GPUs. Otherwise they can't talk among themselves. os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids) - if "NCCL_SOCKET_IFNAME" not in os.environ: - os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME init_process_group_kwargs.update( dict( diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index e89354f490474..efb36bd57eab1 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str: RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE, } -# Blacklist virtualized networking. 
-DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth" - # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata" diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 6fee72b088bde..e5353b83d0d64 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -17,7 +17,6 @@ from ray.exceptions import RayTaskError from ray.train import ScalingConfig from ray.train._internal.worker_group import WorkerGroup -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.train.examples.pytorch.torch_linear_example import LinearDataset from ray.train.torch.config import TorchConfig, _TorchBackend from ray.train.torch.torch_trainer import TorchTrainer @@ -328,7 +327,7 @@ def set_env_var(): worker_group.execute(set_env_var) def assert_env_var_set(): - value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME + value = nccl_socket_ifname if nccl_socket_ifname else "TODO" assert os.environ["NCCL_SOCKET_IFNAME"] == value torch_backend = _TorchBackend() diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py index c65b66bd05346..79f0f60da2e94 100644 --- a/python/ray/train/torch/config.py +++ b/python/ray/train/torch/config.py @@ -11,7 +11,6 @@ from ray.train._internal.utils import get_address_and_port from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.util import PublicAPI logger = logging.getLogger(__name__) @@ -45,20 +44,6 @@ def backend_cls(self): return _TorchBackend -def _set_nccl_network_interface(): - """Set the appropriate NCCL network interface to use.""" - - if "NCCL_SOCKET_IFNAME" not in os.environ: - logger.debug( - f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} " - f"to prioritize ethernet connection. 
To override this behavior, set the " - f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime " - "environment: " - "`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`" - ) - os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME - - def _setup_torch_process_group( backend: str, world_rank: int, @@ -159,9 +144,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig): else: backend = backend_config.backend - if backend == "nccl": - worker_group.execute(_set_nccl_network_interface) - master_addr, master_port = worker_group.execute_single( 0, get_address_and_port ) diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py index 7bee5d682aff1..5fbaaf8c285a3 100644 --- a/release/air_tests/air_benchmarks/workloads/benchmark_util.py +++ b/release/air_tests/air_benchmarks/workloads/benchmark_util.py @@ -7,7 +7,7 @@ from ray.air.util.node import _force_on_node import ray -from typing import Any, List, Dict, Union, Callable +from typing import List, Dict, Union, Callable def schedule_remote_fn_on_all_nodes( @@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs): def create_actors_with_options( num_actors: int, resources: Dict[str, Union[float, int]], - runtime_env: Dict[str, Any] = None, ) -> List[ray.actor.ActorHandle]: num_cpus = resources.pop("CPU", 1) num_gpus = resources.pop("GPU", 0) options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources} - if runtime_env: - options["runtime_env"] = runtime_env - return [CommandRunner.options(**options).remote() for _ in range(num_actors)] diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py index 0b7d8c8a6ded3..0ce327cb6e5dd 100644 --- a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py @@ -18,17 +18,6 @@ VANILLA_RESULT_JSON = "/tmp/vanilla_out.json" -def find_network_interface(): - for iface in os.listdir("/sys/class/net"): - if iface.startswith("ens"): - network_interface = iface - break - else: - network_interface = "^lo,docker" - - return network_interface - - # Define model class NeuralNetwork(nn.Module): def __init__(self): @@ -311,19 +300,12 @@ def train_torch_vanilla( num_epochs = config["epochs"] - try: - nccl_network_interface = find_network_interface() - runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}} - except Exception: - runtime_env = {} - actors = create_actors_with_options( num_actors=num_workers, resources={ "CPU": cpus_per_worker, "GPU": int(use_gpu), }, - runtime_env=runtime_env, ) run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None)) diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py index 4b60b38b137cd..0d7b594d14976 100644 --- a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py @@ -117,7 +117,6 @@ def main( ray.init( runtime_env={ "working_dir": os.path.dirname(__file__), - "env_vars": {"NCCL_SOCKET_IFNAME": "ens"}, } ) prepare_mnist() diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 21ac898844143..2fdaba237c9c0 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1128,8 +1128,6 @@ cluster: byod: type: gpu - 
runtime_env: - - NCCL_SOCKET_IFNAME=ens post_build_script: byod_xgboost_test.sh cluster_compute: tpl_gpu_small_aws.yaml diff --git a/release/xgboost_tests/app_config_gpu.yaml b/release/xgboost_tests/app_config_gpu.yaml index 55db2bfa14f2d..193a05e5c1df2 100755 --- a/release/xgboost_tests/app_config_gpu.yaml +++ b/release/xgboost_tests/app_config_gpu.yaml @@ -1,10 +1,4 @@ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }} -env_vars: - # Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on - # anyscale_default_cloud. - # See https://github.com/pytorch/pytorch/issues/68893 for more details. - NCCL_SOCKET_IFNAME: ens - debian_packages: - curl From 78e04a10667205dd1f0f9eee85a8c7d565c3b2df Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 29 Jan 2024 21:28:59 -0800 Subject: [PATCH 2/5] remove test Signed-off-by: Matthew Deng --- python/ray/train/tests/test_gpu.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index e5353b83d0d64..34d1a8482a453 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -315,27 +315,6 @@ def train_func(): assert result1.metrics["loss"] == result2.metrics["loss"] -@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"]) -def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname): - worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1) - - if nccl_socket_ifname: - - def set_env_var(): - os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname - - worker_group.execute(set_env_var) - - def assert_env_var_set(): - value = nccl_socket_ifname if nccl_socket_ifname else "TODO" - assert os.environ["NCCL_SOCKET_IFNAME"] == value - - torch_backend = _TorchBackend() - torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl")) - - worker_group.execute(assert_env_var_set) - - def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus): """Tests that TorchTrainer raises exception on NCCL timeouts.""" From 01325cd963d06ece62e1cef999b1a4bb62a23ecb Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Tue, 30 Jan 2024 10:11:19 -0800 Subject: [PATCH 3/5] lint Signed-off-by: Matthew Deng --- python/ray/train/tests/test_gpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 34d1a8482a453..64aa7f988509b 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -16,9 +16,8 @@ from ray import train from ray.exceptions import RayTaskError from ray.train import ScalingConfig -from ray.train._internal.worker_group import WorkerGroup from ray.train.examples.pytorch.torch_linear_example import LinearDataset -from ray.train.torch.config import TorchConfig, _TorchBackend +from ray.train.torch.config import TorchConfig from ray.train.torch.torch_trainer import TorchTrainer from ray.train.trainer import TrainingFailedError From 031a29f923651ddcc6fda05f5a20f9ae76874995 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Thu, 1 Feb 2024 13:44:24 -0800 Subject: [PATCH 4/5] docs Signed-off-by: Matthew Deng --- doc/source/train/user-guides/using-gpus.rst | 83 +++++++++++++-------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst index 537c035e85d6e..b34666af269b3 100644 --- a/doc/source/train/user-guides/using-gpus.rst +++ b/doc/source/train/user-guides/using-gpus.rst 
@@ -72,6 +72,58 @@ You can get the associated devices with :meth:`ray.train.torch.get_device`. trainer.fit() +(PyTorch) Setting the communication backend +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + This is an advanced setting. In most cases, you don't have to change this setting. + +You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a +:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`. + +See the `PyTorch API reference `__ +for valid options. + +.. testcode:: + :hide: + + num_training_workers = 1 + +.. testcode:: + + from ray.train.torch import TorchConfig, TorchTrainer + + trainer = TorchTrainer( + train_func, + scaling_config=ScalingConfig( + num_workers=num_training_workers, + use_gpu=True, + ), + torch_config=TorchConfig(backend="gloo"), + ) + +(NCCL) Setting the communication network interface +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using NCCL for distributed training, you can configure the network interface cards +that are used for communicating between GPUs by setting the +`NCCL_SOCKET_IFNAME `__ +environment variable. + +To ensure that the environment variable is set for all training workers, you can pass it +in a :ref:`Ray runtime environment `: + +.. testcode:: + :skipif: True + + import ray + + runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}} + ray.init(runtime_env=runtime_env) + + trainer = TorchTrainer(...) + Setting the resources per worker -------------------------------- If you want to allocate more than one CPU or GPU per training worker, or if you @@ -113,37 +165,6 @@ will be assigned the same CUDA device. ) -Setting the communication backend (PyTorch) -------------------------------------------- - -.. note:: - - This is an advanced setting. In most cases, you don't have to change this setting. - -You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a -:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`. - -See the `PyTorch API reference `__ -for valid options. - -.. testcode:: - :hide: - - num_training_workers = 1 - -.. testcode:: - - from ray.train.torch import TorchConfig, TorchTrainer - - trainer = TorchTrainer( - train_func, - scaling_config=ScalingConfig( - num_workers=num_training_workers, - use_gpu=True, - ), - torch_config=TorchConfig(backend="gloo"), - ) - .. _train_trainer_resources: From a0d44d8d76e8375ff4d09cd34173f9f707fe0aed Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Fri, 2 Feb 2024 11:10:56 -0800 Subject: [PATCH 5/5] backend Signed-off-by: Matthew Deng --- doc/source/train/user-guides/using-gpus.rst | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst index 9777495e8492a..24526d552f6f8 100644 --- a/doc/source/train/user-guides/using-gpus.rst +++ b/doc/source/train/user-guides/using-gpus.rst @@ -107,15 +107,11 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices (PyTorch) Setting the communication backend ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. note:: - - This is an advanced setting. In most cases, you don't have to change this setting. +PyTorch Distributed supports multiple `backends `__ +for communicating tensors across workers. By default Ray Train will use NCCL when ``use_gpu=True`` and Gloo otherwise. -You can set the PyTorch distributed communication backend (e.g. 
GLOO or NCCL) by passing a
-:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.
-
-See the `PyTorch API reference `__
-for valid options.
+PyTorch Distributed supports multiple `backends `__
+for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.
 
+If you want to explicitly override this setting, you can configure a :class:`~ray.train.torch.TorchConfig`
+and pass it into the :class:`~ray.train.torch.TorchTrainer`.
 
 .. testcode::
     :hide:
@@ -130,7 +126,7 @@ for valid options.
         train_func,
         scaling_config=ScalingConfig(
             num_workers=num_training_workers,
-            use_gpu=True,
+            use_gpu=True,  # Defaults to NCCL
         ),
         torch_config=TorchConfig(backend="gloo"),
     )
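
With the ``^lo,docker,veth`` default removed, users who relied on Ray Train excluding virtual
interfaces now have to set ``NCCL_SOCKET_IFNAME`` themselves, as the documentation section added
above describes. The following is a minimal sketch of doing that through a Ray runtime environment;
it is not part of the patches, and the ``train_func`` body, worker count, and interface list are
placeholders::

    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer

    # Propagate the variable to every training worker before the trainer starts.
    # "^lo,docker,veth" reproduces the exclusion list that this patch series removes;
    # a concrete name such as "ens5" pins a specific NIC instead.
    ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "^lo,docker,veth"}})

    def train_func():
        ...  # placeholder training loop

    trainer = TorchTrainer(
        train_func,
        scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
    )
    trainer.fit()

Running with ``NCCL_DEBUG=INFO`` makes NCCL log which network interface it ends up selecting,
which is a straightforward way to confirm that dropping the default does not change interface
selection on a given cluster.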