From 25b2bbc905e299703013684d65727486000af39b Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 29 Jan 2024 15:21:26 -0800 Subject: [PATCH 1/5] [train] remove DEFAULT_NCCL_SOCKET_IFNAME Signed-off-by: Matthew Deng --- python/ray/air/util/torch_dist.py | 3 --- python/ray/train/constants.py | 3 --- python/ray/train/tests/test_gpu.py | 3 +-- python/ray/train/torch/config.py | 18 ------------------ .../air_benchmarks/workloads/benchmark_util.py | 6 +----- .../workloads/torch_benchmark.py | 18 ------------------ .../workloads/tune_torch_benchmark.py | 1 - release/release_tests.yaml | 2 -- release/xgboost_tests/app_config_gpu.yaml | 6 ------ 9 files changed, 2 insertions(+), 58 deletions(-) diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py index de55efdeef7a2..47ac496460c4f 100644 --- a/python/ray/air/util/torch_dist.py +++ b/python/ray/air/util/torch_dist.py @@ -16,7 +16,6 @@ import ray from ray.actor import ActorHandle from ray.train._internal.utils import get_address_and_port -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.air._internal.torch_utils import get_device @@ -69,8 +68,6 @@ def _init_torch_distributed( # All workers on a same node should share the same set of # visible GPUs. Otherwise they can't talk among themselves. os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids) - if "NCCL_SOCKET_IFNAME" not in os.environ: - os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME init_process_group_kwargs.update( dict( diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index e89354f490474..efb36bd57eab1 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -97,9 +97,6 @@ def _get_defaults_results_dir() -> str: RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE, } -# Blacklist virtualized networking. 
-DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth" - # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata" diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 6fee72b088bde..e5353b83d0d64 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -17,7 +17,6 @@ from ray.exceptions import RayTaskError from ray.train import ScalingConfig from ray.train._internal.worker_group import WorkerGroup -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.train.examples.pytorch.torch_linear_example import LinearDataset from ray.train.torch.config import TorchConfig, _TorchBackend from ray.train.torch.torch_trainer import TorchTrainer @@ -328,7 +327,7 @@ def set_env_var(): worker_group.execute(set_env_var) def assert_env_var_set(): - value = nccl_socket_ifname if nccl_socket_ifname else DEFAULT_NCCL_SOCKET_IFNAME + value = nccl_socket_ifname if nccl_socket_ifname else "TODO" assert os.environ["NCCL_SOCKET_IFNAME"] == value torch_backend = _TorchBackend() diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py index c65b66bd05346..79f0f60da2e94 100644 --- a/python/ray/train/torch/config.py +++ b/python/ray/train/torch/config.py @@ -11,7 +11,6 @@ from ray.train._internal.utils import get_address_and_port from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig -from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME from ray.util import PublicAPI logger = logging.getLogger(__name__) @@ -45,20 +44,6 @@ def backend_cls(self): return _TorchBackend -def _set_nccl_network_interface(): - """Set the appropriate NCCL network interface to use.""" - - if "NCCL_SOCKET_IFNAME" not in os.environ: - logger.debug( - f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} " - f"to prioritize ethernet connection. 
To override this behavior, set the " - f"`NCCL_SOCKET_IFNAME` environment variable in your Ray runtime " - "environment: " - "`ray.init(runtime_env={{'env_vars': {'NCCL_SOCKET_IFNAME': 'ens5'}}}`" - ) - os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME - - def _setup_torch_process_group( backend: str, world_rank: int, @@ -159,9 +144,6 @@ def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig): else: backend = backend_config.backend - if backend == "nccl": - worker_group.execute(_set_nccl_network_interface) - master_addr, master_port = worker_group.execute_single( 0, get_address_and_port ) diff --git a/release/air_tests/air_benchmarks/workloads/benchmark_util.py b/release/air_tests/air_benchmarks/workloads/benchmark_util.py index 7bee5d682aff1..5fbaaf8c285a3 100644 --- a/release/air_tests/air_benchmarks/workloads/benchmark_util.py +++ b/release/air_tests/air_benchmarks/workloads/benchmark_util.py @@ -7,7 +7,7 @@ from ray.air.util.node import _force_on_node import ray -from typing import Any, List, Dict, Union, Callable +from typing import List, Dict, Union, Callable def schedule_remote_fn_on_all_nodes( @@ -77,16 +77,12 @@ def run_fn(self, fn: Callable, *args, **kwargs): def create_actors_with_options( num_actors: int, resources: Dict[str, Union[float, int]], - runtime_env: Dict[str, Any] = None, ) -> List[ray.actor.ActorHandle]: num_cpus = resources.pop("CPU", 1) num_gpus = resources.pop("GPU", 0) options = {"num_cpus": num_cpus, "num_gpus": num_gpus, "resources": resources} - if runtime_env: - options["runtime_env"] = runtime_env - return [CommandRunner.options(**options).remote() for _ in range(num_actors)] diff --git a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py index 0b7d8c8a6ded3..0ce327cb6e5dd 100644 --- a/release/air_tests/air_benchmarks/workloads/torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/torch_benchmark.py @@ -18,17 +18,6 @@ VANILLA_RESULT_JSON = "/tmp/vanilla_out.json" -def find_network_interface(): - for iface in os.listdir("/sys/class/net"): - if iface.startswith("ens"): - network_interface = iface - break - else: - network_interface = "^lo,docker" - - return network_interface - - # Define model class NeuralNetwork(nn.Module): def __init__(self): @@ -311,19 +300,12 @@ def train_torch_vanilla( num_epochs = config["epochs"] - try: - nccl_network_interface = find_network_interface() - runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": nccl_network_interface}} - except Exception: - runtime_env = {} - actors = create_actors_with_options( num_actors=num_workers, resources={ "CPU": cpus_per_worker, "GPU": int(use_gpu), }, - runtime_env=runtime_env, ) run_fn_on_actors(actors=actors, fn=lambda: os.environ.pop("OMP_NUM_THREADS", None)) diff --git a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py index 4b60b38b137cd..0d7b594d14976 100644 --- a/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/tune_torch_benchmark.py @@ -117,7 +117,6 @@ def main( ray.init( runtime_env={ "working_dir": os.path.dirname(__file__), - "env_vars": {"NCCL_SOCKET_IFNAME": "ens"}, } ) prepare_mnist() diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 21ac898844143..2fdaba237c9c0 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1128,8 +1128,6 @@ cluster: byod: type: gpu - 
runtime_env: - - NCCL_SOCKET_IFNAME=ens post_build_script: byod_xgboost_test.sh cluster_compute: tpl_gpu_small_aws.yaml diff --git a/release/xgboost_tests/app_config_gpu.yaml b/release/xgboost_tests/app_config_gpu.yaml index 55db2bfa14f2d..193a05e5c1df2 100755 --- a/release/xgboost_tests/app_config_gpu.yaml +++ b/release/xgboost_tests/app_config_gpu.yaml @@ -1,10 +1,4 @@ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }} -env_vars: - # Manually set NCCL_SOCKET_IFNAME to "ens" so NCCL training works on - # anyscale_default_cloud. - # See https://github.com/pytorch/pytorch/issues/68893 for more details. - NCCL_SOCKET_IFNAME: ens - debian_packages: - curl From 78e04a10667205dd1f0f9eee85a8c7d565c3b2df Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 29 Jan 2024 21:28:59 -0800 Subject: [PATCH 2/5] remove test Signed-off-by: Matthew Deng --- python/ray/train/tests/test_gpu.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index e5353b83d0d64..34d1a8482a453 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -315,27 +315,6 @@ def train_func(): assert result1.metrics["loss"] == result2.metrics["loss"] -@pytest.mark.parametrize("nccl_socket_ifname", ["", "ens3"]) -def test_torch_backend_nccl_socket_ifname(ray_start_4_cpus_2_gpus, nccl_socket_ifname): - worker_group = WorkerGroup(num_workers=2, num_gpus_per_worker=1) - - if nccl_socket_ifname: - - def set_env_var(): - os.environ["NCCL_SOCKET_IFNAME"] = nccl_socket_ifname - - worker_group.execute(set_env_var) - - def assert_env_var_set(): - value = nccl_socket_ifname if nccl_socket_ifname else "TODO" - assert os.environ["NCCL_SOCKET_IFNAME"] == value - - torch_backend = _TorchBackend() - torch_backend.on_start(worker_group, backend_config=TorchConfig(backend="nccl")) - - worker_group.execute(assert_env_var_set) - - def test_torch_fail_on_nccl_timeout(ray_start_4_cpus_2_gpus): """Tests that TorchTrainer raises exception on NCCL timeouts.""" From 01325cd963d06ece62e1cef999b1a4bb62a23ecb Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Tue, 30 Jan 2024 10:11:19 -0800 Subject: [PATCH 3/5] lint Signed-off-by: Matthew Deng --- python/ray/train/tests/test_gpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 34d1a8482a453..64aa7f988509b 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -16,9 +16,8 @@ from ray import train from ray.exceptions import RayTaskError from ray.train import ScalingConfig -from ray.train._internal.worker_group import WorkerGroup from ray.train.examples.pytorch.torch_linear_example import LinearDataset -from ray.train.torch.config import TorchConfig, _TorchBackend +from ray.train.torch.config import TorchConfig from ray.train.torch.torch_trainer import TorchTrainer from ray.train.trainer import TrainingFailedError From 031a29f923651ddcc6fda05f5a20f9ae76874995 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Thu, 1 Feb 2024 13:44:24 -0800 Subject: [PATCH 4/5] docs Signed-off-by: Matthew Deng --- doc/source/train/user-guides/using-gpus.rst | 83 +++++++++++++-------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst index 537c035e85d6e..b34666af269b3 100644 --- a/doc/source/train/user-guides/using-gpus.rst +++ b/doc/source/train/user-guides/using-gpus.rst 
@@ -72,6 +72,58 @@ You can get the associated devices with :meth:`ray.train.torch.get_device`. trainer.fit() +(PyTorch) Setting the communication backend +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + This is an advanced setting. In most cases, you don't have to change this setting. + +You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a +:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`. + +See the `PyTorch API reference `__ +for valid options. + +.. testcode:: + :hide: + + num_training_workers = 1 + +.. testcode:: + + from ray.train.torch import TorchConfig, TorchTrainer + + trainer = TorchTrainer( + train_func, + scaling_config=ScalingConfig( + num_workers=num_training_workers, + use_gpu=True, + ), + torch_config=TorchConfig(backend="gloo"), + ) + +(NCCL) Setting the communication network interface +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using NCCL for distributed training, you can configure the network interface cards +that are used for communicating between GPUs by setting the +`NCCL_SOCKET_IFNAME `__ +environment variable. + +To ensure that the environment variable is set for all training workers, you can pass it +in a :ref:`Ray runtime environment `: + +.. testcode:: + :skipif: True + + import ray + + runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens5"}} + ray.init(runtime_env=runtime_env) + + trainer = TorchTrainer(...) + Setting the resources per worker -------------------------------- If you want to allocate more than one CPU or GPU per training worker, or if you @@ -113,37 +165,6 @@ will be assigned the same CUDA device. ) -Setting the communication backend (PyTorch) -------------------------------------------- - -.. note:: - - This is an advanced setting. In most cases, you don't have to change this setting. - -You can set the PyTorch distributed communication backend (e.g. GLOO or NCCL) by passing a -:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`. - -See the `PyTorch API reference `__ -for valid options. - -.. testcode:: - :hide: - - num_training_workers = 1 - -.. testcode:: - - from ray.train.torch import TorchConfig, TorchTrainer - - trainer = TorchTrainer( - train_func, - scaling_config=ScalingConfig( - num_workers=num_training_workers, - use_gpu=True, - ), - torch_config=TorchConfig(backend="gloo"), - ) - .. _train_trainer_resources: From a0d44d8d76e8375ff4d09cd34173f9f707fe0aed Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Fri, 2 Feb 2024 11:10:56 -0800 Subject: [PATCH 5/5] backend Signed-off-by: Matthew Deng --- doc/source/train/user-guides/using-gpus.rst | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/doc/source/train/user-guides/using-gpus.rst b/doc/source/train/user-guides/using-gpus.rst index 9777495e8492a..24526d552f6f8 100644 --- a/doc/source/train/user-guides/using-gpus.rst +++ b/doc/source/train/user-guides/using-gpus.rst @@ -107,15 +107,11 @@ You can get a list of associated devices with :meth:`ray.train.torch.get_devices (PyTorch) Setting the communication backend ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. note:: - - This is an advanced setting. In most cases, you don't have to change this setting. +PyTorch Distributed supports multiple `backends `__ +for communicating tensors across workers. By default Ray Train will use NCCL when ``use_gpu=True`` and Gloo otherwise. -You can set the PyTorch distributed communication backend (e.g. 
GLOO or NCCL) by passing a
-:class:`~ray.train.torch.TorchConfig` to the :class:`~ray.train.torch.TorchTrainer`.
-
-See the `PyTorch API reference `__
-for valid options.
+PyTorch Distributed supports multiple `backends `__
+for communicating tensors across workers. By default, Ray Train uses NCCL when ``use_gpu=True`` and Gloo otherwise.
 
+If you want to explicitly override this setting, you can configure a :class:`~ray.train.torch.TorchConfig`
+and pass it into the :class:`~ray.train.torch.TorchTrainer`.
 
 .. testcode::
     :hide:
@@ -130,7 +126,7 @@ for valid options.
         train_func,
         scaling_config=ScalingConfig(
             num_workers=num_training_workers,
-            use_gpu=True,
+            use_gpu=True,  # Defaults to NCCL
         ),
         torch_config=TorchConfig(backend="gloo"),
     )
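
With the ``^lo,docker,veth`` default removed, users who relied on Ray Train excluding virtual
interfaces now have to set ``NCCL_SOCKET_IFNAME`` themselves, as the documentation section added
above describes. The following is a minimal sketch of doing that through a Ray runtime environment;
it is not part of the patches, and the ``train_func`` body, worker count, and interface list are
placeholders::

    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer

    # Propagate the variable to every training worker before the trainer starts.
    # "^lo,docker,veth" reproduces the exclusion list that this patch series removes;
    # a concrete name such as "ens5" pins a specific NIC instead.
    ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "^lo,docker,veth"}})

    def train_func():
        ...  # placeholder training loop

    trainer = TorchTrainer(
        train_func,
        scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
    )
    trainer.fit()

Running with ``NCCL_DEBUG=INFO`` makes NCCL log which network interface it ends up selecting,
which is a straightforward way to confirm that dropping the default does not change interface
selection on a given cluster.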