
[Train] Split overloaded ray.train.torch.get_device into another get_devices API for multi-GPU worker setup #42314

Merged
merged 28 commits on Jan 30, 2024
Changes from 18 commits
11 changes: 5 additions & 6 deletions python/ray/air/_internal/torch_utils.py
@@ -10,11 +10,11 @@
 from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed


-def get_device() -> Union[torch.device, List[torch.device]]:
+def get_devices() -> List[torch.device]:
     """Gets the correct torch device configured for this process.

-    Returns a list of devices if more than 1 GPU per worker
-    is requested.
+    Returns a list of torch CUDA devices allocated for the current worker.
+    If no GPUs are assigned, then it returns a list with a single CPU device.

     Assumes that `CUDA_VISIBLE_DEVICES` is set and is a
     superset of the `ray.get_gpu_ids()`.
@@ -55,11 +55,10 @@ def get_device() -> Union[torch.device, List[torch.device]]:
             device_ids.append(0)

         devices = [torch.device(f"cuda:{device_id}") for device_id in device_ids]
-        device = devices[0] if len(devices) == 1 else devices
     else:
-        device = torch.device("cpu")
+        devices = [torch.device("cpu")]

-    return device
+    return devices


 def convert_pandas_to_torch_tensor(
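With the internal helper now always returning a list, call sites no longer need the isinstance check that the old Union return type forced on them. A minimal sketch, not part of the PR, assuming it runs inside a Ray worker where ray.get_gpu_ids() is meaningful:

import torch

from ray.air._internal.torch_utils import get_devices


def move_model_to_first_device(model: torch.nn.Module) -> torch.nn.Module:
    # get_devices() now always returns List[torch.device], even on CPU-only
    # workers, so no isinstance(..., list) branching is needed.
    devices = get_devices()
    return model.to(devices[0])
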
6 changes: 2 additions & 4 deletions python/ray/air/util/torch_dist.py
@@ -17,7 +17,7 @@
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
 from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME
-from ray.air._internal.torch_utils import get_device
+from ray.air._internal.torch_utils import get_devices


 class TorchDistributedWorker(ABC):
@@ -183,9 +183,7 @@ def _shutdown_torch_distributed():
         return

     # Clean up cuda memory.
-    devices = get_device()
-    if not isinstance(devices, list):
-        devices = [devices]
+    devices = get_devices()
     for device in devices:
         with torch.cuda.device(device):
             torch.cuda.empty_cache()
2 changes: 1 addition & 1 deletion python/ray/data/iterator.py
@@ -335,8 +335,8 @@ def iter_torch_batches(

         from ray.air._internal.torch_utils import (
             convert_ndarray_batch_to_torch_tensor_batch,
-            get_device,
         )
+        from ray.train.torch import get_device

         if collate_fn is not None and (dtypes is not None or device != "auto"):
            raise ValueError(
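The iterator now resolves its device through the public ray.train.torch.get_device() instead of the internal helper. A hedged usage sketch; the dataset contents are illustrative, and on a CPU-only machine the batches simply stay on CPU:

import ray

ds = ray.data.range(8)

# With device left at its "auto" default, batches are moved to the device
# returned by ray.train.torch.get_device() when running inside a Train
# worker, and stay on CPU otherwise.
for batch in ds.iter_torch_batches(batch_size=4):
    print({name: tensor.device for name, tensor in batch.items()})
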
5 changes: 3 additions & 2 deletions python/ray/data/tests/test_iterator.py
@@ -158,9 +158,10 @@ def collate_fn(batch: Dict[str, np.ndarray]):

     # Test that we don't automatically set device if collate_fn is specified.
     with patch(
-        "ray.air._internal.torch_utils.get_device", lambda: torch.device("cuda")
+        "ray.air._internal.torch_utils.get_devices", lambda: [torch.device("cuda")]
     ):
-        assert ray.air._internal.torch_utils.get_device().type == "cuda"
+        devices = ray.air._internal.torch_utils.get_devices()
+        assert devices[0].type == "cuda"

     it.iter_batches = MagicMock()
     for batch in it.iter_torch_batches(collate_fn=collate_fn):
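For reference, the mocking pattern used above works outside the test suite as well: unittest.mock.patch with a plain callable swaps out the module attribute for the duration of the with block. A standalone sketch; the CPU device here is illustrative:

import torch
from unittest.mock import patch

import ray.air._internal.torch_utils as torch_utils

with patch("ray.air._internal.torch_utils.get_devices", lambda: [torch.device("cpu")]):
    # Inside the block, the real implementation is replaced by the lambda.
    assert torch_utils.get_devices() == [torch.device("cpu")]
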
15 changes: 3 additions & 12 deletions python/ray/train/lightning/_lightning_utils.py
@@ -51,15 +51,6 @@ def import_lightning(): # noqa: F402
 LIGHTNING_REPORT_STAGE_KEY = "_report_on"


-def get_worker_root_device():
-    """Get the first torch device of the current worker if there are multiple."""
-    devices = ray.train.torch.get_device()
-    if isinstance(devices, list):
-        return devices[0]
-    else:
-        return devices
-
-
 @PublicAPI(stability="beta")
 class RayDDPStrategy(pl.strategies.DDPStrategy):
     """Subclass of DDPStrategy to ensure compatibility with Ray orchestration.
@@ -77,7 +68,7 @@ def __init__(self, *args, **kwargs):

     @property
     def root_device(self) -> torch.device:
-        return get_worker_root_device()
+        return ray.train.torch.get_device()

     @property
     def distributed_sampler_kwargs(self) -> Dict[str, Any]:
@@ -101,7 +92,7 @@ def __init__(self, *args, **kwargs):

     @property
     def root_device(self) -> torch.device:
-        return get_worker_root_device()
+        return ray.train.torch.get_device()

     @property
     def distributed_sampler_kwargs(self) -> Dict[str, Any]:
@@ -144,7 +135,7 @@ def __init__(self, *args, **kwargs):

     @property
     def root_device(self) -> torch.device:
-        return get_worker_root_device()
+        return ray.train.torch.get_device()

     @property
     def distributed_sampler_kwargs(self) -> Dict[str, Any]:
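Since root_device now just forwards to the public API, the same pattern works for any custom strategy. A hedged sketch: the subclass below is illustrative and not part of the PR, and it imports pytorch_lightning directly rather than through the module's import helper:

import pytorch_lightning as pl
import torch

import ray.train.torch


class MyRayDDPStrategy(pl.strategies.DDPStrategy):
    @property
    def root_device(self) -> torch.device:
        # get_device() always returns a single torch.device now; with multiple
        # GPUs per worker it picks the lowest-indexed one.
        return ray.train.torch.get_device()
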
20 changes: 6 additions & 14 deletions python/ray/train/tests/test_gpu.py
@@ -71,11 +71,7 @@ def train_fn():
         visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
         assert visible_devices == "1,2"

-        devices = (
-            sorted([device.index for device in train.torch.get_device()])
-            if num_gpus_per_worker > 1
-            else train.torch.get_device().index
-        )
+        devices = sorted([device.index for device in train.torch.get_devices()])
         write_rank_data(tmp_path, devices)

     trainer = TorchTrainer(
@@ -92,9 +88,9 @@
     devices = list(rank_data.values())

     if num_gpus_per_worker == 0.5:
-        assert sorted(devices) == [0, 0, 1, 1]
+        assert sorted(devices) == [[0], [0], [1], [1]]
     elif num_gpus_per_worker == 1:
-        assert sorted(devices) == [0, 1]
+        assert sorted(devices) == [[0], [1]]
     elif num_gpus_per_worker == 2:
         assert sorted(devices[0]) == [0, 1]
     else:
@@ -108,11 +104,7 @@
 def test_torch_get_device_dist(ray_2_node_2_gpu, num_gpus_per_worker, tmp_path):
     @patch("torch.cuda.is_available", lambda: True)
     def train_fn():
-        devices = (
-            sorted([device.index for device in train.torch.get_device()])
-            if num_gpus_per_worker > 1
-            else train.torch.get_device().index
-        )
+        devices = sorted([device.index for device in train.torch.get_devices()])
         write_rank_data(tmp_path, devices)

     trainer = TorchTrainer(
@@ -138,12 +130,12 @@ def train_fn():
         # 4 workers on node 1, 4 workers on node 2
         # `ray.get_gpu_ids()` returns [0], [0], [1], [1] on node 1
         # and [0], [0], [1], [1] on node 2
-        assert sorted(devices) == [0, 0, 0, 0, 1, 1, 1, 1]
+        assert sorted(devices) == [[0], [0], [0], [0], [1], [1], [1], [1]]
     elif num_gpus_per_worker == 1:
         # worker gpu topology:
         # 2 workers on node 1, 2 workers on node 2
         # `ray.get_gpu_ids()` returns [0], [1] on node 1 and [0], [1] on node 2
-        assert sorted(devices) == [0, 0, 1, 1]
+        assert sorted(devices) == [[0], [0], [1], [1]]
     elif num_gpus_per_worker == 2:
         # worker gpu topology:
         # 1 workers on node 1, 1 workers on node 2
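The reshaped assertions reflect that every rank now records a list of device indices, even with a single or fractional GPU. Illustrative values for the 0.5-GPU-per-worker case, not an excerpt from the test:

# Four workers sharing two GPUs: each rank still sees exactly one device,
# but it is now reported as a one-element list.
rank_to_devices = {0: [0], 1: [0], 2: [1], 3: [1]}
assert sorted(rank_to_devices.values()) == [[0], [0], [1], [1]]
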
7 changes: 2 additions & 5 deletions python/ray/train/tests/test_torch_trainer.py
@@ -215,11 +215,8 @@ def train_fn():
         # the other is taken by the other sample) so device index should be 0.
         # For the multiple GPU case, each worker has 2 visible devices so device
         # index should be either 0 or 1. It doesn't matter which.
-        devices = train.torch.get_device()
-        if isinstance(devices, list):
-            assert sorted([device.index for device in devices]) == [0, 1]
-        else:
-            assert train.torch.get_device().index == 0
+        device_ids = sorted([device.index for device in train.torch.get_devices()])
+        assert device_ids in [[0], [0, 1]]

     @ray.remote(num_cpus=0)
     class TrialActor:
8 changes: 7 additions & 1 deletion python/ray/train/tests/test_train_usage.py
@@ -15,7 +15,12 @@ def shutdown_only():
 def run_torch():
     from torch.utils.data import DataLoader, TensorDataset

-    from ray.train.torch import get_device, prepare_data_loader, prepare_model
+    from ray.train.torch import (
+        get_device,
+        get_devices,
+        prepare_data_loader,
+        prepare_model,
+    )

     def train_func():
         # Create dummy model and data loader
@@ -27,6 +32,7 @@ def train_func():
         prepare_data_loader(dataloader)
         prepare_model(model)
         get_device()
+        get_devices()

     trainer = TorchTrainer(
         train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=False)
2 changes: 2 additions & 0 deletions python/ray/train/torch/__init__.py
@@ -17,6 +17,7 @@
     backward,
     enable_reproducibility,
     get_device,
+    get_devices,
     prepare_data_loader,
     prepare_model,
     prepare_optimizer,
@@ -28,6 +29,7 @@
     "TorchConfig",
     "accelerate",
     "get_device",
+    "get_devices",
     "prepare_model",
     "prepare_optimizer",
     "prepare_data_loader",
10 changes: 3 additions & 7 deletions python/ray/train/torch/config.py
@@ -113,11 +113,9 @@ def _setup_torch_process_group(


 def _shutdown_torch(destroy_process_group=False):
-    from ray.air._internal.torch_utils import get_device
+    from ray.air._internal.torch_utils import get_devices

-    devices = get_device()
-    if not isinstance(devices, list):
-        devices = [devices]
+    devices = get_devices()
     if destroy_process_group:
         dist.destroy_process_group()
     if torch.cuda.is_available():
@@ -129,7 +127,7 @@ def _shutdown_torch(destroy_process_group=False):
 def _set_torch_distributed_env_vars():
     # Same env vars as in
     # https://pytorch.org/docs/stable/elastic/run.html#environment-variables
-    from ray.air._internal.torch_utils import get_device
+    from ray.train.torch import get_device

     context = ray.train.get_context()
     os.environ["LOCAL_RANK"] = str(context.get_local_rank())
@@ -140,8 +138,6 @@ def _set_torch_distributed_env_vars():

     # Makes sure Hugging Face Accelerate uses the correct device
     device = get_device()
-    if isinstance(device, list):
-        device = device[0]
     os.environ["ACCELERATE_TORCH_DEVICE"] = str(device)

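Because get_device() can no longer return a list, the value written to ACCELERATE_TORCH_DEVICE is always a single device string. A small sketch of what str() on a torch.device yields; the device index here is illustrative:

import torch

# str() of a torch.device is exactly the plain device string that ends up in
# the ACCELERATE_TORCH_DEVICE environment variable.
assert str(torch.device("cuda:1")) == "cuda:1"
assert str(torch.device("cpu")) == "cpu"
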
51 changes: 47 additions & 4 deletions python/ray/train/torch/train_loop_utils.py
@@ -39,11 +39,17 @@


 @PublicAPI(stability="stable")
-def get_device() -> Union[torch.device, List[torch.device]]:
+def get_device() -> torch.device:
     """Gets the correct torch device configured for this process.

-    Returns a list of devices if more than 1 GPU per worker
-    is requested.
+    Returns the torch device for the current worker. If more than 1 GPU is
+    requested per worker, returns the device with the minimal device index.
+
+    .. note::
+
+        If you requested multiple GPUs per worker, and want to get
+        the full list of torch devices, please use
+        :meth:`~ray.train.torch.get_devices`.

     Assumes that `CUDA_VISIBLE_DEVICES` is set and is a
     superset of the `ray.get_gpu_ids()`.
@@ -63,11 +69,48 @@ def get_device() -> Union[torch.device, List[torch.device]]:
         >>> # ray.get_gpu_ids() == [4,5]
         >>> # torch.cuda.is_available() == True
         >>> # get_device() == torch.device("cuda:4")

[Review comment, Contributor] Nice, this wasn't actually working as expected before.

+
+        >>> # You can move model to device by:
+        >>> # model.to(ray.train.torch.get_device())
+        >>> #
+        >>> # instead of manually checking the device type:
+        >>> # model.to("cuda" if torch.cuda.is_available() else "cpu")
     """
     from ray.air._internal import torch_utils

     record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICE, "1")
-    return torch_utils.get_device()
+    return torch_utils.get_devices()[0]
+
+
+@PublicAPI(stability="alpha")
+def get_devices() -> List[torch.device]:
+    """Gets the correct torch device list configured for this process.
+
+    Assumes that `CUDA_VISIBLE_DEVICES` is set and is a
+    superset of the `ray.get_gpu_ids()`.
+
+    Example:
+        >>> # os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"
+        >>> # ray.get_gpu_ids() == [3]
+        >>> # torch.cuda.is_available() == True
+        >>> # get_devices() == [torch.device("cuda:0")]
+
+        >>> # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4"
+        >>> # ray.get_gpu_ids() == [4]
+        >>> # torch.cuda.is_available() == True
+        >>> # get_devices() == [torch.device("cuda:4")]
+
+        >>> # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"
+        >>> # ray.get_gpu_ids() == [4,5]
+        >>> # torch.cuda.is_available() == True
+        >>> # get_devices() == [torch.device("cuda:4"), torch.device("cuda:5")]

[Review thread]
Contributor: Can we make this a codeblock instead, if everything is commented?
@woshiyyya (Member, author), Jan 23, 2024: I think the current way is clearer than a tedious train_func + TorchTrainer code block, which may reduce readability, and we want to illustrate multiple scenarios here, so we don't want to write several separate examples for this docstring.
Contributor: I was thinking of just a `.. testcode::` block with `:skipif: True`. Right now the code blocks look weird (screenshot of the rendered docs attached).
@woshiyyya (author): Ah, I got you. I've updated the codeblock.

+    """
+
+    from ray.air._internal import torch_utils
+
+    record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICE, "1")

[Review thread]
Contributor: This should have a new TRAIN_TORCH_GET_DEVICES key.
@woshiyyya (author): Previously Justin mentioned we could use a single key for these two APIs, but now I think it makes more sense to have a separate one, to get more accurate telemetry data.

-    return torch_utils.get_device()
+    return torch_utils.get_devices()


 @PublicAPI(stability="stable")
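Taken together, the split gives training code two unambiguous entry points. A hedged end-to-end sketch of how they are meant to be used; two GPU workers are assumed purely for illustration, and with use_gpu=False both calls report CPU devices:

import ray.train.torch
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer


def train_func():
    device = ray.train.torch.get_device()    # a single torch.device
    devices = ray.train.torch.get_devices()  # List[torch.device], one entry per assigned GPU
    print(f"primary device: {device}, all assigned devices: {devices}")


trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)
trainer.fit()
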
6 changes: 3 additions & 3 deletions rllib/core/learner/torch/torch_learner.py
@@ -43,7 +43,7 @@
 torch, nn = try_import_torch()

 if torch:
-    from ray.air._internal.torch_utils import get_device
+    from ray.air._internal.torch_utils import get_devices


 logger = logging.getLogger(__name__)
@@ -298,12 +298,12 @@ def build(self) -> None:
         # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public
         # API in ray.train but allow for session to be None without any errors raised.
         if self._use_gpu:
-            # get_device() returns the 0th device if
+            # get_devices() returns a list that contains the 0th device if
             # it is called from outside of a Ray Train session. Its necessary to give
             # the user the option to run on the gpu of their choice, so we enable that
             # option here via the local gpu id scaling config parameter.
             if self._distributed:
-                self._device = get_device()
+                self._device = get_devices()[0]
             else:
                 assert self._local_gpu_idx < torch.cuda.device_count(), (
                     f"local_gpu_idx {self._local_gpu_idx} is not a valid GPU id or is "
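The Learner's device selection reduces to a small rule: take the first assigned device when running distributed under Ray Train, otherwise honor the user-chosen local GPU. A hedged standalone sketch of that rule; the function and its parameters are illustrative, not RLlib API:

import torch

from ray.air._internal.torch_utils import get_devices


def resolve_learner_device(distributed: bool, local_gpu_idx: int = 0) -> torch.device:
    if distributed:
        # Ray Train assigns the GPUs; take the first one for this learner.
        return get_devices()[0]
    # Local mode: fall back to the user-selected GPU index.
    assert local_gpu_idx < torch.cuda.device_count(), (
        f"local_gpu_idx {local_gpu_idx} is not a valid GPU id"
    )
    return torch.device(f"cuda:{local_gpu_idx}")
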