[Train] Add backend-specific context manager for train_func. #43209

21 changes: 18 additions & 3 deletions python/ray/train/_internal/utils.py
@@ -4,7 +4,17 @@
import logging
import os
from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    ContextManager,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)

import ray
from ray.actor import ActorHandle
@@ -88,6 +98,7 @@ def update_env_vars(env_vars: Dict[str, Any]):
def construct_train_func(
    train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
    config: Optional[Dict[str, Any]],
    train_func_context: Optional[ContextManager],
    fn_arg_name: Optional[str] = "train_func",
    discard_returns: bool = False,
) -> Callable[[], T]:
@@ -97,6 +108,8 @@
            This can either take in no arguments or a ``config`` dict.
        config (Optional[Dict]): Configurations to pass into
            ``train_func``. If None then an empty Dict will be created.
        train_func_context: Context manager for the user's `train_func`, which
            executes backend-specific logic before and after the training function.
        fn_arg_name (Optional[str]): The name of training function to use for error
            messages.
        discard_returns: Whether to discard any returns from train_func or not.
@@ -135,7 +148,8 @@ def discard_return_wrapper(*args, **kwargs):
        @functools.wraps(wrapped_train_func)
        def train_fn():
            try:
-               return wrapped_train_func(config)
+               with train_func_context:
+                   return wrapped_train_func(config)
            except Exception as e:
                raise StartTraceback from e

@@ -144,7 +158,8 @@ def train_fn():
        @functools.wraps(wrapped_train_func)
        def train_fn():
            try:
-               return wrapped_train_func()
+               with train_func_context:
+                   return wrapped_train_func()
            except Exception as e:
                raise StartTraceback from e

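For illustration only (not part of the diff): a minimal sketch of how a backend-specific context manager plugs into the helper above. The names example_backend_context and train_loop_per_worker are hypothetical; the construct_train_func signature is the one shown in this diff, used directly here purely for demonstration.

from contextlib import contextmanager

from ray.train._internal.utils import construct_train_func


# Hypothetical backend-specific context manager: setup runs before the user's
# training function and teardown runs after it (or on error).
@contextmanager
def example_backend_context():
    print("backend setup")  # e.g. set a default device, init a communicator
    try:
        yield
    finally:
        print("backend teardown")


def train_loop_per_worker(config):
    print("training with", config)


wrapped = construct_train_func(
    train_loop_per_worker,
    config={"lr": 1e-3},
    train_func_context=example_backend_context(),
    discard_returns=True,
)
wrapped()  # -> backend setup, training with {'lr': 0.001}, backend teardown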
5 changes: 5 additions & 0 deletions python/ray/train/backend.py
@@ -1,4 +1,5 @@
import logging
from contextlib import nullcontext
from typing import TypeVar

from ray.train._internal.utils import Singleton
@@ -19,6 +20,10 @@ class BackendConfig:
    def backend_cls(self):
        return Backend

    @property
    def train_func_context(self):
        return nullcontext

    def _repr_html_(self) -> str:
        return make_table_html_repr(obj=self, title=type(self).__name__)

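As a sketch of how a backend would use this hook (the class MyBackendConfig and its body are hypothetical, not from the PR), a custom backend config can override the new property, while the base class above keeps nullcontext as the default:

from contextlib import contextmanager

from ray.train.backend import BackendConfig


class MyBackendConfig(BackendConfig):
    # Hypothetical backend config that injects setup/teardown around train_func.

    @property
    def train_func_context(self):
        @contextmanager
        def my_train_func_context():
            # Backend-specific prologue, e.g. configure a device or communicator.
            yield
            # Backend-specific epilogue, e.g. cleanup.

        return my_train_func_context


# The trainer then does roughly: `with MyBackendConfig().train_func_context(): ...`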
1 change: 1 addition & 0 deletions python/ray/train/data_parallel_trainer.py
@@ -435,6 +435,7 @@ def training_loop(self) -> None:
        train_loop_per_worker = construct_train_func(
            self._train_loop_per_worker,
            self._train_loop_config,
            train_func_context=self._backend_config.train_func_context(),
fn_arg_name="train_loop_per_worker",
discard_returns=True,
)
4 changes: 4 additions & 0 deletions python/ray/train/tests/test_gpu.py
@@ -64,6 +64,8 @@ def test_torch_get_device(
    ray.init(num_cpus=4, num_gpus=2)

    def train_fn():
        # Confirm that the TorchConfig prologue is effective.
        assert torch.cuda.current_device() == train.torch.get_device().index
        # Make sure environment variable is being set correctly.
        if cuda_visible_devices:
            visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
@@ -102,6 +104,8 @@ def train_fn():
def test_torch_get_device_dist(ray_2_node_2_gpu, num_gpus_per_worker, tmp_path):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        # Confirm that the TorchConfig prologue is effective.
        assert torch.cuda.current_device() == train.torch.get_device().index
        devices = sorted([device.index for device in train.torch.get_devices()])
        write_rank_data(tmp_path, devices)

3 changes: 2 additions & 1 deletion python/ray/train/tests/test_training_iterator.py
@@ -1,5 +1,6 @@
import functools
import time
from contextlib import nullcontext
from unittest.mock import patch

import pytest
@@ -91,7 +92,7 @@ def create_iterator(
):
    # Similar logic to the old Trainer.run_iterator().

-    train_func = construct_train_func(train_func, None)
+    train_func = construct_train_func(train_func, None, train_func_context=nullcontext)

    backend_executor = backend_executor_cls(
        backend_config=backend_config, num_workers=num_workers, max_retries=MAX_RETRIES
17 changes: 17 additions & 0 deletions python/ray/train/torch/config.py
@@ -16,6 +16,19 @@
logger = logging.getLogger(__name__)


class TorchConfigContextManager:
Contributor:
Can we actually swap to the function-style context manager so that it's easier to reuse existing contexts?

import contextlib

@contextlib.contextmanager
def torch_context_manager():
    # some other setup
    with torch.device(ray.train.torch.get_device()):
        yield
    # some other teardown


@contextlib.contextmanager
def xgboost_context_manager():
    # some other setup
    with CommunicatorContext():
        yield
    # some other teardown

Member Author (@woshiyyya, Feb 16, 2024):
I think it's fine, since you can return either a function-based or a class-based context manager:

@property
def train_func_context(self):
    @contextlib.contextmanager
    def func_based_ctx_mgr():
        ...
        yield
        ...

    return func_based_ctx_mgr

Alternatively, to reuse an existing context manager, we can subclass it as below:

class InnerContextManager:
    def __enter__(self):
        print("Entering InnerContextManager")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        print("Exiting InnerContextManager")
        return False

class OuterContextManager(InnerContextManager):
    def __enter__(self):
        print("Entering OuterContextManager")
        super().__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super().__exit__(exc_type, exc_val, exc_tb)
        print("Exiting OuterContextManager")
        return False 

    def __enter__(self):
        # Set default cuda device
        if torch.cuda.is_available():
            device = ray.train.torch.get_device()
            if device.type == "cuda":
                torch.cuda.set_device(device)

    def __exit__(self, type, value, traceback):
        # Propagate exceptions if any
Contributor:
nit: I think only return True is needed if you want to suppress exceptions https://docs.python.org/3/reference/datamodel.html#object.__exit__

Member Author:
PR description updated!

Oh, actually we are not suppressing the exceptions, since they're caught in the outer layer here: https://github.com/ray-project/ray/pull/43209/files#diff-8b259b33153d078b025da24134ff3b897aa1227d287d2ad38a1d2f11afb7d213R154

        return False
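For reference (not part of the diff), the __exit__ return-value semantics discussed in the thread above, as a self-contained snippet: returning a falsy value, as the PR does, lets exceptions propagate to the StartTraceback wrapper in utils.py, whereas returning True would suppress them.

class SuppressingContext:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returning True suppresses an exception raised in the `with` body;
        # returning False or None lets it propagate to the caller.
        return True


with SuppressingContext():
    raise ValueError("this is swallowed")

print("still running")  # reached, because __exit__ returned True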


@PublicAPI(stability="stable")
@dataclass
class TorchConfig(BackendConfig):
@@ -43,6 +56,10 @@ class TorchConfig(BackendConfig):
    def backend_cls(self):
        return _TorchBackend

    @property
    def train_func_context(self):
        return TorchConfigContextManager


def _setup_torch_process_group(
    backend: str,
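Putting the pieces together, a rough sketch of the flow each training worker ends up running once TorchConfig supplies the context manager. This is simplified and illustrative rather than a standalone script: ray.train.torch.get_device() in the prologue requires an active Train session, and the wrapping itself is done by construct_train_func on the worker.

from ray.train.torch import TorchConfig

backend_config = TorchConfig()

# The trainer reads the new property and instantiates the backend-specific
# context manager (TorchConfigContextManager here, nullcontext by default).
train_func_context = backend_config.train_func_context()


def train_func():
    # User code; on GPU workers the default CUDA device has already been set
    # by the prologue in TorchConfigContextManager.__enter__.
    ...


# construct_train_func wraps the user function so that it executes inside
# the backend-specific context on each worker:
with train_func_context:
    train_func()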