From 54465cb004c5cdc6cb3896618d3ad974d5a38824 Mon Sep 17 00:00:00 2001
From: Justin Yu
Date: Tue, 1 Aug 2023 18:24:11 -0700
Subject: [PATCH 1/4] Add trial_local_path property to storage ctx

Signed-off-by: Justin Yu
---
 python/ray/train/_internal/storage.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py
index c84930c5f1df7..de5fea5ae1264 100644
--- a/python/ray/train/_internal/storage.py
+++ b/python/ray/train/_internal/storage.py
@@ -338,6 +338,8 @@ class StorageContext:
     >>> storage.trial_dir_name = "trial_dir"
     >>> storage.trial_fs_path
     'bucket/path/exp_name/trial_dir'
+    >>> storage.trial_local_path
+    '/tmp/ray_results/exp_name/trial_dir'
     >>> storage.current_checkpoint_index = 1
     >>> storage.checkpoint_fs_path
     'bucket/path/exp_name/trial_dir/checkpoint_000001'
@@ -358,6 +360,10 @@ class StorageContext:
     '/tmp/ray_results'
     >>> storage.experiment_path
     '/tmp/ray_results/exp_name'
+    >>> storage.experiment_local_path
+    '/tmp/ray_results/exp_name'
+    >>> storage.experiment_fs_path
+    '/tmp/ray_results/exp_name'
     >>> storage.syncer is None
     True
     >>> storage.storage_filesystem  # Auto-resolved  # doctest: +ELLIPSIS
@@ -495,6 +501,18 @@ def experiment_local_path(self) -> str:
         """
         return os.path.join(self.storage_local_path, self.experiment_dir_name)
 
+    @property
+    def trial_local_path(self) -> str:
+        """The local filesystem path to the trial directory.
+
+        Raises a RuntimeError if `trial_dir_name` is not set beforehand.
+        """
+        if self.trial_dir_name is None:
+            raise RuntimeError(
+                "Should not access `trial_local_path` without setting `trial_dir_name`"
+            )
+        return os.path.join(self.experiment_local_path, self.trial_dir_name)
+
     @property
     def trial_fs_path(self) -> str:
         """The trial directory path on the `storage_filesystem`.

From b6fd97ab88ff1f2ee995878f006f8ee3f8d21867 Mon Sep 17 00:00:00 2001
From: Justin Yu
Date: Tue, 1 Aug 2023 18:24:58 -0700
Subject: [PATCH 2/4] Don't chdir in lightning trainer anymore

Signed-off-by: Justin Yu
---
 .../ray/train/lightning/lightning_trainer.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/ray/train/lightning/lightning_trainer.py b/python/ray/train/lightning/lightning_trainer.py
index 465eae28e6d33..74ab2973bd983 100644
--- a/python/ray/train/lightning/lightning_trainer.py
+++ b/python/ray/train/lightning/lightning_trainer.py
@@ -517,13 +517,19 @@ def restore(
 
 def _lightning_train_loop_per_worker(config):
     """Per-worker training loop for a Lightning Trainer."""
-    # Change the working directory for all workers to the same directory.
-    # This aligns with Lightning's settings and avoids inconsistency. Otherwise,
-    # each worker will have a different log and checkpoint directory if they are
-    # using relative paths.
-    working_dir = os.path.join(session.get_trial_dir(), "rank_all")
-    os.makedirs(working_dir, exist_ok=True)
-    os.chdir(working_dir)
+    from ray.train._internal.storage import _use_storage_context
+
+    # TODO(justinvyu): This chdir is no longer needed in the new persistence
+    # mode, where the session already places every worker in the same working
+    # directory, which is what Lightning requires.
+    if not _use_storage_context():
+        # Change the working directory for all workers to the same directory.
+        # This aligns with Lightning's settings and avoids inconsistency. Otherwise,
+        # each worker will have a different log and checkpoint directory if they are
+        # using relative paths.
+ working_dir = os.path.join(session.get_trial_dir(), "rank_all") + os.makedirs(working_dir, exist_ok=True) + os.chdir(working_dir) if not config["lightning_config"]: raise RuntimeError("'lightning_config' not specified in LightningTrainer!") From 899df726e399303f0958d3a6099d79c6e58e2106 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Tue, 1 Aug 2023 18:26:44 -0700 Subject: [PATCH 3/4] Only chdir to storage_ctx.trial_local_path Signed-off-by: Justin Yu --- python/ray/train/_internal/session.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 2f50f0f4cf6ac..c77a3a474a580 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -142,12 +142,17 @@ def noop(x): encode_data_fn = noop self._encode_data_fn = encode_data_fn - # TODO(xwjiang): Legacy Ray Train trainer clean up! - if trial_info: - # Change the working directory to `logdir`. - logdir = os.path.join(trial_info.logdir, f"rank_{self.world_rank}") - os.makedirs(logdir, exist_ok=True) - os.chdir(logdir) + if _use_storage_context(): + # Change the working directory to the local trial directory. + # -> All workers on the same node share a working directory. + os.makedirs(storage.trial_local_path, exist_ok=True) + os.chdir(storage.trial_local_path) + else: + if trial_info: + # Change the working directory to `logdir`. + logdir = os.path.join(trial_info.logdir, f"rank_{self.world_rank}") + os.makedirs(logdir, exist_ok=True) + os.chdir(logdir) # This lock is used to control the execution of the training thread. self.continue_lock = threading.Semaphore(0) From 938e2257dfe1e6d0fde3f317a9317290034019b9 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Tue, 1 Aug 2023 18:26:51 -0700 Subject: [PATCH 4/4] Update dummy test Signed-off-by: Justin Yu --- python/ray/train/tests/test_new_persistence.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/ray/train/tests/test_new_persistence.py b/python/ray/train/tests/test_new_persistence.py index aad4554c7ea81..f70e6d74a7faf 100644 --- a/python/ray/train/tests/test_new_persistence.py +++ b/python/ray/train/tests/test_new_persistence.py @@ -172,6 +172,8 @@ def dummy_train_fn(config): assert train_session.storage assert train_session.storage.checkpoint_fs_path + assert os.getcwd() == train_session.storage.trial_local_path + trainer = DataParallelTrainer( dummy_train_fn, scaling_config=train.ScalingConfig(num_workers=2),
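Reviewer note: to make the path semantics easier to check at a glance, here is a
minimal, self-contained sketch of how the new `trial_local_path` property composes
and guards. `MiniStorageContext` is a hypothetical stand-in for illustration only,
not Ray's actual `StorageContext` (which also carries the `*_fs_path` variants and
syncing state):

    import os

    class MiniStorageContext:
        """Hypothetical stand-in mirroring the property added in PATCH 1/4."""

        def __init__(self, storage_local_path, experiment_dir_name, trial_dir_name=None):
            self.storage_local_path = storage_local_path
            self.experiment_dir_name = experiment_dir_name
            self.trial_dir_name = trial_dir_name

        @property
        def experiment_local_path(self) -> str:
            # <local staging root>/<experiment dir>
            return os.path.join(self.storage_local_path, self.experiment_dir_name)

        @property
        def trial_local_path(self) -> str:
            # Same guard as the patch: the trial path is undefined until a
            # trial directory name has been assigned.
            if self.trial_dir_name is None:
                raise RuntimeError(
                    "Should not access `trial_local_path` without setting `trial_dir_name`"
                )
            return os.path.join(self.experiment_local_path, self.trial_dir_name)

    ctx = MiniStorageContext("/tmp/ray_results", "exp_name", trial_dir_name="trial_dir")
    assert ctx.trial_local_path == "/tmp/ray_results/exp_name/trial_dir"

This trial-local directory is also what PATCH 3/4 chdirs every worker into, which is
why PATCH 2/4 can drop the Lightning-specific "rank_all" chdir and PATCH 4/4 can
assert `os.getcwd() == train_session.storage.trial_local_path`.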