From e46dc8994b636c77ab2a497ebae4bb9b7a5e48ce Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 00:37:24 -0700 Subject: [PATCH 01/38] No-op on sync trial dir + raise error with call to action upon missing checkpoint Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 37 +++++++++++++++-- python/ray/tune/tests/test_syncer_callback.py | 40 +++++++++++++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 5694eab731fc9..5ddaf3018cbe1 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -787,6 +787,9 @@ def _remote_trial_logdir(self, trial: "Trial"): def _sync_trial_dir( self, trial: "Trial", force: bool = False, wait: bool = True ) -> bool: + if not os.environ.get("AIR_ENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + return True + if not self._enabled or trial.uses_cloud_checkpointing: return False @@ -893,10 +896,36 @@ def on_checkpoint( if self._sync_trial_dir( trial, force=trial.sync_on_checkpoint, wait=True ) and not os.path.exists(checkpoint.dir_or_data): - raise TuneError( - f"Trial {trial}: Checkpoint path {checkpoint.dir_or_data} not " - "found after successful sync down." - ) + if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + raise DeprecationWarning( + "Ray AIR no longer supports the synchronization of the trial " + "directory from worker nodes to the head node. This means that the " + "checkpoints of trials scheduled on worker nodes will not be " + "accessible during the run (e.g., resuming from a checkpoint " + "after a failure) or after the run " + "(e.g., loading the checkpoint of a trial that ran on an already " + "terminated worker node).\n\n" + "To fix this issue, configure AIR to use either:\n" + "(1) Cloud storage: `RunConfig(storage_path='s3://your/bucket')`\n" + "(2) A network filesystem mounted on all nodes: " + "`RunConfig(storage_path='/mnt/path/to/nfs_storage')`\n" + "See here for a full guide on how to configure these " + "persistent storage options: " + "https://docs.ray.io/en/master/tune/tutorials/tune-storage.html\n\n" + "Other notes:\n" + "- See here for a thread explaining why this functionality is " + "being removed: \n" + # TODO(justinvyu): put in the link to the REP + "- To re-enable the head node syncing behavior, set the " + "environment variable AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE=1\n" + " - **Note that this functionality will be fully removed in " + "Ray 2.7.**" + ) + else: + raise TuneError( + f"Trial {trial}: Checkpoint path {checkpoint.dir_or_data} not " + "found after successful sync down." + ) def wait_for_all(self): # Remove any sync processes as needed, and only wait on the remaining ones. diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index eb49e00acd8d4..b6ff924ae736f 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -566,6 +566,46 @@ def train_fn(config): assert_file(False, tmp_target, "save_to_object1234") +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. 
+def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): + """Checks that an error is raised when default syncing is disabled, and a + checkpoint cannot be found on the driver (since it was not synced).""" + syncer_callback = SyncerCallback(sync_period=0) + trial = MockTrial(trial_id="a", logdir=None) + + with pytest.raises(DeprecationWarning): + syncer_callback.on_checkpoint( + iteration=1, + trials=[], + trial=trial, + checkpoint=_TrackedCheckpoint( + dir_or_data="/does/not/exist", storage_mode=CheckpointStorage.PERSISTENT + ), + ) + + monkeypatch.setenv("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE", "1") + with pytest.raises(TuneError): + syncer_callback.on_checkpoint( + iteration=1, + trials=[], + trial=trial, + checkpoint=_TrackedCheckpoint( + dir_or_data="/does/not/exist", storage_mode=CheckpointStorage.PERSISTENT + ), + ) + + path_that_exists = tmp_path / "exists" + path_that_exists.mkdir() + syncer_callback.on_checkpoint( + iteration=1, + trials=[], + trial=trial, + checkpoint=_TrackedCheckpoint( + dir_or_data=str(path_that_exists), storage_mode=CheckpointStorage.PERSISTENT + ), + ) + + if __name__ == "__main__": import sys From ec10d0d3eb9b94c0fc96def0c5e217b885eab84b Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 01:18:58 -0700 Subject: [PATCH 02/38] Some fixes Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 22 ++++++++++++------- python/ray/tune/tests/test_syncer_callback.py | 8 +++++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 5ddaf3018cbe1..bd17fa00da89e 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -787,8 +787,8 @@ def _remote_trial_logdir(self, trial: "Trial"): def _sync_trial_dir( self, trial: "Trial", force: bool = False, wait: bool = True ) -> bool: - if not os.environ.get("AIR_ENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): - return True + if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + return False if not self._enabled or trial.uses_cloud_checkpointing: return False @@ -893,10 +893,8 @@ def on_checkpoint( if checkpoint.storage_mode == CheckpointStorage.MEMORY: return - if self._sync_trial_dir( - trial, force=trial.sync_on_checkpoint, wait=True - ) and not os.path.exists(checkpoint.dir_or_data): - if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + if not os.path.exists(checkpoint.dir_or_data): raise DeprecationWarning( "Ray AIR no longer supports the synchronization of the trial " "directory from worker nodes to the head node. This means that the " @@ -915,13 +913,21 @@ def on_checkpoint( "Other notes:\n" "- See here for a thread explaining why this functionality is " "being removed: \n" - # TODO(justinvyu): put in the link to the REP + # TODO(justinvyu): put in the link to the REP/issue "- To re-enable the head node syncing behavior, set the " "environment variable AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE=1\n" " - **Note that this functionality will be fully removed in " "Ray 2.7.**" ) - else: + # else: + # No need to raise an error about syncing, + # since the checkpoint lives on the head node. + else: + # Old head node syncing codepath + synced = self._sync_trial_dir( + trial, force=trial.sync_on_checkpoint, wait=True + ) + if synced and not os.path.exists(checkpoint.dir_or_data): raise TuneError( f"Trial {trial}: Checkpoint path {checkpoint.dir_or_data} not " "found after successful sync down." 
diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index b6ff924ae736f..16c0042a6dedb 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -568,8 +568,12 @@ def train_fn(config): # TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): - """Checks that an error is raised when default syncing is disabled, and a - checkpoint cannot be found on the driver (since it was not synced).""" + """Checks that an error is raised when head node syncing not enabled by default, + and a checkpoint cannot be found on the driver (since it was not synced). + + Also checks that no error is raised if the checkpoint does exist on the local node. + This covers the single-node case where all trials write to local disk. + Only force the user to switch to cloud storage / NFS in the multi-node case.""" syncer_callback = SyncerCallback(sync_period=0) trial = MockTrial(trial_id="a", logdir=None) From 6744ddf16c52ede47fd254379cdd3825b70ff70f Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 01:22:40 -0700 Subject: [PATCH 03/38] Add env var as a tracked constant Signed-off-by: Justin Yu --- python/ray/air/constants.py | 6 ++++++ python/ray/tune/syncer.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index d58234a01a58c..b59c97bd10bad 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -93,10 +93,16 @@ # as Trainable) DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# Whether or not the sync-to-head behavior is enabled by default. +# If unset, running AIR on a multi-node cluster with checkpointing will raise +# an error telling the user to switch to cloud/NFS. +REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE = "AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE" # NOTE: When adding a new environment variable, please track it in this list. # TODO(ml-team): Most env var constants should get moved here. 
AIR_ENV_VARS = { + REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, DISABLE_LAZY_CHECKPOINTING_ENV, "RAY_AIR_FULL_TRACEBACKS", diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index bd17fa00da89e..7fd7945c3b018 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -40,7 +40,11 @@ delete_at_uri, is_non_local_path_uri, ) -from ray.air.constants import LAZY_CHECKPOINT_MARKER_FILE, TRAINING_ITERATION +from ray.air.constants import ( + LAZY_CHECKPOINT_MARKER_FILE, + REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, + TRAINING_ITERATION, +) from ray.exceptions import RayActorError from ray.tune import TuneError from ray.tune.callback import Callback @@ -787,7 +791,7 @@ def _remote_trial_logdir(self, trial: "Trial"): def _sync_trial_dir( self, trial: "Trial", force: bool = False, wait: bool = True ) -> bool: - if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): return False if not self._enabled or trial.uses_cloud_checkpointing: @@ -893,7 +897,7 @@ def on_checkpoint( if checkpoint.storage_mode == CheckpointStorage.MEMORY: return - if not os.environ.get("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"): + if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): if not os.path.exists(checkpoint.dir_or_data): raise DeprecationWarning( "Ray AIR no longer supports the synchronization of the trial " @@ -915,7 +919,7 @@ def on_checkpoint( "being removed: \n" # TODO(justinvyu): put in the link to the REP/issue "- To re-enable the head node syncing behavior, set the " - "environment variable AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE=1\n" + f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" " - **Note that this functionality will be fully removed in " "Ray 2.7.**" ) From d0e639e1fc054d4cd6c8bed852e5e6fd231986d7 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 17:53:02 -0700 Subject: [PATCH 04/38] Add a warning log message to cover the multi-node + no checkpoints + artifacts case Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 92 +++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 7fd7945c3b018..38e6db6231daf 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -79,6 +79,31 @@ f"./{LAZY_CHECKPOINT_MARKER_FILE}", ] +_SYNC_TO_HEAD_DEPRECATION_MESSAGE = ( + "Ray AIR no longer supports the synchronization of checkpoints and other " + "artifacts from worker nodes to the head node. 
This means that the " + "checkpoints and artifacts saved by trials scheduled on worker nodes will not be " + "accessible during the run (e.g., resuming from a checkpoint " + "after a failure) or after the run " + "(e.g., loading the checkpoint of a trial that ran on an already " + "terminated worker node).\n\n" + "To fix this issue, configure AIR to use either:\n" + "(1) Cloud storage: `RunConfig(storage_path='s3://your/bucket')`\n" + "(2) A network filesystem mounted on all nodes: " + "`RunConfig(storage_path='/mnt/path/to/nfs_storage')`\n" + "See here for a full guide on how to configure these " + "persistent storage options: " + "https://docs.ray.io/en/master/tune/tutorials/tune-storage.html\n\n" + "Other notes:\n" + "- See here for a thread explaining why this functionality is " + "being removed: \n" + # TODO(justinvyu): put in the link to the REP/issue + "- To re-enable the head node syncing behavior, set the " + f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" + " - **Note that this functionality will be fully removed in " + "Ray 2.7.**" +) + @PublicAPI @dataclass @@ -791,20 +816,9 @@ def _remote_trial_logdir(self, trial: "Trial"): def _sync_trial_dir( self, trial: "Trial", force: bool = False, wait: bool = True ) -> bool: - if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): - return False - if not self._enabled or trial.uses_cloud_checkpointing: return False - sync_process = self._get_trial_sync_process(trial) - - # Always run if force=True - # Otherwise, only run if we should sync (considering sync period) - # and if there is no sync currently still running. - if not force and (not self._should_sync(trial) or sync_process.is_running): - return False - source_ip = self._trial_ips.get(trial.trial_id, None) if not source_ip: @@ -822,6 +836,28 @@ def _sync_trial_dir( self._trial_ips[trial.trial_id] = source_ip + if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): + # Only log a warning for remote trials, since + # this only affects artifacts that are saved on worker nodes. + if source_ip != ray.util.get_node_ip_address(): + if log_once(f"sync_trial_dir_{trial.trial_id}"): + logger.warning( + "The contents of the trial directory for trial " + f"{trial.trial_id}, " + "which includes checkpoints and artifacts, were not synced " + "to the head node. See below for more info:\n\n" + + _SYNC_TO_HEAD_DEPRECATION_MESSAGE + ) + return False + + sync_process = self._get_trial_sync_process(trial) + + # Always run if force=True + # Otherwise, only run if we should sync (considering sync period) + # and if there is no sync currently still running. + if not force and (not self._should_sync(trial) or sync_process.is_running): + return False + try: sync_process.wait() except TuneError as e: @@ -898,34 +934,16 @@ def on_checkpoint( return if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): + # If we have saved a checkpoint, but it's not accessible on the driver, + # that means that it lives on some other node and would be synced to head + # prior to Ray 2.6. if not os.path.exists(checkpoint.dir_or_data): - raise DeprecationWarning( - "Ray AIR no longer supports the synchronization of the trial " - "directory from worker nodes to the head node. 
This means that the " - "checkpoints of trials scheduled on worker nodes will not be " - "accessible during the run (e.g., resuming from a checkpoint " - "after a failure) or after the run " - "(e.g., loading the checkpoint of a trial that ran on an already " - "terminated worker node).\n\n" - "To fix this issue, configure AIR to use either:\n" - "(1) Cloud storage: `RunConfig(storage_path='s3://your/bucket')`\n" - "(2) A network filesystem mounted on all nodes: " - "`RunConfig(storage_path='/mnt/path/to/nfs_storage')`\n" - "See here for a full guide on how to configure these " - "persistent storage options: " - "https://docs.ray.io/en/master/tune/tutorials/tune-storage.html\n\n" - "Other notes:\n" - "- See here for a thread explaining why this functionality is " - "being removed: \n" - # TODO(justinvyu): put in the link to the REP/issue - "- To re-enable the head node syncing behavior, set the " - f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" - " - **Note that this functionality will be fully removed in " - "Ray 2.7.**" - ) + raise DeprecationWarning(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) # else: - # No need to raise an error about syncing, - # since the checkpoint lives on the head node. + # No need to raise an error about syncing, since the driver can find + # the checkpoint, because either: + # - the checkpoint lives on the head node + # - a shared filesystem is used else: # Old head node syncing codepath synced = self._sync_trial_dir( From 6544a28409b0bc9f5dddc41a4d0cb3d11dea0ced Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 17:53:14 -0700 Subject: [PATCH 05/38] Update the test Signed-off-by: Justin Yu --- python/ray/tune/tests/test_syncer_callback.py | 52 +++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index 16c0042a6dedb..be555d1957a5a 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -10,7 +10,7 @@ import ray.util from ray.air._internal.checkpoint_manager import CheckpointStorage, _TrackedCheckpoint -from ray.air.constants import TRAINING_ITERATION +from ray.air.constants import TRAINING_ITERATION, REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE from ray.exceptions import RayActorError from ray.tune import TuneError from ray.tune.logger import NoopLogger @@ -119,14 +119,20 @@ def assert_file(exists: bool, root: str, path: str = ""): class MockTrial: - def __init__(self, trial_id: str, logdir: str, on_dead_node: bool = False): + def __init__( + self, + trial_id: str, + logdir: str, + on_dead_node: bool = False, + runner_ip: str = None, + ): self.trial_id = trial_id self.uses_cloud_checkpointing = False self.sync_on_checkpoint = True self.logdir = logdir self.local_path = logdir - self._local_ip = ray.util.get_node_ip_address() + self._local_ip = runner_ip or ray.util.get_node_ip_address() self._on_dead_node = on_dead_node def get_runner_ip(self): @@ -568,15 +574,11 @@ def train_fn(config): # TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): - """Checks that an error is raised when head node syncing not enabled by default, - and a checkpoint cannot be found on the driver (since it was not synced). - - Also checks that no error is raised if the checkpoint does exist on the local node. - This covers the single-node case where all trials write to local disk. 
- Only force the user to switch to cloud storage / NFS in the multi-node case.""" syncer_callback = SyncerCallback(sync_period=0) trial = MockTrial(trial_id="a", logdir=None) + # Raise a deprecation error if checkpointing in a multi-node cluster + monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE) with pytest.raises(DeprecationWarning): syncer_callback.on_checkpoint( iteration=1, @@ -587,7 +589,8 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): ), ) - monkeypatch.setenv("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE", "1") + # Setting the env var raises the original TuneError instead of a deprecation + monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "1") with pytest.raises(TuneError): syncer_callback.on_checkpoint( iteration=1, @@ -598,6 +601,9 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): ), ) + # Make sure we don't raise an error if running on a single node or using NFS, + # where the checkpoint can be accessed from the driver. + monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE) path_that_exists = tmp_path / "exists" path_that_exists.mkdir() syncer_callback.on_checkpoint( @@ -610,6 +616,32 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): ) +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +def test_head_node_syncing_disabled_warning(propagate_logs, caplog): + syncer_callback = SyncerCallback(sync_period=0) + remote_trial_a = MockTrial(trial_id="a", logdir=None, runner_ip="remote") + remote_trial_b = MockTrial(trial_id="b", logdir=None, runner_ip="remote") + local_trial_c = MockTrial(trial_id="c", logdir=None) + + with caplog.at_level(logging.WARNING): + # Any attempts to sync from remote trials should no-op. + # Instead, print a warning message to the user explaining that + # no checkpoints or artifacts are pulled to the head node. 
+ syncer_callback._sync_trial_dir(remote_trial_a) + syncer_callback._sync_trial_dir(remote_trial_b) + + # Syncing multiple times shouldn't spam the logs + syncer_callback._sync_trial_dir(remote_trial_a) + syncer_callback._sync_trial_dir(remote_trial_b) + + # No warning for attempting to sync a local trial dir + syncer_callback._sync_trial_dir(local_trial_c) + + assert caplog.text.count("The contents of the trial directory for trial a") == 1 + assert caplog.text.count("The contents of the trial directory for trial b") == 1 + assert caplog.text.count("The contents of the trial directory for trial c") == 0 + + if __name__ == "__main__": import sys From b7928429bc4d9b41f6a8dfc9b268c63ae183b384 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 17:55:11 -0700 Subject: [PATCH 06/38] Make sure no warning is logged if syncer callback is disabled Signed-off-by: Justin Yu --- python/ray/tune/tests/test_syncer_callback.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index be555d1957a5a..f40863887bf26 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -641,6 +641,12 @@ def test_head_node_syncing_disabled_warning(propagate_logs, caplog): assert caplog.text.count("The contents of the trial directory for trial b") == 1 assert caplog.text.count("The contents of the trial directory for trial c") == 0 + disabled_syncer_callback = SyncerCallback(enabled=False) + remote_trial_d = MockTrial(trial_id="d", logdir=None, runner_ip="remote") + with caplog.at_level(logging.WARNING): + disabled_syncer_callback._sync_trial_dir(remote_trial_d) + assert caplog.text.count("The contents of the trial directory for trial d") == 0 + if __name__ == "__main__": import sys From d79d0b2a7fa8bfeaaddf38c2744594ce0e898eed Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 17:57:12 -0700 Subject: [PATCH 07/38] Fix test Signed-off-by: Justin Yu --- python/ray/tune/tests/test_syncer_callback.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index f40863887bf26..56ab27611a63f 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -578,7 +578,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): trial = MockTrial(trial_id="a", logdir=None) # Raise a deprecation error if checkpointing in a multi-node cluster - monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE) + monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) with pytest.raises(DeprecationWarning): syncer_callback.on_checkpoint( iteration=1, @@ -603,7 +603,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): # Make sure we don't raise an error if running on a single node or using NFS, # where the checkpoint can be accessed from the driver. 
- monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE) + monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) path_that_exists = tmp_path / "exists" path_that_exists.mkdir() syncer_callback.on_checkpoint( From 837dfcfdf67e0123ed82c7c281856230e882c9d2 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 17:57:46 -0700 Subject: [PATCH 08/38] Add link to GH issue Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 38e6db6231daf..cda7aa1b37d75 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -96,7 +96,7 @@ "https://docs.ray.io/en/master/tune/tutorials/tune-storage.html\n\n" "Other notes:\n" "- See here for a thread explaining why this functionality is " - "being removed: \n" + "being removed: https://github.com/ray-project/ray/issues/37177\n" # TODO(justinvyu): put in the link to the REP/issue "- To re-enable the head node syncing behavior, set the " f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" From 0fe99dd86b5a592ceb72740f203986a885496c69 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 18:18:19 -0700 Subject: [PATCH 09/38] Improve the error message Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index cda7aa1b37d75..b92fef90e0ec5 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -91,17 +91,15 @@ "(1) Cloud storage: `RunConfig(storage_path='s3://your/bucket')`\n" "(2) A network filesystem mounted on all nodes: " "`RunConfig(storage_path='/mnt/path/to/nfs_storage')`\n" - "See here for a full guide on how to configure these " - "persistent storage options: " - "https://docs.ray.io/en/master/tune/tutorials/tune-storage.html\n\n" - "Other notes:\n" - "- See here for a thread explaining why this functionality is " - "being removed: https://github.com/ray-project/ray/issues/37177\n" - # TODO(justinvyu): put in the link to the REP/issue - "- To re-enable the head node syncing behavior, set the " + "See this Github issue for more details on transitioning to cloud storage/NFS " + "as well as an explanation on why this functionality is " + "being removed: https://github.com/ray-project/ray/issues/37177\n\n" + "Other temporary workarounds:\n" + "- If you want to avoid errors/warnings and continue running with " + "syncing explicitly turned off, set `RunConfig(SyncConfig(syncer=None))`\n" + "- Or, to re-enable the head node syncing behavior, set the " f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" - " - **Note that this functionality will be fully removed in " - "Ray 2.7.**" + " - **Note that this functionality will be fully removed in Ray 2.7.**" ) From 77c62ab2648b03f840eb63b746b5a4070dc67f90 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 18:36:20 -0700 Subject: [PATCH 10/38] Enable env var for legacy tests Signed-off-by: Justin Yu --- python/ray/tune/tests/test_syncer_callback.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index 56ab27611a63f..a980f2279022c 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -46,6 +46,11 @@ def ray_start_2_cpus(): ray.shutdown() +@pytest.fixture(autouse=True) 
+def enable_legacy_head_node_syncing(monkeypatch): + monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "1") + + @pytest.fixture def temp_data_dirs(): tmp_source = os.path.realpath(tempfile.mkdtemp()) @@ -617,7 +622,8 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): # TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. -def test_head_node_syncing_disabled_warning(propagate_logs, caplog): +def test_head_node_syncing_disabled_warning(propagate_logs, caplog, monkeypatch): + monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) syncer_callback = SyncerCallback(sync_period=0) remote_trial_a = MockTrial(trial_id="a", logdir=None, runner_ip="remote") remote_trial_b = MockTrial(trial_id="b", logdir=None, runner_ip="remote") From 19b8d9ca8d5f432e9ec4acbec11e54c284d66217 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 18:39:02 -0700 Subject: [PATCH 11/38] Set env var for multinode legacy sync test Signed-off-by: Justin Yu --- python/ray/tune/tests/test_multinode_sync.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/tune/tests/test_multinode_sync.py b/python/ray/tune/tests/test_multinode_sync.py index cd9a9730b9eeb..5620b72ef7c08 100644 --- a/python/ray/tune/tests/test_multinode_sync.py +++ b/python/ray/tune/tests/test_multinode_sync.py @@ -3,11 +3,13 @@ import sys import time import unittest +from unittest.mock import patch from typing import List import ray from ray import tune from ray.air.config import CheckpointConfig +from ray.air.constants import REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE from ray.air.util.node import _force_on_node from ray.autoscaler._private.fake_multi_node.node_provider import FAKE_HEAD_NODE_ID from ray.autoscaler._private.fake_multi_node.test_utils import DockerCluster @@ -167,6 +169,7 @@ def on_step_begin(self, iteration, trials, **info): callbacks=[FailureInjectionCallback()], ) + @patch.dict(os.environ, {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE: "1"}) def testCheckpointSync(self): """Test that checkpoints are correctly synced. 
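Note: the patches above only re-enable the legacy behavior for existing tests; for users, the deprecation message points at persistent storage instead. A minimal sketch of that migration path, reusing the Tuner/RunConfig/session APIs that the release tests in this series already exercise (the bucket URI and experiment layout are placeholders, not values from this PR):

from ray import air, tune
from ray.air import Checkpoint, session


def train_fn(config):
    # Checkpointing from a worker node is what previously relied on the
    # deprecated sync-to-head-node behavior.
    session.report({"score": 1}, checkpoint=Checkpoint.from_dict({"dummy": 1}))


tuner = tune.Tuner(
    train_fn,
    # Persist trial results to cloud storage (or an NFS path mounted on every
    # node) so checkpoints stay accessible without head-node syncing.
    run_config=air.RunConfig(storage_path="s3://your-bucket/your-experiment"),
    tune_config=tune.TuneConfig(num_samples=4),
)
tuner.fit()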
From b29ac3ba58ad61d534fbc24b2d5c43af9088c14a Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 18:49:34 -0700 Subject: [PATCH 12/38] Set env var for release test for head node syncing Signed-off-by: Justin Yu --- release/tune_tests/cloud_tests/workloads/_tune_script.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/release/tune_tests/cloud_tests/workloads/_tune_script.py b/release/tune_tests/cloud_tests/workloads/_tune_script.py index 4b4ba903a6c05..06a0dc1e431a4 100644 --- a/release/tune_tests/cloud_tests/workloads/_tune_script.py +++ b/release/tune_tests/cloud_tests/workloads/_tune_script.py @@ -7,6 +7,7 @@ import ray from ray import tune from ray.air import Checkpoint, session +from ray.air.constants import REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo import PPO @@ -100,6 +101,10 @@ def run_tune( else: raise RuntimeError(f"Unknown trainable: {trainable}") + if not no_syncer and storage_path is None: + # syncer="auto" + storage_path=None -> legacy head node syncing path + os.environ[REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE] = "1" + tune.run( train, name=experiment_name, From ce13f18f97bb619a761efb816096c736c878b90f Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 19:12:16 -0700 Subject: [PATCH 13/38] Add some sanity checks for multi-node at the start of head node syncing release test Signed-off-by: Justin Yu --- .../cloud_tests/workloads/run_cloud_test.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py index 6093007e4cfb4..6852ff47cd5a2 100644 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py @@ -35,6 +35,7 @@ import json import os import platform +import pytest import re import shutil import signal @@ -45,6 +46,8 @@ import ray import ray.cloudpickle as pickle +from ray import air, tune +from ray.air import Checkpoint, session from ray.tune.execution.trial_runner import _find_newest_experiment_checkpoint from ray.tune.utils.serialization import TuneFunctionDecoder @@ -1022,6 +1025,58 @@ def after_experiments(): ) +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +def test_head_node_syncing_disabled_error(): + """Tests that head node syncing is disabled properly in a multi-node setting. + Runs a 4 trial Tune run, where each trial uses 2 CPUs. + The cluster config = 4 nodes, each with 2 CPUs, so head node syncing + would have been required to synchronize checkpoints. 
+ """ + + # Raise an error for checkpointing + no storage path + def train_fn(config): + session.report({"score": 1}, Checkpoint.from_dict({"dummy": 1})) + + tuner = tune.Tuner( + tune.with_resources(train_fn, {"CPU": 2.0}), + run_config=air.RunConfig( + storage_path=None, failure_config=air.FailureConfig(fail_fast=True) + ), + tune_config=tune.TuneConfig(num_samples=4), + ) + with pytest.raises(DeprecationWarning): + tuner.fit() + print("Success: checkpointing without a storage path raises an error") + + # Workaround: continue running, with syncing explicitly disabled + tuner = tune.Tuner( + tune.with_resources(train_fn, {"CPU": 2.0}), + run_config=air.RunConfig( + storage_path=None, + failure_config=air.FailureConfig(fail_fast=True), + sync_config=tune.SyncConfig(syncer=None), + ), + tune_config=tune.TuneConfig(num_samples=4), + ) + tuner.fit() + print("Success: explicitly disabling syncing is a sufficient workaround") + + # Not hard failing for multi-node with no checkpointing + def train_fn_no_checkpoint(config): + session.report({"score": 1}) + + tuner = tune.Tuner( + tune.with_resources(train_fn_no_checkpoint, {"CPU": 2.0}), + run_config=air.RunConfig( + storage_path=None, failure_config=air.FailureConfig(fail_fast=True) + ), + tune_config=tune.TuneConfig(num_samples=4), + ) + tuner.fit() + print("Success: a multi-node experiment without checkpoint still runs") + + +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. def test_ssh_sync(): """ SSH syncing, so: @@ -1049,6 +1104,9 @@ def test_ssh_sync(): - All trials progressed with training """ + # Some preliminary checks that head node syncing is deprecated correctly. + test_head_node_syncing_disabled_error() + experiment_name = "cloud_ssh_sync" indicator_file = f"/tmp/{experiment_name}_indicator" From 409a4f3d6f0cc6e657208418b76e41bd9836780f Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 19:16:50 -0700 Subject: [PATCH 14/38] Add RAY_ prefix Signed-off-by: Justin Yu --- python/ray/air/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index b59c97bd10bad..62a31a8c6bb5b 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -97,7 +97,7 @@ # Whether or not the sync-to-head behavior is enabled by default. # If unset, running AIR on a multi-node cluster with checkpointing will raise # an error telling the user to switch to cloud/NFS. -REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE = "AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE" +REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE = "RAY_AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE" # NOTE: When adding a new environment variable, please track it in this list. # TODO(ml-team): Most env var constants should get moved here. 
From 74d5695f0563fe7f7a6c0b47ee8051bd09ff87c4 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 23:30:16 -0700 Subject: [PATCH 15/38] Fix session.report call Signed-off-by: Justin Yu --- release/tune_tests/cloud_tests/workloads/run_cloud_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py index 6852ff47cd5a2..397d20f2256e8 100644 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py @@ -1035,7 +1035,7 @@ def test_head_node_syncing_disabled_error(): # Raise an error for checkpointing + no storage path def train_fn(config): - session.report({"score": 1}, Checkpoint.from_dict({"dummy": 1})) + session.report({"score": 1}, checkpoint=Checkpoint.from_dict({"dummy": 1})) tuner = tune.Tuner( tune.with_resources(train_fn, {"CPU": 2.0}), From b811337f4c119c06330f5a7d23b7d1e8bbd0e443 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Thu, 6 Jul 2023 23:30:35 -0700 Subject: [PATCH 16/38] Don't show error if syncing is manually disabled Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index b92fef90e0ec5..e93c63676fc3f 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -928,6 +928,9 @@ def on_checkpoint( checkpoint: _TrackedCheckpoint, **info, ): + if not self._enabled or trial.uses_cloud_checkpointing: + return + if checkpoint.storage_mode == CheckpointStorage.MEMORY: return From 818ee1b3d04324ebefc1dbd9bc6dde25f1e1490b Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 00:07:33 -0700 Subject: [PATCH 17/38] Set env var for multinode test properly Signed-off-by: Justin Yu --- python/ray/tune/tests/test_multinode_sync.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/tests/test_multinode_sync.py b/python/ray/tune/tests/test_multinode_sync.py index 5620b72ef7c08..fbcecbf86d51e 100644 --- a/python/ray/tune/tests/test_multinode_sync.py +++ b/python/ray/tune/tests/test_multinode_sync.py @@ -169,7 +169,6 @@ def on_step_begin(self, iteration, trials, **info): callbacks=[FailureInjectionCallback()], ) - @patch.dict(os.environ, {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE: "1"}) def testCheckpointSync(self): """Test that checkpoints are correctly synced. 
@@ -209,7 +208,11 @@ def testCheckpointSync(self): ) # Connect via Ray client and wait until all nodes are there self.cluster.start() - self.cluster.connect(client=True, timeout=120) + self.cluster.connect( + client=True, + timeout=120, + runtime_env={"env_vars": {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE: "1"}}, + ) self.cluster.wait_for_resources({"CPU": 12}) # This train function trains for 10 iterations per run From b489ff63a909700404e25dd8f2ff9047dd9dfd63 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 10:13:36 -0700 Subject: [PATCH 18/38] use fail_fast=raise to catch deprecation error Signed-off-by: Justin Yu --- release/tune_tests/cloud_tests/workloads/run_cloud_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py index 397d20f2256e8..beff05d56a03c 100644 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py @@ -1040,7 +1040,7 @@ def train_fn(config): tuner = tune.Tuner( tune.with_resources(train_fn, {"CPU": 2.0}), run_config=air.RunConfig( - storage_path=None, failure_config=air.FailureConfig(fail_fast=True) + storage_path=None, failure_config=air.FailureConfig(fail_fast="raise") ), tune_config=tune.TuneConfig(num_samples=4), ) @@ -1053,7 +1053,7 @@ def train_fn(config): tune.with_resources(train_fn, {"CPU": 2.0}), run_config=air.RunConfig( storage_path=None, - failure_config=air.FailureConfig(fail_fast=True), + failure_config=air.FailureConfig(fail_fast="raise"), sync_config=tune.SyncConfig(syncer=None), ), tune_config=tune.TuneConfig(num_samples=4), @@ -1068,7 +1068,7 @@ def train_fn_no_checkpoint(config): tuner = tune.Tuner( tune.with_resources(train_fn_no_checkpoint, {"CPU": 2.0}), run_config=air.RunConfig( - storage_path=None, failure_config=air.FailureConfig(fail_fast=True) + storage_path=None, failure_config=air.FailureConfig(fail_fast="raise") ), tune_config=tune.TuneConfig(num_samples=4), ) From 1a83c83b0790666276301cdaf5c1541b36476e71 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 10:13:43 -0700 Subject: [PATCH 19/38] Fix lint Signed-off-by: Justin Yu --- python/ray/tune/tests/test_multinode_sync.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/tune/tests/test_multinode_sync.py b/python/ray/tune/tests/test_multinode_sync.py index fbcecbf86d51e..3d3e591eba013 100644 --- a/python/ray/tune/tests/test_multinode_sync.py +++ b/python/ray/tune/tests/test_multinode_sync.py @@ -3,7 +3,6 @@ import sys import time import unittest -from unittest.mock import patch from typing import List import ray From b621f07e0455b7790389387741b303f5267cc222 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 14:51:48 -0700 Subject: [PATCH 20/38] Only log warning once across all trials Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 10 ++------- python/ray/tune/tests/test_syncer_callback.py | 21 +++++++++++-------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index e93c63676fc3f..1182c175ceb0b 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -838,14 +838,8 @@ def _sync_trial_dir( # Only log a warning for remote trials, since # this only affects artifacts that are saved on worker nodes. 
if source_ip != ray.util.get_node_ip_address(): - if log_once(f"sync_trial_dir_{trial.trial_id}"): - logger.warning( - "The contents of the trial directory for trial " - f"{trial.trial_id}, " - "which includes checkpoints and artifacts, were not synced " - "to the head node. See below for more info:\n\n" - + _SYNC_TO_HEAD_DEPRECATION_MESSAGE - ) + if log_once(f"deprecated_head_node_sync"): + logger.warning(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) return False sync_process = self._get_trial_sync_process(trial) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index a980f2279022c..e1d124c6588c3 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -16,6 +16,7 @@ from ray.tune.logger import NoopLogger from ray.tune.result import TIME_TOTAL_S from ray.tune.syncer import ( + _SYNC_TO_HEAD_DEPRECATION_MESSAGE, DEFAULT_SYNC_PERIOD, SyncConfig, SyncerCallback, @@ -630,28 +631,30 @@ def test_head_node_syncing_disabled_warning(propagate_logs, caplog, monkeypatch) local_trial_c = MockTrial(trial_id="c", logdir=None) with caplog.at_level(logging.WARNING): + # The log should only be displayed once for the first remote trial. + syncer_callback._sync_trial_dir(local_trial_c) + assert caplog.text.count(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) == 0 + # Any attempts to sync from remote trials should no-op. # Instead, print a warning message to the user explaining that # no checkpoints or artifacts are pulled to the head node. syncer_callback._sync_trial_dir(remote_trial_a) - syncer_callback._sync_trial_dir(remote_trial_b) + assert caplog.text.count(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) == 1 - # Syncing multiple times shouldn't spam the logs - syncer_callback._sync_trial_dir(remote_trial_a) + # More sync attempts shouldn't add any extra warnings. 
syncer_callback._sync_trial_dir(remote_trial_b) - - # No warning for attempting to sync a local trial dir + syncer_callback._sync_trial_dir(remote_trial_a) syncer_callback._sync_trial_dir(local_trial_c) - assert caplog.text.count("The contents of the trial directory for trial a") == 1 - assert caplog.text.count("The contents of the trial directory for trial b") == 1 - assert caplog.text.count("The contents of the trial directory for trial c") == 0 + assert caplog.text.count(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) == 1 disabled_syncer_callback = SyncerCallback(enabled=False) remote_trial_d = MockTrial(trial_id="d", logdir=None, runner_ip="remote") + caplog.clear() with caplog.at_level(logging.WARNING): + # No warning if syncing is explicitly disabled disabled_syncer_callback._sync_trial_dir(remote_trial_d) - assert caplog.text.count("The contents of the trial directory for trial d") == 0 + assert caplog.text.count(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) == 0 if __name__ == "__main__": From 0afda7b75013fbde93128848a20f43e2ac2f08a2 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 14:54:50 -0700 Subject: [PATCH 21/38] env_var=0 -> flag not enabled Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 4 ++-- python/ray/tune/tests/test_syncer_callback.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 1182c175ceb0b..08a6456ab2d0d 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -834,7 +834,7 @@ def _sync_trial_dir( self._trial_ips[trial.trial_id] = source_ip - if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): + if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE))): # Only log a warning for remote trials, since # this only affects artifacts that are saved on worker nodes. if source_ip != ray.util.get_node_ip_address(): @@ -928,7 +928,7 @@ def on_checkpoint( if checkpoint.storage_mode == CheckpointStorage.MEMORY: return - if not os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE): + if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE))): # If we have saved a checkpoint, but it's not accessible on the driver, # that means that it lives on some other node and would be synced to head # prior to Ray 2.6. diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index e1d124c6588c3..6d8141fad9e1d 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -584,7 +584,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): trial = MockTrial(trial_id="a", logdir=None) # Raise a deprecation error if checkpointing in a multi-node cluster - monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) + monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0") with pytest.raises(DeprecationWarning): syncer_callback.on_checkpoint( iteration=1, @@ -609,7 +609,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): # Make sure we don't raise an error if running on a single node or using NFS, # where the checkpoint can be accessed from the driver. 
- monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) + monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0") path_that_exists = tmp_path / "exists" path_that_exists.mkdir() syncer_callback.on_checkpoint( @@ -624,7 +624,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): # TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. def test_head_node_syncing_disabled_warning(propagate_logs, caplog, monkeypatch): - monkeypatch.delenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, raising=False) + monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0") syncer_callback = SyncerCallback(sync_period=0) remote_trial_a = MockTrial(trial_id="a", logdir=None, runner_ip="remote") remote_trial_b = MockTrial(trial_id="b", logdir=None, runner_ip="remote") From ec4467b9d07e6b08df84f8eb2bc0acdd5ee86116 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 17:26:31 -0700 Subject: [PATCH 22/38] use /mnt/cluster_storage for release tests that checkpoint across multiple nodes Signed-off-by: Justin Yu --- python/ray/train/examples/pytorch/torch_linear_example.py | 5 +++-- python/ray/train/examples/tf/tensorflow_mnist_example.py | 8 ++++++-- .../air_benchmarks/workloads/xgboost_benchmark.py | 5 ++++- release/air_tests/frequent_pausing/script.py | 3 ++- release/air_tests/horovod/workloads/horovod_tune_test.py | 1 + .../workloads/torch_tune_serve_test.py | 1 + release/lightning_tests/workloads/test_trainer.py | 3 ++- release/lightning_tests/workloads/test_tuner.py | 1 + .../ml_user_tests/train/train_tensorflow_mnist_test.py | 4 +++- release/ml_user_tests/train/train_torch_linear_test.py | 4 +++- .../workloads/test_long_running_large_checkpoints.py | 1 + 11 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python/ray/train/examples/pytorch/torch_linear_example.py b/python/ray/train/examples/pytorch/torch_linear_example.py index 647b51a0db6b2..4fc050605b276 100644 --- a/python/ray/train/examples/pytorch/torch_linear_example.py +++ b/python/ray/train/examples/pytorch/torch_linear_example.py @@ -6,7 +6,7 @@ import torch.nn as nn import ray.train as train from ray.train.torch import TorchTrainer, TorchCheckpoint -from ray.air.config import ScalingConfig +from ray.air.config import RunConfig, ScalingConfig class LinearDataset(torch.utils.data.Dataset): @@ -85,12 +85,13 @@ def train_func(config): return results -def train_linear(num_workers=2, use_gpu=False, epochs=3): +def train_linear(num_workers=2, use_gpu=False, epochs=3, storage_path=None): config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs} trainer = TorchTrainer( train_loop_per_worker=train_func, train_loop_config=config, scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + run_config=RunConfig(storage_path=storage_path), ) result = trainer.fit() diff --git a/python/ray/train/examples/tf/tensorflow_mnist_example.py b/python/ray/train/examples/tf/tensorflow_mnist_example.py index 730279a983eff..46fd187ab5dad 100644 --- a/python/ray/train/examples/tf/tensorflow_mnist_example.py +++ b/python/ray/train/examples/tf/tensorflow_mnist_example.py @@ -12,7 +12,7 @@ from ray.train.tensorflow import TensorflowTrainer from ray.air.integrations.keras import ReportCheckpointCallback -from ray.air.config import ScalingConfig +from ray.air.config import RunConfig, ScalingConfig def mnist_dataset(batch_size: int) -> tf.data.Dataset: @@ -79,13 +79,17 @@ def train_func(config: dict): def train_tensorflow_mnist( - num_workers: int = 2, use_gpu: bool = False, epochs: 
int = 4 + num_workers: int = 2, + use_gpu: bool = False, + epochs: int = 4, + storage_path: str = None, ) -> Result: config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs} trainer = TensorflowTrainer( train_loop_per_worker=train_func, train_loop_config=config, scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), + run_config=RunConfig(storage_path=storage_path), ) results = trainer.fit() return results diff --git a/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py index 8de20406143c8..6bc912c3b5947 100644 --- a/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py @@ -15,7 +15,7 @@ XGBoostPredictor, ) from ray.train.batch_predictor import BatchPredictor -from ray.air.config import ScalingConfig +from ray.air.config import RunConfig, ScalingConfig _XGB_MODEL_PATH = "model.json" _TRAINING_TIME_THRESHOLD = 1000 @@ -97,6 +97,9 @@ def run_xgboost_training(data_path: str, num_workers: int, cpus_per_worker: int) label_column="labels", params=params, datasets={"train": ds}, + run_config=RunConfig( + storage_path="/mnt/cluster_storage", name="xgboost_benchmark" + ), ) result = trainer.fit() checkpoint = XGBoostCheckpoint.from_checkpoint(result.checkpoint) diff --git a/release/air_tests/frequent_pausing/script.py b/release/air_tests/frequent_pausing/script.py index 8d1468bb513a7..0cbdf8f1bb669 100644 --- a/release/air_tests/frequent_pausing/script.py +++ b/release/air_tests/frequent_pausing/script.py @@ -16,7 +16,7 @@ import numpy as np -from ray.air import session +from ray.air import session, RunConfig from ray.air.checkpoint import Checkpoint from ray.tune.schedulers.trial_scheduler import FIFOScheduler, TrialScheduler from ray.tune.tune_config import TuneConfig @@ -49,6 +49,7 @@ def on_trial_result(self, trial_runner, trial, result): tuner = Tuner( func, tune_config=TuneConfig(num_samples=2, scheduler=FrequentPausesScheduler()), + run_config=RunConfig(storage_path="/mnt/cluster_storage", name="frequent_pausing"), ) tuner.fit() diff --git a/release/air_tests/horovod/workloads/horovod_tune_test.py b/release/air_tests/horovod/workloads/horovod_tune_test.py index 58e32e8e22534..314c73c6cd1b4 100755 --- a/release/air_tests/horovod/workloads/horovod_tune_test.py +++ b/release/air_tests/horovod/workloads/horovod_tune_test.py @@ -183,6 +183,7 @@ def train_loop_per_worker(config): failure_config=FailureConfig(fail_fast=False), checkpoint_config=CheckpointConfig(num_to_keep=1), callbacks=[ProgressCallback()], + storage_path="/mnt/cluster_storage", ), ) diff --git a/release/golden_notebook_tests/workloads/torch_tune_serve_test.py b/release/golden_notebook_tests/workloads/torch_tune_serve_test.py index c843953e75684..2c02d436ba5ee 100644 --- a/release/golden_notebook_tests/workloads/torch_tune_serve_test.py +++ b/release/golden_notebook_tests/workloads/torch_tune_serve_test.py @@ -127,6 +127,7 @@ def train_mnist(test_mode=False, num_workers=1, use_gpu=False): ), run_config=RunConfig( verbose=1, + storage_path="/mnt/cluster_storage", ), ) diff --git a/release/lightning_tests/workloads/test_trainer.py b/release/lightning_tests/workloads/test_trainer.py index 117f6ee85ab9b..da3ee4f5406f3 100644 --- a/release/lightning_tests/workloads/test_trainer.py +++ b/release/lightning_tests/workloads/test_trainer.py @@ -4,7 +4,7 @@ from pytorch_lightning.loggers.csv_logs import CSVLogger import ray -from ray.air.config import ScalingConfig 
+from ray.air.config import RunConfig, ScalingConfig from ray.train.lightning import LightningTrainer, LightningConfigBuilder from lightning_test_utils import MNISTClassifier, MNISTDataModule @@ -34,6 +34,7 @@ trainer = LightningTrainer( lightning_config=lightning_config, scaling_config=scaling_config, + run_config=RunConfig(storage_path="/mnt/cluster_storage"), ) result = trainer.fit() diff --git a/release/lightning_tests/workloads/test_tuner.py b/release/lightning_tests/workloads/test_tuner.py index 37c880cc7c67f..eb71293048f89 100644 --- a/release/lightning_tests/workloads/test_tuner.py +++ b/release/lightning_tests/workloads/test_tuner.py @@ -53,6 +53,7 @@ lightning_trainer, param_space={"lightning_config": lightning_config}, run_config=ray.air.RunConfig( + storage_path="/mnt/cluster_storage", name="release-tuner-test", verbose=2, checkpoint_config=CheckpointConfig( diff --git a/release/ml_user_tests/train/train_tensorflow_mnist_test.py b/release/ml_user_tests/train/train_tensorflow_mnist_test.py index e158c7506dc52..627c112d8a2d9 100644 --- a/release/ml_user_tests/train/train_tensorflow_mnist_test.py +++ b/release/ml_user_tests/train/train_tensorflow_mnist_test.py @@ -16,7 +16,9 @@ else: ray.init(address="auto") - train_tensorflow_mnist(num_workers=6, use_gpu=True, epochs=20) + train_tensorflow_mnist( + num_workers=6, use_gpu=True, epochs=20, storage_path="/mnt/cluster_storage" + ) taken = time.time() - start result = { diff --git a/release/ml_user_tests/train/train_torch_linear_test.py b/release/ml_user_tests/train/train_torch_linear_test.py index d76a8433ee2eb..48610020ea668 100644 --- a/release/ml_user_tests/train/train_torch_linear_test.py +++ b/release/ml_user_tests/train/train_torch_linear_test.py @@ -17,7 +17,9 @@ else: ray.init(address="auto") - results = train_linear(num_workers=6, use_gpu=True, epochs=20) + results = train_linear( + num_workers=6, use_gpu=True, epochs=20, storage_path="/mnt/cluster_storage" + ) taken = time.time() - start result = {"time_taken": taken} diff --git a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py b/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py index 9298cd457ce55..3202fcdab36d4 100644 --- a/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py +++ b/release/tune_tests/scalability_tests/workloads/test_long_running_large_checkpoints.py @@ -42,6 +42,7 @@ def main(smoke_test: bool = False): resources_per_trial={"cpu": 1}, sync_config=tune.SyncConfig(syncer="auto"), callbacks=[callback], + storage_path="/mnt/cluster_storage", ) From 25ec545e13cd56ec75acc402b6901bc1a54e9415 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 17:39:53 -0700 Subject: [PATCH 23/38] Update dolly finetuning Signed-off-by: Justin Yu --- .../dolly_lightning_fsdp_finetuning.ipynb | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb index a57b9e34dfe49..7a06fb7990e63 100644 --- a/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb +++ b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb @@ -139,6 +139,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -321,11 +322,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Fine-tune with LightningTrainer\n", - "\n", - "```{note}\n", - "Here we save the checkpoints to the 
local file system. You can also upload the checkpoints to cloud storage by setting S3 bucket URI to {class}`air.RunConfig(storage_path=S3_BUCKET_URI) `. See {ref}`train-run-config` for an example.\n", - "```" + "## Fine-tune with LightningTrainer" ] }, { @@ -387,6 +384,43 @@ "lightning_config.trainer(callbacks=[DollyV2ProgressBar(num_iters_per_epoch)])" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{note}\n", + "Since this example runs with multiple nodes, we need to persist checkpoints\n", + "and other outputs to some external storage for access after training has completed.\n", + "**You should set up cloud storage or NFS, then replace `storage_path` with your own cloud bucket URI or NFS path.**\n", + "\n", + "See the :ref:`storage guide ` for more details.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage_path=\"s3://your-bucket-here\" # TODO: Set up cloud storage\n", + "# storage_path=\"/mnt/path/to/nfs\" # TODO: Alternatively, set up NFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "storage_path = \"/mnt/cluster_storage\"" + ] + }, { "cell_type": "code", "execution_count": 9, @@ -921,9 +955,10 @@ "from ray.tune.syncer import SyncConfig\n", "# Save AIR checkpoints according to the performance on validation set\n", "run_config = RunConfig(\n", + " storage_path=storage_path,\n", " name=\"finetune_dolly-v2-7b\",\n", " checkpoint_config=CheckpointConfig(),\n", - " sync_config=SyncConfig(sync_artifacts=False)\n", + " sync_config=SyncConfig(sync_artifacts=False),\n", ")\n", "\n", "# Scale the DDP training workload across 16 GPUs\n", @@ -954,6 +989,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ From 84142db61727c17f126679a6dbbaa5c22b917844 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 17:46:26 -0700 Subject: [PATCH 24/38] Fix ml rllib connect test Signed-off-by: Justin Yu --- release/ml_user_tests/tune_rllib/run_connect_tests.py | 2 +- rllib/examples/tune/framework.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/release/ml_user_tests/tune_rllib/run_connect_tests.py b/release/ml_user_tests/tune_rllib/run_connect_tests.py index 5da1888a94294..8decd32f27332 100644 --- a/release/ml_user_tests/tune_rllib/run_connect_tests.py +++ b/release/ml_user_tests/tune_rllib/run_connect_tests.py @@ -19,7 +19,7 @@ ray.init(address="auto") start_time = time.time() - results = run() + results = run(storage_path="/mnt/cluster_storage") exp_analysis = results._experiment_analysis end_time = time.time() diff --git a/rllib/examples/tune/framework.py b/rllib/examples/tune/framework.py index 304b549708e93..fb2042090f927 100644 --- a/rllib/examples/tune/framework.py +++ b/rllib/examples/tune/framework.py @@ -14,7 +14,7 @@ logger = logging.getLogger("tune_framework") -def run(smoke_test=False): +def run(smoke_test=False, storage_path: str = None): stop = {"training_iteration": 1 if smoke_test else 50} num_workers = 1 if smoke_test else 20 num_gpus = 0 if smoke_test else 1 @@ -62,6 +62,7 @@ def run(smoke_test=False): sort_by_metric=True, max_report_frequency=30, ), + storage_path=storage_path, ), tune_config=tune.TuneConfig( num_samples=1, From 386d93c1088277183d9a775dad22812ec8be53ca Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 17:46:49 -0700 Subject: [PATCH 25/38] Fix lint Signed-off-by: 
Justin Yu --- python/ray/tune/syncer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 08a6456ab2d0d..33e1e167b5535 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -838,7 +838,7 @@ def _sync_trial_dir( # Only log a warning for remote trials, since # this only affects artifacts that are saved on worker nodes. if source_ip != ray.util.get_node_ip_address(): - if log_once(f"deprecated_head_node_sync"): + if log_once("deprecated_head_node_sync"): logger.warning(_SYNC_TO_HEAD_DEPRECATION_MESSAGE) return False From da500eba4b5938fcf764e5fdfa2946f5a33d510a Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 17:50:29 -0700 Subject: [PATCH 26/38] Fix mmt workspace template Signed-off-by: Justin Yu --- doc/source/templates/02_many_model_training/start.ipynb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/source/templates/02_many_model_training/start.ipynb b/doc/source/templates/02_many_model_training/start.ipynb index e1a76f0e9ba76..89cdd345002e0 100644 --- a/doc/source/templates/02_many_model_training/start.ipynb +++ b/doc/source/templates/02_many_model_training/start.ipynb @@ -75,7 +75,7 @@ "from statsforecast.models import AutoARIMA, AutoETS, MSTL\n", "\n", "from ray import tune\n", - "from ray.air import session\n" + "from ray.air import session, RunConfig\n" ] }, { @@ -265,7 +265,12 @@ "metadata": {}, "outputs": [], "source": [ - "tuner = tune.Tuner(trainable, param_space=param_space)\n", + "tuner = tune.Tuner(\n", + " trainable,\n", + " param_space=param_space,\n", + " # Experiment results are saved to a shared filesystem available to all nodes.\n", + " run_config=RunConfig(storage_path=\"/mnt/cluster_storage\"),\n", + ")\n", "result_grid = tuner.fit()\n" ] }, From 77f17c3e11af015279fb4e6b26ac0ec387e6e29a Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Fri, 7 Jul 2023 18:05:33 -0700 Subject: [PATCH 27/38] Fix all rllib release tests Signed-off-by: Justin Yu --- release/long_running_tests/workloads/apex.py | 1 + release/long_running_tests/workloads/impala.py | 1 + .../smoke_test_basic_multi_node_training_learner.py | 4 ++++ rllib/utils/test_utils.py | 6 ++++++ 4 files changed, 12 insertions(+) diff --git a/release/long_running_tests/workloads/apex.py b/release/long_running_tests/workloads/apex.py index 3b2cc5e0a217c..4aee3c40db3f2 100644 --- a/release/long_running_tests/workloads/apex.py +++ b/release/long_running_tests/workloads/apex.py @@ -52,6 +52,7 @@ "min_time_s_per_iteration": 10, "min_sample_timesteps_per_iteration": 10, }, + "storage_path": "/mnt/cluster_storage", } }, callbacks=[ProgressCallback()], diff --git a/release/long_running_tests/workloads/impala.py b/release/long_running_tests/workloads/impala.py index 9660dd1ee214f..d727d1ec7341a 100644 --- a/release/long_running_tests/workloads/impala.py +++ b/release/long_running_tests/workloads/impala.py @@ -56,6 +56,7 @@ "rollout_fragment_length": 50, "train_batch_size": 100, }, + "storage_path": "/mnt/cluster_storage", }, }, callbacks=[ProgressCallback()], diff --git a/release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py b/release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py index f332d7b6b808b..c54bcfb8bde77 100644 --- a/release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py +++ b/release/rllib_tests/smoke_tests/smoke_test_basic_multi_node_training_learner.py @@ -10,6 +10,7 @@ def 
run_with_tuner_n_rollout_worker_2_gpu(config): "PPO", param_space=config, run_config=air.RunConfig( + storage_path="/mnt/cluster_storage", stop={"timesteps_total": 128}, failure_config=air.FailureConfig(fail_fast=True), ), @@ -24,6 +25,7 @@ def run_with_tuner_0_rollout_worker_2_gpu(config): "PPO", param_space=config, run_config=air.RunConfig( + storage_path="/mnt/cluster_storage", stop={"timesteps_total": 128}, failure_config=air.FailureConfig(fail_fast=True), ), @@ -43,6 +45,7 @@ def run_tuner_n_rollout_workers_0_gpu(config): "PPO", param_space=config, run_config=air.RunConfig( + storage_path="/mnt/cluster_storage", stop={"timesteps_total": 128}, failure_config=air.FailureConfig(fail_fast=True), ), @@ -62,6 +65,7 @@ def run_tuner_n_rollout_workers_1_gpu_local(config): "PPO", param_space=config, run_config=air.RunConfig( + storage_path="/mnt/cluster_storage", stop={"timesteps_total": 128}, failure_config=air.FailureConfig(fail_fast=True), ), diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 557eaf2e552b1..af8af0450d6a0 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -822,6 +822,12 @@ def should_check_eval(experiment): # If an experiment passes, we'll remove it from this dict. experiments_to_run = experiments.copy() + # When running as a release test, use `/mnt/cluster_storage` as the storage path. + release_test_storage_path = "/mnt/cluster_storage" + if os.path.exists(): + for experiment in experiments_to_run: + experiment["storage_path"] = release_test_storage_path + try: ray.init(address="auto") except ConnectionError: From 1ec6f5a38a7f62b68719f261e287fa6729984549 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Sun, 9 Jul 2023 17:42:09 -0700 Subject: [PATCH 28/38] Fix default if env var not set Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 33e1e167b5535..e2a5cfcdccb46 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -834,7 +834,7 @@ def _sync_trial_dir( self._trial_ips[trial.trial_id] = source_ip - if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE))): + if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0"))): # Only log a warning for remote trials, since # this only affects artifacts that are saved on worker nodes. if source_ip != ray.util.get_node_ip_address(): @@ -928,7 +928,7 @@ def on_checkpoint( if checkpoint.storage_mode == CheckpointStorage.MEMORY: return - if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE))): + if not bool(int(os.environ.get(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0"))): # If we have saved a checkpoint, but it's not accessible on the driver, # that means that it lives on some other node and would be synced to head # prior to Ray 2.6. 
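The default value added in PATCH 28 matters because `os.environ.get(name)` returns `None` when the variable is unset, so `int(None)` raises a `TypeError` before the flag can even be evaluated. A minimal sketch of the resulting gating pattern, using a hypothetical `_env_flag_enabled` helper that is not part of the Ray codebase:

```python
import os


def _env_flag_enabled(name: str, default: str = "0") -> bool:
    # os.environ.get(name) returns None when the variable is unset;
    # falling back to "0" keeps the flag disabled by default instead of
    # failing inside int().
    return bool(int(os.environ.get(name, default)))


# The deprecated head node syncing stays off unless the user explicitly
# exports AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE=1.
if _env_flag_enabled("AIR_REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE"):
    print("Deprecated head node syncing re-enabled")
```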
From 7e62da021b5e9181faf1a93f9633739ac8f4b2b3 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Sun, 9 Jul 2023 17:49:16 -0700 Subject: [PATCH 29/38] Fix ref in docs Signed-off-by: Justin Yu --- .../ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb index 7a06fb7990e63..e6eec57f01ba9 100644 --- a/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb +++ b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb @@ -394,7 +394,7 @@ "and other outputs to some external storage for access after training has completed.\n", "**You should set up cloud storage or NFS, then replace `storage_path` with your own cloud bucket URI or NFS path.**\n", "\n", - "See the :ref:`storage guide ` for more details.\n", + "See the [storage guide](tune-storage-options) for more details.\n", "```" ] }, From 54ab6d3c029cc6f10baa7a8a87c13af7493ef488 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Mon, 10 Jul 2023 11:45:29 -0700 Subject: [PATCH 30/38] Mark modified release tests as affected Signed-off-by: Justin Yu Finish adding affected test flags Signed-off-by: Justin Yu --- release/release_tests.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 3cb3f154c2734..ff464f964feb4 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -123,6 +123,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: runtime_env: @@ -155,6 +156,7 @@ frequency: weekly team: ml python: "3.8" + affected: true cluster: byod: type: gpu @@ -226,6 +228,7 @@ python: "3.8" frequency: nightly team: ml + affected: true cluster: # byod: # type: gpu @@ -894,6 +897,7 @@ python: "3.8" + affected: true frequency: weekly team: ml cluster: @@ -964,6 +968,7 @@ group: Workspace templates working_dir: workspace_templates/02_many_model_training python: "3.9" + affected: true frequency: nightly-3x team: ml cluster: @@ -1521,6 +1526,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: type: gpu @@ -1551,6 +1557,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: type: gpu @@ -1646,6 +1653,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: runtime_env: @@ -1678,6 +1686,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: runtime_env: @@ -1772,6 +1781,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: type: gpu @@ -1808,6 +1818,7 @@ frequency: nightly team: ml python: "3.8" + affected: true cluster: byod: {} cluster_env: app_config.yaml @@ -1836,6 +1847,7 @@ frequency: nightly team: ml python: "3.8" + affected: true cluster: byod: {} cluster_env: app_config.yaml @@ -1864,6 +1876,7 @@ frequency: nightly team: ml python: "3.8" + affected: true cluster: byod: {} cluster_env: app_config.yaml @@ -2116,6 +2129,7 @@ frequency: weekly team: ml python: "3.8" + affected: true cluster: byod: {} cluster_env: app_config.yaml @@ -2329,6 +2343,7 @@ frequency: nightly-3x team: ml python: "3.8" + affected: true cluster: byod: type: gpu @@ -2403,6 +2418,7 @@ frequency: weekly team: rllib python: "3.8" + affected: true cluster: byod: type: gpu @@ -2449,6 +2465,7 @@ frequency: weekly team: rllib python: "3.8" + affected: true cluster: byod: type: gpu @@ -3726,6 +3743,7 @@ frequency: 
nightly team: rllib python: "3.8" + affected: true cluster: byod: type: gpu @@ -4545,6 +4563,7 @@ frequency: nightly team: rllib python: "3.8" + affected: true cluster: byod: type: gpu From a0c59543b22d914c765213312f26d347fae5ea46 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Mon, 10 Jul 2023 16:52:11 -0700 Subject: [PATCH 31/38] add to schema + change to string for filtering Signed-off-by: Justin Yu --- release/ray_release/schema.json | 10 +++++---- release/release_tests.yaml | 38 ++++++++++++++++----------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/release/ray_release/schema.json b/release/ray_release/schema.json index e3d2bf029aa6d..a8bdc6d275543 100644 --- a/release/ray_release/schema.json +++ b/release/ray_release/schema.json @@ -53,6 +53,9 @@ }, "alert": { "type": "string" + }, + "affected": { + "type": "string" } }, "required": [ @@ -107,7 +110,7 @@ "cu118" ] }, - "post_build_script":{ + "post_build_script": { "type": "string" }, "pip": { @@ -117,8 +120,7 @@ "type": "array" } }, - "required": [ - ], + "required": [], "title": "Byod" }, "Run": { @@ -215,4 +217,4 @@ ] } } -} +} \ No newline at end of file diff --git a/release/release_tests.yaml b/release/release_tests.yaml index f5b6f9adaf4c1..629055e612438 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -123,7 +123,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: runtime_env: @@ -156,7 +156,7 @@ frequency: weekly team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -228,7 +228,7 @@ python: "3.8" frequency: nightly team: ml - affected: true + affected: "true" cluster: # byod: # type: gpu @@ -897,7 +897,7 @@ python: "3.8" - affected: true + affected: "true" frequency: weekly team: ml cluster: @@ -945,7 +945,7 @@ group: Workspace templates working_dir: workspace_templates/02_many_model_training python: "3.9" - affected: true + affected: "true" frequency: nightly-3x team: ml cluster: @@ -1503,7 +1503,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -1534,7 +1534,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -1630,7 +1630,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: runtime_env: @@ -1663,7 +1663,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: runtime_env: @@ -1758,7 +1758,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -1795,7 +1795,7 @@ frequency: nightly team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -1824,7 +1824,7 @@ frequency: nightly team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -1853,7 +1853,7 @@ frequency: nightly team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -2106,7 +2106,7 @@ frequency: weekly team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -2320,7 +2320,7 @@ frequency: nightly-3x team: ml python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -2395,7 +2395,7 @@ frequency: weekly team: rllib python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -2442,7 +2442,7 @@ frequency: weekly team: rllib 
python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -3720,7 +3720,7 @@ frequency: nightly team: rllib python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu @@ -4540,7 +4540,7 @@ frequency: nightly team: rllib python: "3.8" - affected: true + affected: "true" cluster: byod: type: gpu From 28d22d82958a4bbe910c68bd95a9f52b68932a9a Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Mon, 10 Jul 2023 20:35:48 -0700 Subject: [PATCH 32/38] Fix typo in rllib test launcher Signed-off-by: Justin Yu --- rllib/utils/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index af8af0450d6a0..1a727a2e80d89 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -824,7 +824,7 @@ def should_check_eval(experiment): # When running as a release test, use `/mnt/cluster_storage` as the storage path. release_test_storage_path = "/mnt/cluster_storage" - if os.path.exists(): + if os.path.exists(release_test_storage_path): for experiment in experiments_to_run: experiment["storage_path"] = release_test_storage_path From 6ca0839e6bfd5124b7e195f8ede1c26cad6ef472 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Mon, 10 Jul 2023 23:39:57 -0700 Subject: [PATCH 33/38] typo Signed-off-by: Justin Yu --- rllib/utils/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index 1a727a2e80d89..a69e21a492826 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -825,8 +825,8 @@ def should_check_eval(experiment): # When running as a release test, use `/mnt/cluster_storage` as the storage path. release_test_storage_path = "/mnt/cluster_storage" if os.path.exists(release_test_storage_path): - for experiment in experiments_to_run: - experiment["storage_path"] = release_test_storage_path + for k, e in experiments_to_run.items(): + e["storage_path"] = release_test_storage_path try: ray.init(address="auto") From e0e6eae19fed7f369a09bb172799e3550d040033 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Tue, 11 Jul 2023 11:38:52 -0700 Subject: [PATCH 34/38] update message to say removal in 2.8 Signed-off-by: Justin Yu --- python/ray/tune/syncer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index e2a5cfcdccb46..49534ed471ec9 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -99,7 +99,7 @@ "syncing explicitly turned off, set `RunConfig(SyncConfig(syncer=None))`\n" "- Or, to re-enable the head node syncing behavior, set the " f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" - " - **Note that this functionality will be fully removed in Ray 2.7.**" + " - **Note that this functionality will be fully removed in Ray 2.8.**" ) From a0c85ccc4248a2b637073163147353a1ac6def83 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Tue, 11 Jul 2023 11:39:15 -0700 Subject: [PATCH 35/38] Revert "add to schema + change to string for filtering" This reverts commit a0c59543b22d914c765213312f26d347fae5ea46. 
Revert changes to release tests yaml Signed-off-by: Justin Yu --- release/ray_release/schema.json | 10 ++++------ release/release_tests.yaml | 19 ------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/release/ray_release/schema.json b/release/ray_release/schema.json index a8bdc6d275543..e3d2bf029aa6d 100644 --- a/release/ray_release/schema.json +++ b/release/ray_release/schema.json @@ -53,9 +53,6 @@ }, "alert": { "type": "string" - }, - "affected": { - "type": "string" } }, "required": [ @@ -110,7 +107,7 @@ "cu118" ] }, - "post_build_script": { + "post_build_script":{ "type": "string" }, "pip": { @@ -120,7 +117,8 @@ "type": "array" } }, - "required": [], + "required": [ + ], "title": "Byod" }, "Run": { @@ -217,4 +215,4 @@ ] } } -} \ No newline at end of file +} diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 629055e612438..edcf8a19304c4 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -123,7 +123,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: runtime_env: @@ -156,7 +155,6 @@ frequency: weekly team: ml python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -228,7 +226,6 @@ python: "3.8" frequency: nightly team: ml - affected: "true" cluster: # byod: # type: gpu @@ -897,7 +894,6 @@ python: "3.8" - affected: "true" frequency: weekly team: ml cluster: @@ -945,7 +941,6 @@ group: Workspace templates working_dir: workspace_templates/02_many_model_training python: "3.9" - affected: "true" frequency: nightly-3x team: ml cluster: @@ -1503,7 +1498,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -1534,7 +1528,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -1630,7 +1623,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: runtime_env: @@ -1663,7 +1655,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: runtime_env: @@ -1758,7 +1749,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -1795,7 +1785,6 @@ frequency: nightly team: ml python: "3.8" - affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -1824,7 +1813,6 @@ frequency: nightly team: ml python: "3.8" - affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -1853,7 +1841,6 @@ frequency: nightly team: ml python: "3.8" - affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -2106,7 +2093,6 @@ frequency: weekly team: ml python: "3.8" - affected: "true" cluster: byod: {} cluster_env: app_config.yaml @@ -2320,7 +2306,6 @@ frequency: nightly-3x team: ml python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -2395,7 +2380,6 @@ frequency: weekly team: rllib python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -2442,7 +2426,6 @@ frequency: weekly team: rllib python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -3720,7 +3703,6 @@ frequency: nightly team: rllib python: "3.8" - affected: "true" cluster: byod: type: gpu @@ -4540,7 +4522,6 @@ frequency: nightly team: rllib python: "3.8" - affected: "true" cluster: byod: type: gpu From 9e9227f6ef665cc50a07d8c87c1d1c527c6cc860 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Tue, 11 Jul 2023 14:40:30 -0700 Subject: [PATCH 36/38] Fix gptj finetuning release test Signed-off-by: Justin Yu --- .../examples/gptj_deepspeed_fine_tuning.ipynb | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git 
a/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb index 7cb444ddb7251..2ca0d0f32f4b2 100644 --- a/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb +++ b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb @@ -582,10 +582,37 @@ "We pass the preprocessors we have defined earlier as an argument, wrapped in a {class}`~ray.data.preprocessors.chain.Chain`. The preprocessor will be included with the returned {class}`~ray.air.checkpoint.Checkpoint`, meaning it will also be applied during inference.\n", "\n", "```{note}\n", - "If you want to upload checkpoints to cloud storage (eg. S3), set {class}`air.RunConfig(storage_path) `. See {ref}`train-run-config` for an example. Using cloud storage is highly recommended, especially for production.\n", + "Since this example runs with multiple nodes, we need to persist checkpoints\n", + "and other outputs to some external storage for access after training has completed.\n", + "**You should set up cloud storage or NFS, then replace `storage_path` with your own cloud bucket URI or NFS path.**\n", + "\n", + "See the [storage guide](tune-storage-options) for more details.\n", "```" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage_path=\"s3://your-bucket-here\" # TODO: Set up cloud storage\n", + "# storage_path=\"/mnt/path/to/nfs\" # TODO: Alternatively, set up NFS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "storage_path = \"/mnt/cluster_storage\"" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -593,7 +620,7 @@ "outputs": [], "source": [ "from ray.train.huggingface import TransformersTrainer\n", - "from ray.air.config import ScalingConfig\n", + "from ray.air import RunConfig, ScalingConfig\n", "from ray.data.preprocessors import Chain\n", "\n", "\n", @@ -610,6 +637,7 @@ " ),\n", " datasets={\"train\": ray_datasets[\"train\"], \"evaluation\": ray_datasets[\"validation\"]},\n", " preprocessor=Chain(splitter, tokenizer),\n", + " run_config=RunConfig(storage_path=storage_path),\n", ")" ] }, From 1d94f051e9151b52bee0b9cbb7b2378ec7386473 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Wed, 12 Jul 2023 12:29:27 -0700 Subject: [PATCH 37/38] 2.7 -> 2.8 in comments Signed-off-by: Justin Yu --- python/ray/air/constants.py | 2 +- python/ray/tune/tests/test_syncer_callback.py | 4 ++-- release/tune_tests/cloud_tests/workloads/run_cloud_test.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index 62a31a8c6bb5b..c0ab7401183eb 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -93,7 +93,7 @@ # as Trainable) DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 # Whether or not the sync-to-head behavior is enabled by default. # If unset, running AIR on a multi-node cluster with checkpointing will raise # an error telling the user to switch to cloud/NFS.
diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index 6d8141fad9e1d..396491f80c1ec 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -578,7 +578,7 @@ def train_fn(config): assert_file(False, tmp_target, "save_to_object1234") -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): syncer_callback = SyncerCallback(sync_period=0) trial = MockTrial(trial_id="a", logdir=None) @@ -622,7 +622,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): ) -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 def test_head_node_syncing_disabled_warning(propagate_logs, caplog, monkeypatch): monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0") syncer_callback = SyncerCallback(sync_period=0) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py index beff05d56a03c..afe77d95bd548 100644 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py @@ -1025,7 +1025,7 @@ def after_experiments(): ) -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 def test_head_node_syncing_disabled_error(): """Tests that head node syncing is disabled properly in a multi-node setting. Runs a 4 trial Tune run, where each trial uses 2 CPUs. @@ -1076,7 +1076,7 @@ def train_fn_no_checkpoint(config): print("Success: a multi-node experiment without checkpoint still runs") -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.7. +# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 def test_ssh_sync(): """ SSH syncing, so: From 4e7c93d806cbe55419baf11aca5531147d8c8220 Mon Sep 17 00:00:00 2001 From: Justin Yu Date: Wed, 12 Jul 2023 14:54:30 -0700 Subject: [PATCH 38/38] tentatively deprecate in 2.7 Signed-off-by: Justin Yu --- python/ray/air/constants.py | 2 +- python/ray/tune/syncer.py | 3 ++- python/ray/tune/tests/test_syncer_callback.py | 4 ++-- release/tune_tests/cloud_tests/workloads/run_cloud_test.py | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index c0ab7401183eb..6dc8a2bd77a7a 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -93,7 +93,7 @@ # as Trainable) DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 +# TODO(ml-team): [Deprecation - head node syncing] # Whether or not the sync-to-head behavior is enabled by default. # If unset, running AIR on a multi-node cluster with checkpointing will raise # an error telling the user to switch to cloud/NFS. 
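The comment above describes the new default: when the environment variable is unset, a multi-node run that saves checkpoints must write them to persistent storage instead of relying on syncing to the head node. A minimal sketch of that configuration from the user's side, assuming a toy trainable; the experiment name, bucket URI, and NFS path are placeholders to replace with your own values:

```python
from ray import tune
from ray.air import RunConfig, session


def trainable(config):
    # Report a dummy metric; any checkpoints saved here are persisted to the
    # configured storage_path rather than synced to the head node.
    session.report({"score": 1})


tuner = tune.Tuner(
    trainable,
    run_config=RunConfig(
        name="my_experiment",  # placeholder experiment name
        # Either a cloud bucket URI or a network filesystem path that is
        # mounted on every node of the cluster.
        storage_path="s3://your-bucket-here",  # or e.g. "/mnt/cluster_storage"
    ),
)
result_grid = tuner.fit()
```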
diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 49534ed471ec9..100b145a72da2 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -99,7 +99,8 @@ "syncing explicitly turned off, set `RunConfig(SyncConfig(syncer=None))`\n" "- Or, to re-enable the head node syncing behavior, set the " f"environment variable {REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE}=1\n" - " - **Note that this functionality will be fully removed in Ray 2.8.**" + " - **Note that this functionality will tentatively be hard-deprecated in " + "Ray 2.7.** See the linked issue for the latest information." ) diff --git a/python/ray/tune/tests/test_syncer_callback.py b/python/ray/tune/tests/test_syncer_callback.py index 396491f80c1ec..9d9ab0c960e64 100644 --- a/python/ray/tune/tests/test_syncer_callback.py +++ b/python/ray/tune/tests/test_syncer_callback.py @@ -578,7 +578,7 @@ def train_fn(config): assert_file(False, tmp_target, "save_to_object1234") -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 +# TODO(ml-team): [Deprecation - head node syncing] def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): syncer_callback = SyncerCallback(sync_period=0) trial = MockTrial(trial_id="a", logdir=None) @@ -622,7 +622,7 @@ def test_head_node_syncing_disabled_error(monkeypatch, tmp_path): ) -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 +# TODO(ml-team): [Deprecation - head node syncing] def test_head_node_syncing_disabled_warning(propagate_logs, caplog, monkeypatch): monkeypatch.setenv(REENABLE_DEPRECATED_SYNC_TO_HEAD_NODE, "0") syncer_callback = SyncerCallback(sync_period=0) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py index afe77d95bd548..8783f843bbfaa 100644 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py @@ -1025,7 +1025,7 @@ def after_experiments(): ) -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 +# TODO(ml-team): [Deprecation - head node syncing] def test_head_node_syncing_disabled_error(): """Tests that head node syncing is disabled properly in a multi-node setting. Runs a 4 trial Tune run, where each trial uses 2 CPUs. @@ -1076,7 +1076,7 @@ def train_fn_no_checkpoint(config): print("Success: a multi-node experiment without checkpoint still runs") -# TODO(ml-team): [Deprecation - head node syncing] Remove in 2.8 +# TODO(ml-team): [Deprecation - head node syncing] def test_ssh_sync(): """ SSH syncing, so: