[train] New persistence mode: Move SyncConfig to train and deprecate Syncer #38855

Merged
30 commits
5111c23
mv tune.syncer -> train._internal.syncer
justinvyu Aug 24, 2023
36bfe7b
remove train dependencies on tune
justinvyu Aug 24, 2023
ce9150a
remove reenable head node sync flag + some leftover cleanup
justinvyu Aug 24, 2023
6b792ca
remove syncer callback pt 1
justinvyu Aug 24, 2023
f10108e
Remove outdated files
justinvyu Aug 24, 2023
a51212a
Add train.SyncConfig alias
justinvyu Aug 24, 2023
5e88387
remove head node sync deprecation warning deps
justinvyu Aug 24, 2023
1c94795
fix imports to use train.SyncConfig
justinvyu Aug 24, 2023
5979546
fix tests depending on syncercallback
justinvyu Aug 24, 2023
55bb02c
remove test_syncer_callback
justinvyu Aug 24, 2023
bc0db8e
deprecate args of SyncConfig
justinvyu Aug 24, 2023
dfab96a
remove validate_storage_path usage
justinvyu Aug 24, 2023
d8f9d2a
more targeted warnings
justinvyu Aug 24, 2023
d152150
temp fix for old codepath
justinvyu Aug 24, 2023
9605205
softer deprecation for tune.SyncConfig
justinvyu Aug 24, 2023
c3f1165
tune.SyncConfig -> train.SyncConfig
justinvyu Aug 24, 2023
7ace577
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 25, 2023
eb605df
fix test_syncer
justinvyu Aug 25, 2023
bcaa15c
fix test_trainable
justinvyu Aug 25, 2023
7f7e5a1
fix test_trainer_restore
justinvyu Aug 25, 2023
fb698da
fix lint
justinvyu Aug 25, 2023
348b448
address sync config comments
justinvyu Aug 25, 2023
9bfc4fc
fix doc
justinvyu Aug 25, 2023
70e0431
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 25, 2023
4ce65a4
Use sync_artifacts_on_checkpoint
justinvyu Aug 25, 2023
d4be2b9
fix validate_save_restore (failed due to artifacts pr)
justinvyu Aug 25, 2023
7e3d515
fix lint
justinvyu Aug 25, 2023
e38b056
Revert "Remove outdated files"
justinvyu Aug 25, 2023
0cef782
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 25, 2023
00c8095
enable sync artifacts for test
justinvyu Aug 25, 2023
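
Taken together, the commits above move ``SyncConfig`` from Tune into Train and deprecate the ``Syncer`` interface. A minimal sketch of the user-facing migration, based only on the commit messages and diffs below (the exact deprecation behavior of the old import path is not shown here):

.. code-block:: python

    # Old location, deprecated by this PR (kept with a softer deprecation per the commits above):
    from ray.tune.syncer import SyncConfig

    # New location added by this PR:
    from ray.train import SyncConfig

    # sync_artifacts is one of the remaining, non-deprecated options.
    sync_config = SyncConfig(sync_artifacts=True)
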
@@ -948,13 +948,11 @@
}
],
"source": [
"from ray.tune.syncer import SyncConfig\n",
"# Save AIR checkpoints according to the performance on validation set\n",
"run_config = RunConfig(\n",
" storage_path=storage_path,\n",
" name=\"finetune_dolly-v2-7b\",\n",
" checkpoint_config=CheckpointConfig(),\n",
" sync_config=SyncConfig(sync_artifacts=False),\n",
")\n",
"\n",
"# Scale the DDP training workload across 16 GPUs\n",
2 changes: 1 addition & 1 deletion doc/source/train/key-concepts.rst
@@ -66,7 +66,7 @@ Train Configuration
Trainers are configured with configuration objects. There are two main configuration classes,
the :class:`ScalingConfig <ray.air.config.ScalingConfig>` and the :class:`RunConfig <ray.air.config.RunConfig>`.
The latter contains subconfigurations, such as the :class:`FailureConfig <ray.air.config.FailureConfig>`,
:class:`SyncConfig <ray.tune.syncer.SyncConfig>` and :class:`CheckpointConfig <ray.air.config.CheckpointConfig>`.
:class:`SyncConfig <ray.train.SyncConfig>` and :class:`CheckpointConfig <ray.air.config.CheckpointConfig>`.

.. _train-key-concepts-results:

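As a hedged illustration of the configuration objects named in the hunk above (the values and the ``s3://`` bucket are placeholders, not taken from this diff):

.. code-block:: python

    from ray.train import (
        RunConfig,
        SyncConfig,
        CheckpointConfig,
        FailureConfig,
        ScalingConfig,
    )

    # RunConfig bundles the sub-configurations; SyncConfig now comes from ray.train.
    run_config = RunConfig(
        name="my_experiment",
        storage_path="s3://my-bucket/experiments",          # placeholder bucket
        failure_config=FailureConfig(max_failures=2),
        checkpoint_config=CheckpointConfig(num_to_keep=2),
        sync_config=SyncConfig(sync_artifacts=False),
    )
    scaling_config = ScalingConfig(num_workers=4)
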
46 changes: 3 additions & 43 deletions doc/source/tune/api/syncing.rst
@@ -1,13 +1,11 @@
Syncing in Tune (tune.SyncConfig, tune.Syncer)
==============================================
Syncing in Tune (train.SyncConfig)
==================================

.. seealso::

See :doc:`this user guide </tune/tutorials/tune-storage>` for more details and examples.


.. currentmodule:: ray.tune.syncer

.. _tune-sync-config:

Tune Syncing Configuration
@@ -16,42 +14,4 @@ Tune Syncing Configuration
.. autosummary::
:toctree: doc/

SyncConfig

.. _tune-syncer:

Remote Storage Syncer Interface (tune.Syncer)
---------------------------------------------

Constructor
~~~~~~~~~~~

.. autosummary::
:toctree: doc/

Syncer


Syncer Methods to Implement
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autosummary::
:toctree: doc/

Syncer.sync_up
Syncer.sync_down
Syncer.delete
Syncer.wait
Syncer.wait_or_retry


Tune Built-in Syncers
---------------------

.. autosummary::
:toctree: doc/

SyncerCallback
_DefaultSyncer
_BackgroundSyncer

ray.train.SyncConfig
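
The syncing API surface that remains is small. A rough sketch of how it is used (parameter names are taken from the commits and diffs in this PR; the values shown are assumptions, not documented defaults):

.. code-block:: python

    from ray.train import SyncConfig

    sync_config = SyncConfig(
        sync_period=300,                    # seconds between periodic syncs to the storage path
        sync_artifacts=True,                # also upload trial artifacts
        sync_artifacts_on_checkpoint=True,  # sync artifacts whenever a checkpoint is reported
    )
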
96 changes: 0 additions & 96 deletions doc/source/tune/doc_code/faq.py
@@ -249,102 +249,6 @@ def f(config, data=None):
tuner.fit()
# __log_1_end__

# __log_2_start__
from ray.tune.syncer import Syncer

class CustomSyncer(Syncer):
def sync_up(
self, local_dir: str, remote_dir: str, exclude: list = None
) -> bool:
pass # sync up

def sync_down(
self, remote_dir: str, local_dir: str, exclude: list = None
) -> bool:
pass # sync down

def delete(self, remote_dir: str) -> bool:
pass # delete

tuner = tune.Tuner(
MyTrainableClass,
run_config=train.RunConfig(storage_path="s3://my-log-dir"),
)
tuner.fit()
# __log_2_end__

# __custom_command_syncer_start__
import subprocess
from ray.tune.syncer import Syncer

class CustomCommandSyncer(Syncer):
def __init__(
self,
sync_up_template: str,
sync_down_template: str,
delete_template: str,
sync_period: float = 300.0,
):
self.sync_up_template = sync_up_template
self.sync_down_template = sync_down_template
self.delete_template = delete_template

super().__init__(sync_period=sync_period)

def sync_up(
self, local_dir: str, remote_dir: str, exclude: list = None
) -> bool:
cmd_str = self.sync_up_template.format(
source=local_dir,
target=remote_dir,
)
try:
subprocess.check_call(cmd_str, shell=True)
except Exception as e:
print(f"Exception when syncing up {local_dir} to {remote_dir}: {e}")
return False
return True

def sync_down(
self, remote_dir: str, local_dir: str, exclude: list = None
) -> bool:
cmd_str = self.sync_down_template.format(
source=remote_dir,
target=local_dir,
)
try:
subprocess.check_call(cmd_str, shell=True)
except Exception as e:
print(f"Exception when syncing down {remote_dir} to {local_dir}: {e}")
return False
return True

def delete(self, remote_dir: str) -> bool:
cmd_str = self.delete_template.format(
target=remote_dir,
)
try:
subprocess.check_call(cmd_str, shell=True)
except Exception as e:
print(f"Exception when deleting {remote_dir}: {e}")
return False
return True

def retry(self):
raise NotImplementedError

def wait(self):
pass

sync_config = tune.SyncConfig(
syncer=CustomCommandSyncer(
sync_up_template="aws s3 sync {source} {target}",
sync_down_template="aws s3 sync {source} {target}",
delete_template="aws s3 rm {target} --recursive",
),
)
# __custom_command_syncer_end__


if not MOCK:
# __s3_start__
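With the custom ``Syncer`` examples above removed, the closest equivalent under the new persistence mode is to pass a custom ``pyarrow`` filesystem instead of overriding sync logic. A hedged sketch (the ``storage_filesystem`` argument and the S3 details are assumptions about the new persistence mode, not something this diff shows):

.. code-block:: python

    import pyarrow.fs
    from ray import train, tune

    # A pyarrow-compatible filesystem stands in for custom sync commands.
    fs = pyarrow.fs.S3FileSystem(region="us-west-2")  # placeholder region

    tuner = tune.Tuner(
        MyTrainableClass,  # defined elsewhere, as in the surrounding FAQ code
        run_config=train.RunConfig(
            storage_path="my-log-dir",  # bucket/prefix on the filesystem above
            storage_filesystem=fs,
        ),
    )
    tuner.fit()
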
77 changes: 4 additions & 73 deletions doc/source/tune/faq.rst
@@ -462,24 +462,10 @@ to do that depending on whether you are using class or functional Trainable API.

**You are training a large number of trials on a cluster, or you are saving huge checkpoints**

Checkpoints and logs are synced between nodes
- usually at least to the driver on the head node, but sometimes between worker nodes if needed (e.g. when
using :ref:`Population Based Training <tune-scheduler-pbt>`). If these checkpoints are very large (e.g. for
NLP models), or if you are training a large number of trials, this syncing can take a long time.

If nothing else is specified, syncing happens via SSH, which can lead to network overhead as connections are
not kept open by Ray Tune.

**Solution**: There are multiple solutions, depending on your needs:

1. You can disable syncing to the driver in the :class:`tune.SyncConfig <ray.tune.SyncConfig>`. In this case,
logs and checkpoints will not be synced to the driver, so if you need to access them later, you will have to
transfer them where you need them manually.

2. You can use :ref:`cloud checkpointing <tune-cloud-checkpointing>` to save logs and checkpoints to a specified `storage_path`.
This is the preferred way to deal with this. All syncing will be taken care of automatically, as all nodes
are able to access the cloud storage. Additionally, your results will be safe, so even when you're working on
pre-emptible instances, you won't lose any of your data.
**Solution**: You can use :ref:`cloud checkpointing <tune-cloud-checkpointing>` to save logs and checkpoints to a specified `storage_path`.
This is the preferred way to deal with this. All syncing will be taken care of automatically, as all nodes
are able to access the cloud storage. Additionally, your results will be safe, so even when you're working on
pre-emptible instances, you won't lose any of your data.

**You are reporting results too often**

@@ -604,14 +590,6 @@ Here is an example of uploading to S3, using a bucket called ``my-log-dir``:
:start-after: __log_1_start__
:end-before: __log_1_end__

You can customize synchronization behavior by implementing your own Syncer:

.. literalinclude:: doc_code/faq.py
:dedent:
:language: python
:start-after: __log_2_start__
:end-before: __log_2_end__

By default, syncing occurs whenever one of the following conditions is met:

* if you have used a :py:class:`~ray.train.CheckpointConfig` with ``num_to_keep`` and a trial has checkpointed more than ``num_to_keep`` times since last sync,
@@ -628,53 +606,6 @@ For AWS set up, this involves adding an IamInstanceProfile configuration for wor
Please :ref:`see here for more tips <aws-cluster-s3>`.


.. _tune-cloud-syncing-command-line-example:

How can I use the awscli or gsutil command line commands for syncing?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some users have reported running into problems with the default pyarrow-based syncing.
In this case, you can implement a custom syncer that invokes the respective command line
tools for transferring files between nodes and cloud storage.

Here is an example for a syncer that uses string templates that will be run
as a command:

.. literalinclude:: doc_code/faq.py
:dedent:
:language: python
:start-after: __custom_command_syncer_start__
:end-before: __custom_command_syncer_end__

For different cloud services, these are example templates you can use with this syncer:

AWS S3
''''''

.. code-block::

sync_up_template="aws s3 sync {source} {target} --exact-timestamps --only-show-errors"
sync_down_template="aws s3 sync {source} {target} --exact-timestamps --only-show-errors"
delete_template="aws s3 rm {target} --recursive --only-show-errors"

Google cloud storage
''''''''''''''''''''

.. code-block::

sync_up_template="gsutil rsync -r {source} {target}"
sync_down_template="down": "gsutil rsync -r {source} {target}"
delete_template="delete": "gsutil rm -r {target}"

HDFS
''''

.. code-block::

sync_up_template="hdfs dfs -put -f {source} {target}"
sync_down_template="down": "hdfs dfs -get -f {source} {target}"
delete_template="delete": "hdfs dfs -rm -r {target}"


.. _tune-docker:

How can I use Tune with Docker?
44 changes: 8 additions & 36 deletions doc/source/tune/tutorials/tune-output.rst
@@ -186,6 +186,14 @@ or use a custom logging library that requires multi-process logging.
For example, you may want to do this if you're trying to log images to TensorBoard.
We refer to these saved files as **trial artifacts**.

.. note::

If :class:`SyncConfig(sync_artifacts=True) <ray.train.SyncConfig>`, trial artifacts
are uploaded periodically from each trial (or from each remote training worker for Ray Train)
to the :class:`RunConfig(storage_path) <ray.train.RunConfig>`.

See the :class:`~ray.train.SyncConfig` API reference for artifact syncing configuration options.
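
A small sketch of what enabling this looks like (the storage path is a placeholder):

.. code-block:: python

    from ray.train import RunConfig, SyncConfig

    run_config = RunConfig(
        storage_path="s3://my-bucket/results",        # placeholder bucket
        sync_config=SyncConfig(sync_artifacts=True),  # periodically upload trial artifacts
    )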

You can save trial artifacts directly in the trainable, as shown below:

.. tip:: Make sure that any logging calls or objects stay within scope of the Trainable.
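
For reference, a rough sketch of saving artifacts from within a function trainable (illustrative only; the snippet the document itself includes falls outside this hunk):

.. code-block:: python

    from ray import train, tune

    def train_fn(config):
        for i in range(config["num_epochs"]):
            # Files written to the trial's working directory count as trial artifacts.
            with open(f"./artifact_{i}.txt", "w") as f:
                f.write(f"Some data about iteration {i}\n")
            train.report({"epoch": i})

    tuner = tune.Tuner(train_fn, param_space={"num_epochs": 2})
    tuner.fit()
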
@@ -269,42 +277,6 @@ should be configured to log to the Trainable's *working directory.* By default,
the current working directory of both functional and class trainables is set to the
corresponding trial directory once it's been launched as a remote Ray actor.

.. warning::

When running in a multi-node cluster using the *deprecated* :ref:`head node storage option <tune-default-syncing>`,
trial artifacts are synchronized to the driver node under the specified path.
This will allow you to visualize and analyze logs of all distributed training workers on a single machine.

When :ref:`specifying a cloud upload directory <tune-cloud-checkpointing>`, trial artifacts are uploaded to that cloud bucket
for later analysis. Note that the driver node does not necessarily contain
artifacts from *all* trials -- only the ones that were running on that node.
To disable artifacts from being uploaded to the cloud, set ``SyncConfig(sync_artifacts=False)`` in :class:`~ray.tune.syncer.SyncConfig`.

.. warning::

Appending to trial artifacts upon restoration is not supported.
As a workaround, save trial artifacts to separate files with unique filenames.

For example, instead of doing this:

.. code-block:: python

def appending_train_fn(config):
for i in range(config["num_epochs"]):
with open("./artifact.txt", "a") as f:
f.write(f"Some data about iteration {i}\n")

Log artifacts as independent files with unique filenames:

.. code-block:: python

def separate_files_train_fn(config):
for i in range(config["num_epochs"]):
with open(f"./artifact_{i}.txt", "w") as f:
f.write(f"Some data about iteration {i}\n")

If you are running into issues, `file an issue <https://github.com/ray-project/ray/issues>`_


How to Build Custom Tune Loggers?
---------------------------------