diff --git a/docs/Makefile b/docs/Makefile
index e520c7e1..c893a632 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -10,7 +10,7 @@ SPHINXBUILD = sphinx-build
 SPHINXPROJ = torchelastic
 SOURCEDIR = source
 BUILDDIR = build
-VERSION := $(shell python -c "import torchelastic; print(torchelastic.__version__)")
+VERSION := "0.2.3.dev0"
 
 # Put it first so that "make" without argument is like "make help".
 help:
diff --git a/docs/source/agent.rst b/docs/source/agent.rst
deleted file mode 100644
index 26620293..00000000
--- a/docs/source/agent.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-Elastic Agent
-==============
-
-.. automodule:: torchelastic.agent
-.. currentmodule:: torchelastic.agent
-
-Server
---------
-
-.. automodule:: torchelastic.agent.server
-
-Below is a diagram of an agent that manages a local group of workers.
-
-.. image:: agent_diagram.jpg
-
-Concepts
---------
-
-This section describes the high-level classes and concepts that
-are relevant to understanding the role of the ``agent`` in torchelastic.
-
-.. currentmodule:: torchelastic.agent.server
-
-.. autoclass:: ElasticAgent
-   :members:
-
-.. autoclass:: WorkerSpec
-   :members:
-
-.. autoclass:: WorkerState
-   :members:
-
-.. autoclass:: Worker
-   :members:
-
-.. autoclass:: WorkerGroup
-   :members:
-
-Implementations
--------------------
-
-Below are the agent implementations provided by torchelastic.
-
-.. currentmodule:: torchelastic.agent.server.local_elastic_agent
-.. autoclass:: LocalElasticAgent
-
-
-Extending the Agent
----------------------
-
-To extend the agent you can implement ``ElasticAgent`` directly; however,
-we recommend that you extend ``SimpleElasticAgent`` instead, which provides
-most of the scaffolding and leaves you with a few specific abstract methods
-to implement.
-
-.. currentmodule:: torchelastic.agent.server
-.. autoclass:: SimpleElasticAgent
-   :members:
-   :private-members:
-
-.. autoclass:: torchelastic.agent.server.api.RunResult
diff --git a/docs/source/agent_diagram.jpg b/docs/source/agent_diagram.jpg
deleted file mode 100644
index 79fad343..00000000
Binary files a/docs/source/agent_diagram.jpg and /dev/null differ
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 5fafd1a2..5c7c3379 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,8 +22,6 @@
 # import os
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
-import torch
-import torchelastic
 from docutils import nodes
 from sphinx import addnodes
 from sphinx.util.docfields import TypedField
@@ -88,7 +86,7 @@
 #
 # The short X.Y version.
 # TODO: change to [:2] at v1.0
-version = f"v{torchelastic.__version__}"
+version = "v0.2.3.dev0"
 # The full version, including alpha/beta/rc tags.
 # TODO: verify this works as expected
 release = "master"
diff --git a/docs/source/customization.rst b/docs/source/customization.rst
deleted file mode 100644
index 761413b6..00000000
--- a/docs/source/customization.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-Customization
-=============
-
-This section describes how to customize TorchElastic to fit your needs.
-
-Launcher
-------------------------
-
-The launcher program that ships with TorchElastic
-should be sufficient for most use-cases (see :ref:`launcher-api`).
-You can implement a custom launcher by
-programmatically creating an agent and passing it specs for your workers, as
-shown below.
-
-.. code-block:: python
-
-    # my_launcher.py
-
-    if __name__ == "__main__":
-        args = parse_args(sys.argv[1:])
-        rdzv_handler = RendezvousHandler(...)
-        spec = WorkerSpec(
-            local_world_size=args.nproc_per_node,
-            fn=trainer_entrypoint_fn,
-            args=tuple(args.fn_args),  # arguments forwarded to trainer_entrypoint_fn
-            rdzv_handler=rdzv_handler,
-            max_restarts=args.max_restarts,
-            monitor_interval=args.monitor_interval,
-        )
-
-        agent = LocalElasticAgent(spec, start_method="spawn")
-        try:
-            run_result = agent.run()
-            if run_result.is_failed():
-                print(f"worker 0 failed with: {run_result.failures[0]}")
-            else:
-                print(f"worker 0 return value is: {run_result.return_values[0]}")
-        except Exception as ex:
-            pass  # handle exception
-
-
-Rendezvous Handler
-------------------------
-
-To implement your own rendezvous, extend ``torchelastic.rendezvous.RendezvousHandler``
-and implement its methods.
-
-.. warning:: Rendezvous handlers are tricky to implement. Before you begin,
-   make sure you completely understand the properties of rendezvous.
-   Please refer to :ref:`rendezvous-api` for more information.
-
-Once implemented, you can pass your custom rendezvous handler to the worker
-spec when creating the agent.
-
-.. code-block:: python
-
-    spec = WorkerSpec(
-        rdzv_handler=MyRendezvousHandler(params),
-        ...
-    )
-    elastic_agent = LocalElasticAgent(spec, start_method=start_method)
-    elastic_agent.run(spec.role)
-
-
-Metric Handler
------------------------------
-
-TorchElastic emits platform-level metrics (see :ref:`metrics-api`).
-By default, metrics are emitted to `/dev/null` so you will not see them.
-To have the metrics pushed to a metric handling service in your infrastructure,
-implement a `torchelastic.metrics.MetricHandler` and `configure` it in your
-custom launcher.
-
-.. code-block:: python
-
-    # my_launcher.py
-
-    import torchelastic.metrics as metrics
-
-    class MyMetricHandler(metrics.MetricHandler):
-        def emit(self, metric_data: metrics.MetricData):
-            ...  # push metric_data to your metric sink
-
-    def main():
-        metrics.configure(MyMetricHandler())
-
-        spec = WorkerSpec(...)
-        agent = LocalElasticAgent(spec)
-        agent.run()
-
-Events Handler
------------------------------
-
-TorchElastic supports events recording (see :ref:`events-api`).
-The events module defines an API that allows you to record events and
-implement a custom ``EventHandler``. An ``EventHandler`` is used for publishing
-events produced during torchelastic execution to different sinks, e.g. AWS CloudWatch.
-By default it uses `torchelastic.events.NullEventHandler`, which ignores
-events. To configure a custom events handler, implement the
-`torchelastic.events.EventHandler` interface and `configure` it
-in your custom launcher.
-
-.. code-block:: python
-
-    # my_launcher.py
-
-    import torchelastic.events as events
-
-    class MyEventHandler(events.EventHandler):
-        def record(self, event: events.Event):
-            ...  # process event
-
-    def main():
-        events.configure(MyEventHandler())
-
-        spec = WorkerSpec(...)
-        agent = LocalElasticAgent(spec)
-        agent.run()
diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
deleted file mode 100644
index c32ac6db..00000000
--- a/docs/source/distributed.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-.. _launcher-api:
-
-Elastic Launch
-============================
-
-.. automodule:: torchelastic.distributed
-
-Elastic launcher
-------------------
-
-.. automodule:: torchelastic.distributed.launch
diff --git a/docs/source/errors.rst b/docs/source/errors.rst
deleted file mode 100644
index 1105d1b2..00000000
--- a/docs/source/errors.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Error Propagation
-==================
-
-.. automodule:: torch.distributed.elastic.multiprocessing.errors
-
-Methods and Classes
----------------------
-
-.. currentmodule:: torch.distributed.elastic.multiprocessing.errors
-
-.. autofunction:: torch.distributed.elastic.multiprocessing.errors.record
-
-.. autoclass:: ChildFailedError
-
-.. autoclass:: ErrorHandler
-
-.. autoclass:: ProcessFailure
diff --git a/docs/source/etcd_rdzv_diagram.png b/docs/source/etcd_rdzv_diagram.png
deleted file mode 100644
index c15b8160..00000000
Binary files a/docs/source/etcd_rdzv_diagram.png and /dev/null differ
diff --git a/docs/source/events.rst b/docs/source/events.rst
deleted file mode 100644
index 2fba6d7d..00000000
--- a/docs/source/events.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. _events-api:
-
-Events
-============================
-
-.. automodule:: torchelastic.events
-
-API Methods
-------------
-
-.. autofunction:: torchelastic.events.record
-
-.. autofunction:: torchelastic.events.get_logging_handler
-
-Event Objects
------------------
-
-.. currentmodule:: torchelastic.events.api
-
-.. autoclass:: Event
-
-.. autoclass:: EventSource
-
-.. autoclass:: EventMetadataValue
-
-
-
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 428d0ded..47af3d2f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,49 +3,14 @@
 TorchElastic
 ==================
 
-Makes distributed PyTorch fault-tolerant and elastic.
-
-Get Started
----------------
-.. toctree::
-   :maxdepth: 1
-   :caption: Usage
-
-   quickstart
-   train_script
-   examples
-
-Documentation
----------------
-
-.. toctree::
-   :maxdepth: 1
-   :caption: API
-
-   distributed
-   agent
-   multiprocessing
-   errors
-   rendezvous
-   timer
-   metrics
-   events
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Advanced
-
-   customization
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Plugins
-
-   kubernetes
-   runtime
-
+.. important:: TorchElastic has been upstreamed to PyTorch (``torch.distributed.elastic``) and will be
+   included as part of the next PyTorch release (``torch-1.9``). In the meantime, please use
+   `pytorch-nightly `_ or
+   `torchelastic-0.2.2 `_. For documentation, please refer to our
+   `v0.2.2 docs `_ for now.
+
 Training Session Manager(TSM)
----------------
+------------------------------
 .. toctree::
    :maxdepth: 1
    :caption: Training Session Manager(TSM)
diff --git a/docs/source/kubernetes.rst b/docs/source/kubernetes.rst
deleted file mode 100644
index 55a051b5..00000000
--- a/docs/source/kubernetes.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-TorchElastic Kubernetes
-==========================
-
-Please refer to our github's `Kubernetes README `_
-for more information on the Elastic Job Controller and custom resource definition.
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
deleted file mode 100644
index c4a3d4ec..00000000
--- a/docs/source/metrics.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _metrics-api:
-
-Metrics
-=========
-
-.. automodule:: torchelastic.metrics
-
-
-Metric Handlers
------------------
-
-.. currentmodule:: torchelastic.metrics.api
-
-Below are the metric handlers that come included with torchelastic.
-
-.. autoclass:: MetricHandler
-
-.. autoclass:: ConsoleMetricHandler
-
-.. autoclass:: NullMetricHandler
-
-
-
-Methods
-------------
-
-.. autofunction:: torchelastic.metrics.configure
-
-.. autofunction:: torchelastic.metrics.prof
-
-.. autofunction:: torchelastic.metrics.put_metric
diff --git a/docs/source/multiprocessing.rst b/docs/source/multiprocessing.rst
deleted file mode 100644
index fc5866c0..00000000
--- a/docs/source/multiprocessing.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-:github_url: https://github.com/pytorch/elastic
-
-Multiprocessing
-================
-
-.. automodule:: torch.distributed.elastic.multiprocessing
-
-Starting Multiple Workers
----------------------------
-
-.. autofunction:: torch.distributed.elastic.multiprocessing.start_processes
-
-Process Context
-----------------
-
-.. currentmodule:: torch.distributed.elastic.multiprocessing.api
-
-.. autoclass:: PContext
-
-.. autoclass:: MultiprocessContext
-
-.. autoclass:: SubprocessContext
-
-.. autoclass:: RunProcsResult
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
deleted file mode 100644
index 81f8c20d..00000000
--- a/docs/source/quickstart.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-Quickstart
-===========
-
-.. code-block:: bash
-
-    pip install torchelastic
-
-    # start a single-node etcd server on ONE host
-    etcd --enable-v2 \
-         --listen-client-urls http://0.0.0.0:2379,http://127.0.0.1:4001 \
-         --advertise-client-urls PUBLIC_HOSTNAME:2379
-
-To launch a **fault-tolerant** job, run the following on all nodes.
-
-.. code-block:: bash
-
-    python -m torchelastic.distributed.launch \
-            --nnodes=NUM_NODES \
-            --nproc_per_node=TRAINERS_PER_NODE \
-            --rdzv_id=JOB_ID \
-            --rdzv_backend=etcd \
-            --rdzv_endpoint=ETCD_HOST:ETCD_PORT \
-            YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
-
-
-To launch an **elastic** job, run the following on at least ``MIN_SIZE`` nodes
-and at most ``MAX_SIZE`` nodes.
-
-.. code-block:: bash
-
-    python -m torchelastic.distributed.launch \
-            --nnodes=MIN_SIZE:MAX_SIZE \
-            --nproc_per_node=TRAINERS_PER_NODE \
-            --rdzv_id=JOB_ID \
-            --rdzv_backend=etcd \
-            --rdzv_endpoint=ETCD_HOST:ETCD_PORT \
-            YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
-
-
-.. note:: The ``--standalone`` option can be passed to launch a single-node job with
-   a sidecar rendezvous server. You do not have to pass ``--rdzv_id``, ``--rdzv_endpoint``,
-   and ``--rdzv_backend`` when the ``--standalone`` option is used.
-
-
-.. note:: Learn more about writing your distributed training script
-   `here `_.
-
-If ``torchelastic.distributed.launch`` does not meet your requirements,
-you may use our APIs directly for more powerful customization. Start by
-taking a look at the `elastic agent `_ API.
diff --git a/docs/source/rendezvous.rst b/docs/source/rendezvous.rst
deleted file mode 100644
index 8eb99a52..00000000
--- a/docs/source/rendezvous.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-.. _rendezvous-api:
-
-Rendezvous
-==========
-
-.. automodule:: torchelastic.rendezvous
-
-Below is a state diagram describing how rendezvous works.
-
-.. image:: etcd_rdzv_diagram.png
-
-
-Handler
---------------------
-
-.. currentmodule:: torchelastic.rendezvous
-
-.. autoclass:: RendezvousHandler
-   :members:
-
-Exceptions
--------------
-.. autoclass:: RendezvousClosedException
-.. autoclass:: RendezvousTimeoutException
-.. autoclass:: RendezvousNonRetryableError
-
-Implementations
-----------------
-
-Etcd Rendezvous
-****************
-
-.. currentmodule:: torchelastic.rendezvous.etcd_rendezvous
-
-.. autoclass:: EtcdRendezvousHandler
-
-.. autoclass:: EtcdRendezvous
-   :members:
-
-.. autoclass:: EtcdStore
-   :members:
-
-Etcd Server
-*************
-
-The ``EtcdServer`` is a convenience class that makes it easy for you to
-start and stop an etcd server on a subprocess. This is useful for testing
-or single-node (multi-worker) deployments where manually setting up an
-etcd server on the side is cumbersome.
-
-.. warning:: For production and multi-node deployments, please consider
-   properly deploying a highly available etcd server, as this is
-   the single point of failure for your distributed jobs.
-
-.. currentmodule:: torchelastic.rendezvous.etcd_server
-
-.. autoclass:: EtcdServer
diff --git a/docs/source/runtime.rst b/docs/source/runtime.rst
deleted file mode 100644
index e32d7c50..00000000
--- a/docs/source/runtime.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Cloud Provider Support (Deprecated)
-====================================
-
-The sections below are instructions for running torchelastic on Kubernetes and
-various cloud providers.
-
-.. warning:: The instructions below only work with torchelastic ``0.1.0rc``.
-   We highly encourage you to use our Kubernetes
-   `elastic job controller `_ and a Kubernetes
-   cluster set up on your preferred cloud provider.
-
-AWS (EC2)
-------------------
-Refer to our github's `ec2 README `_
-
-Azure
-------
-Refer to our github's `azure README `_
diff --git a/docs/source/timer.rst b/docs/source/timer.rst
deleted file mode 100644
index ca5b251c..00000000
--- a/docs/source/timer.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-Expiration Timers
-==================
-
-.. automodule:: torchelastic.timer
-.. currentmodule:: torchelastic.timer
-
-Client Methods
----------------
-.. autofunction:: torchelastic.timer.configure
-
-.. autofunction:: torchelastic.timer.expires
-
-Server/Client Implementations
-------------------------------
-Below are the timer server and client pairs provided by torchelastic.
-
-.. note:: Timer servers and clients always have to be implemented and used
-   in pairs, since there is a messaging protocol between the server
-   and client.
-
-.. autoclass:: LocalTimerServer
-
-.. autoclass:: LocalTimerClient
-
-Writing a custom timer server/client
---------------------------------------
-
-To write your own timer server and client, extend
-``torchelastic.timer.TimerServer`` for the server and
-``torchelastic.timer.TimerClient`` for the client. The
-``TimerRequest`` object is used to pass messages between
-the server and client.
-
-.. autoclass:: TimerRequest
-   :members:
-
-.. autoclass:: TimerServer
-   :members:
-
-.. autoclass:: TimerClient
-   :members:
diff --git a/docs/source/train_script.rst b/docs/source/train_script.rst
deleted file mode 100644
index 8134a648..00000000
--- a/docs/source/train_script.rst
+++ /dev/null
@@ -1,46 +0,0 @@
-Train script
--------------
-
-If your train script works with ``torch.distributed.launch``, it will continue
-to work with ``torchelastic.distributed.launch`` with these differences:
-
-1. There is no need to manually pass ``RANK``, ``WORLD_SIZE``,
-   ``MASTER_ADDR``, and ``MASTER_PORT``.
-
-2. ``rdzv_backend`` and ``rdzv_endpoint`` must be provided. For most users
-   this will be set to ``etcd`` (see `rendezvous `_).
-
-3. Make sure you have ``load_checkpoint(path)`` and
-   ``save_checkpoint(path)`` logic in your script. When workers fail,
-   we restart all the workers with the same program arguments, so you will
-   lose progress up to the most recent checkpoint
-   (see `elastic launch `_).
-
-4. The ``use_env`` flag has been removed. If you were getting the local rank by parsing
-   the ``--local_rank`` option, you now need to read the local rank from the
-   environment variable ``LOCAL_RANK`` (e.g. ``os.environ["LOCAL_RANK"]``).
-
-Below is an expository example of a training script that checkpoints on each
-epoch, so the worst-case progress lost on failure is one full epoch's worth
-of training.
-
-.. code-block:: python
-
-    def main():
-        args = parse_args(sys.argv[1:])
-        state = load_checkpoint(args.checkpoint_path)
-        initialize(state)
-
-        # torchelastic.distributed.launch ensures that this will work
-        # by exporting all the env vars needed to initialize the process group
-        torch.distributed.init_process_group(backend=args.backend)
-
-        for i in range(state.epoch, state.total_num_epochs):
-            for batch in iter(state.dataset):
-                train(batch, state.model)
-
-            state.epoch += 1
-            save_checkpoint(state)
-
-For concrete examples of torchelastic-compliant train scripts, visit
-our `examples `_ page.
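The expository script above assumes ``load_checkpoint(path)`` and ``save_checkpoint(path)``
helpers without defining them. Below is a minimal sketch of what such helpers might look like,
built on plain ``torch.save``/``torch.load``. The ``State`` container, the checkpoint layout,
the use of ``LOCAL_RANK`` to pick a single writer per node, and the exact signatures (which
differ slightly from the script above) are illustrative assumptions, not torchelastic APIs.

.. code-block:: python

    import os

    import torch


    class State:
        """Illustrative container for everything the training loop needs to resume."""

        def __init__(self, model, optimizer, epoch=0):
            self.model = model
            self.optimizer = optimizer
            self.epoch = epoch


    def save_checkpoint(state, path):
        # Let a single worker per node write the checkpoint to avoid clobbering.
        if int(os.environ["LOCAL_RANK"]) == 0:
            torch.save(
                {
                    "model": state.model.state_dict(),
                    "optimizer": state.optimizer.state_dict(),
                    "epoch": state.epoch,
                },
                path,
            )


    def load_checkpoint(state, path):
        # Resume from the last checkpoint if one exists; otherwise start from scratch.
        if os.path.exists(path):
            snapshot = torch.load(path, map_location="cpu")
            state.model.load_state_dict(snapshot["model"])
            state.optimizer.load_state_dict(snapshot["optimizer"])
            state.epoch = snapshot["epoch"]
        return state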