diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yaml index b84bb1780..8c855400b 100644 --- a/.github/workflows/doc-build.yaml +++ b/.github/workflows/doc-build.yaml @@ -42,6 +42,17 @@ jobs: run: | cd docs make papermill + - name: Coverage + run: | + set -ex + cd docs + make coverage + if [ "$(cat build/*/coverage/python.txt | wc -l)" -ne 2 ] + then + cat build/*/coverage/python.txt + echo "missing documentation coverage!" + exit 1 + fi docpush: runs-on: ubuntu-18.04 diff --git a/docs/source/app_best_practices.rst b/docs/source/app_best_practices.rst index a587786ee..29f4fb3b1 100644 --- a/docs/source/app_best_practices.rst +++ b/docs/source/app_best_practices.rst @@ -111,7 +111,7 @@ model definition from a python file and then you'll load the weights and state dict from a ``.ckpt`` or ``.pt`` file. This is how Pytorch Lightning's -`ModelCheckpoint `__ hook works. +`ModelCheckpoint `__ hook works. This is the most common but makes it harder to make a reusable app since your trainer app needs to include the model definition code. diff --git a/docs/source/basics.rst b/docs/source/basics.rst index b4b5688ed..04583014d 100644 --- a/docs/source/basics.rst +++ b/docs/source/basics.rst @@ -10,11 +10,12 @@ The top level modules in TorchX are: 1. :mod:`torchx.specs`: application spec (job definition) APIs 2. :mod:`torchx.components`: predefined (builtin) app specs -3. :mod:`torchx.runner`: given an app spec, submits the app as a job on a scheduler -4. :mod:`torchx.schedulers`: backend job schedulers that the runner supports -5. :mod:`torchx.pipelines`: adapters that convert the given app spec to a "stage" in an ML pipeline platform -6. :mod:`torchx.runtime`: util and abstraction libraries you can use in authoring apps (not app spec) -7. :mod:`torchx.cli`: CLI tool +3. :mod:`torchx.workspace`: handles patching images for remote execution +4. :mod:`torchx.cli`: CLI tool +5. 
:mod:`torchx.runner`: given an app spec, submits the app as a job on a scheduler +6. :mod:`torchx.schedulers`: backend job schedulers that the runner supports +7. :mod:`torchx.pipelines`: adapters that convert the given app spec to a "stage" in an ML pipeline platform +8. :mod:`torchx.runtime`: util and abstraction libraries you can use in authoring apps (not app spec) Below is a UML diagram diff --git a/docs/source/components/utils.rst b/docs/source/components/utils.rst index fcc001e3f..d4ad5b93d 100644 --- a/docs/source/components/utils.rst +++ b/docs/source/components/utils.rst @@ -4,9 +4,10 @@ Utils .. automodule:: torchx.components.utils .. currentmodule:: torchx.components.utils -.. autofunction:: torchx.components.utils.echo -.. autofunction:: torchx.components.utils.touch -.. autofunction:: torchx.components.utils.sh -.. autofunction:: torchx.components.utils.copy -.. autofunction:: torchx.components.utils.python -.. autofunction:: torchx.components.utils.booth +.. autofunction:: echo +.. autofunction:: touch +.. autofunction:: sh +.. autofunction:: copy +.. autofunction:: python +.. autofunction:: booth +.. autofunction:: binary diff --git a/docs/source/conf.py b/docs/source/conf.py index 6e6cd658c..0e297fd6a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,12 @@ "IPython.sphinxext.ipython_console_highlighting", ] +# coverage options + +coverage_ignore_modules = [ + "torchx.components.component_test_base", +] + # katex options # # diff --git a/docs/source/index.rst b/docs/source/index.rst index 244e97cbd..d8e5de945 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,7 +16,7 @@ most unique applications can be serviced without customizing the whole vertical **GETTING STARTED?** First learn the :ref:`basic concepts` and -follow the :ref:`quickstart guide`. +follow the :ref:`quickstart guide`. .. 
image:: torchx_index_diag.png @@ -47,8 +47,37 @@ Documentation quickstart.md cli + runner.config + advanced + +Works With +--------------- + +.. _Schedulers: +.. toctree:: + :maxdepth: 1 + :caption: Schedulers + + schedulers/local + schedulers/docker + schedulers/kubernetes + schedulers/slurm + schedulers/ray + schedulers/aws_batch + +.. _Pipelines: +.. toctree:: + :maxdepth: 1 + :caption: Pipelines + + pipelines/kfp + + +Examples +------------ + .. toctree:: :maxdepth: 1 :caption: Examples @@ -58,6 +87,7 @@ Documentation examples_pipelines/index + Components Library --------------------- .. _Components: @@ -85,28 +115,6 @@ Runtime Library runtime/tracking -Works With ---------------- - -.. _Schedulers: -.. toctree:: - :maxdepth: 1 - :caption: Schedulers - - schedulers/local - schedulers/kubernetes - schedulers/slurm - schedulers/ray - schedulers/aws_batch - -.. _Pipelines: -.. toctree:: - :maxdepth: 1 - :caption: Pipelines - - pipelines/kfp - - Reference ----------- @@ -118,6 +126,7 @@ Reference specs runner schedulers + workspace pipelines .. toctree:: @@ -126,15 +135,3 @@ Reference app_best_practices component_best_practices - - -Experimental ---------------- -.. toctree:: - :maxdepth: 1 - :caption: Experimental Features - - experimental/runner.config - - - diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index 174f6b8cf..cf3a45501 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -12,7 +12,7 @@ jupyter: name: python3 --- -# Quickstart +# Quickstart - Custom Components This is a self contained guide on how to build a simple app and component spec and launch it via two different schedulers. 
diff --git a/docs/source/experimental/runner.config.rst b/docs/source/runner.config.rst similarity index 62% rename from docs/source/experimental/runner.config.rst rename to docs/source/runner.config.rst index 76671b821..ed760296a 100644 --- a/docs/source/experimental/runner.config.rst +++ b/docs/source/runner.config.rst @@ -1,4 +1,4 @@ -(beta) .torchxconfig file +.torchxconfig ----------------------------- .. automodule:: torchx.runner.config @@ -10,3 +10,7 @@ Config API Functions .. autofunction:: apply .. autofunction:: load .. autofunction:: dump +.. autofunction:: find_configs +.. autofunction:: get_configs +.. autofunction:: get_config +.. autofunction:: load_sections diff --git a/docs/source/runtime/hpo.rst b/docs/source/runtime/hpo.rst index 3495bfd53..af26d513f 100644 --- a/docs/source/runtime/hpo.rst +++ b/docs/source/runtime/hpo.rst @@ -14,4 +14,7 @@ Ax (Adaptive Experimentation) .. currentmodule:: torchx.runtime.hpo.ax .. autoclass:: TorchXRunner + :members: + .. autoclass:: AppMetric + :members: diff --git a/docs/source/schedulers/aws_batch.rst b/docs/source/schedulers/aws_batch.rst index ec2f65410..7c50a6f67 100644 --- a/docs/source/schedulers/aws_batch.rst +++ b/docs/source/schedulers/aws_batch.rst @@ -2,7 +2,17 @@ AWS Batch ================= .. automodule:: torchx.schedulers.aws_batch_scheduler + .. currentmodule:: torchx.schedulers.aws_batch_scheduler .. autoclass:: AWSBatchScheduler :members: + :show-inheritance: + +.. autoclass:: BatchJob + :members: + +Reference +~~~~~~~~~~~~ + +.. autofunction:: create_scheduler diff --git a/docs/source/schedulers/docker.rst b/docs/source/schedulers/docker.rst new file mode 100644 index 000000000..ec0558cd3 --- /dev/null +++ b/docs/source/schedulers/docker.rst @@ -0,0 +1,23 @@ +Docker +================= + +.. automodule:: torchx.schedulers.docker_scheduler + +.. currentmodule:: torchx.schedulers.docker_scheduler + +.. autoclass:: DockerScheduler + :members: + :show-inheritance: + +.. 
autoclass:: DockerJob + :members: + +Reference +~~~~~~~~~~~~ + +.. autofunction:: create_scheduler + +.. autoclass:: DockerContainer + :members: + +.. autofunction:: has_docker diff --git a/docs/source/schedulers/kubernetes.rst b/docs/source/schedulers/kubernetes.rst index eacff62d9..e09a75678 100644 --- a/docs/source/schedulers/kubernetes.rst +++ b/docs/source/schedulers/kubernetes.rst @@ -2,8 +2,22 @@ Kubernetes ================= .. automodule:: torchx.schedulers.kubernetes_scheduler + .. currentmodule:: torchx.schedulers.kubernetes_scheduler .. autoclass:: KubernetesScheduler :members: + :show-inheritance: + +.. autoclass:: KubernetesJob + :members: + +Reference +~~~~~~~~~~~~ +.. autofunction:: create_scheduler +.. autofunction:: app_to_resource +.. autofunction:: cleanup_str +.. autofunction:: pod_labels +.. autofunction:: role_to_pod +.. autofunction:: sanitize_for_serialization diff --git a/docs/source/schedulers/local.rst b/docs/source/schedulers/local.rst index 0f735a08e..bdef5f837 100644 --- a/docs/source/schedulers/local.rst +++ b/docs/source/schedulers/local.rst @@ -2,24 +2,38 @@ Local ================= .. automodule:: torchx.schedulers.local_scheduler + .. currentmodule:: torchx.schedulers.local_scheduler .. autoclass:: LocalScheduler :members: - -.. automodule:: torchx.schedulers.docker_scheduler -.. currentmodule:: torchx.schedulers.docker_scheduler - -.. autoclass:: DockerScheduler - :members: + :show-inheritance: Image Providers ~~~~~~~~~~~~~~~~~ -.. currentmodule:: torchx.schedulers.local_scheduler - .. autoclass:: ImageProvider :members: .. autoclass:: CWDImageProvider :members: + +.. autoclass:: LocalDirectoryImageProvider + :members: + +Reference +~~~~~~~~~~~~ + +.. autofunction:: create_cwd_scheduler + +.. autoclass:: LogIterator + :members: + +.. autoclass:: PopenRequest + :members: + +.. autoclass:: ReplicaParam + :members: + +.. 
autoclass:: SignalException + :members: diff --git a/docs/source/schedulers/ray.rst b/docs/source/schedulers/ray.rst index 9095f8541..b6e53f382 100644 --- a/docs/source/schedulers/ray.rst +++ b/docs/source/schedulers/ray.rst @@ -2,7 +2,16 @@ Ray ================= .. automodule:: torchx.schedulers.ray_scheduler + .. currentmodule:: torchx.schedulers.ray_scheduler .. autoclass:: RayScheduler :members: + :show-inheritance: + +.. autofunction:: create_scheduler +.. autofunction:: has_ray +.. autofunction:: serialize + +.. autoclass:: RayJob + :members: diff --git a/docs/source/schedulers/slurm.rst b/docs/source/schedulers/slurm.rst index afac06714..c6019e04f 100644 --- a/docs/source/schedulers/slurm.rst +++ b/docs/source/schedulers/slurm.rst @@ -2,7 +2,17 @@ Slurm ================= .. automodule:: torchx.schedulers.slurm_scheduler + .. currentmodule:: torchx.schedulers.slurm_scheduler .. autoclass:: SlurmScheduler :members: + :show-inheritance: + +.. autofunction:: create_scheduler + +.. autoclass:: SlurmBatchRequest + :members: + +.. autoclass:: SlurmReplicaRequest + :members: diff --git a/docs/source/specs.rst b/docs/source/specs.rst index ada17156a..623a43965 100644 --- a/docs/source/specs.rst +++ b/docs/source/specs.rst @@ -24,14 +24,22 @@ Resource .. autoclass:: Resource :members: +.. autofunction:: resource + .. autofunction:: get_named_resources + AWS Named Resources ^^^^^^^^^^^^^^^^^^^^^ .. automodule:: torchx.specs.named_resources_aws .. currentmodule:: torchx.specs.named_resources_aws +.. autofunction:: aws_m5_2xlarge +.. autofunction:: aws_p3_2xlarge +.. autofunction:: aws_p3_8xlarge +.. autofunction:: aws_t3_medium + Macros ------------ .. currentmodule:: torchx.specs @@ -60,4 +68,23 @@ Component Linter .. automodule:: torchx.specs.file_linter .. currentmodule:: torchx.specs.file_linter -.. autofunction:: validate \ No newline at end of file +.. autofunction:: validate +.. autofunction:: get_fn_docstring + +.. autoclass:: LinterMessage + :members: + +.. 
autoclass:: TorchFunctionVisitor + :members: + +.. autoclass:: TorchXArgumentHelpFormatter + :members: + +.. autoclass:: TorchxFunctionArgsValidator + :members: + +.. autoclass:: TorchxFunctionValidator + :members: + +.. autoclass:: TorchxReturnValidator + :members: diff --git a/docs/source/workspace.rst b/docs/source/workspace.rst new file mode 100644 index 000000000..d8d851bd0 --- /dev/null +++ b/docs/source/workspace.rst @@ -0,0 +1,21 @@ +torchx.workspace +================ + +.. automodule:: torchx.workspace + :show-inheritance: + +.. currentmodule:: torchx.workspace + +.. autoclass:: Workspace + :members: + +torchx.workspace.docker_workspace +####################################### + + +.. automodule:: torchx.workspace.docker_workspace +.. currentmodule:: torchx.workspace.docker_workspace + +.. autoclass:: DockerWorkspace + :members: + :private-members: _update_app_images, _push_images diff --git a/torchx/runner/config.py b/torchx/runner/config.py index acd542eee..0702aae09 100644 --- a/torchx/runner/config.py +++ b/torchx/runner/config.py @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. """ +Status: Beta + You can store the scheduler run cfg (run configs) for your project by storing them in the ``.torchxconfig`` file. Currently this file is only read and honored when running the component from the CLI. diff --git a/torchx/schedulers/__init__.py b/torchx/schedulers/__init__.py index 63128269b..f4d321042 100644 --- a/torchx/schedulers/__init__.py +++ b/torchx/schedulers/__init__.py @@ -23,7 +23,7 @@ def __call__(self, session_name: str, **kwargs: object) -> Scheduler: ... 
-def try_get_ray_scheduler() -> Optional[SchedulerFactory]: +def _try_get_ray_scheduler() -> Optional[SchedulerFactory]: try: from torchx.schedulers.ray_scheduler import _has_ray # @manual @@ -52,7 +52,7 @@ def get_scheduler_factories() -> Dict[str, SchedulerFactory]: "aws_batch": aws_batch_scheduler.create_scheduler, } - ray_scheduler_creator = try_get_ray_scheduler() + ray_scheduler_creator = _try_get_ray_scheduler() if ray_scheduler_creator: default_schedulers["ray"] = ray_scheduler_creator diff --git a/torchx/schedulers/aws_batch_scheduler.py b/torchx/schedulers/aws_batch_scheduler.py index 0dc2c3062..031ef7a0a 100644 --- a/torchx/schedulers/aws_batch_scheduler.py +++ b/torchx/schedulers/aws_batch_scheduler.py @@ -74,7 +74,7 @@ } -def role_to_node_properties(idx: int, role: Role) -> Dict[str, object]: +def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]: resource = role.resource reqs = [] cpu = resource.cpu @@ -244,7 +244,7 @@ def _submit_dryrun( # localhost for rank0. # See: https://docs.aws.amazon.com/batch/latest/userguide/job_env_vars.html replica_role.env["TORCHX_RANK0_HOST"] = "localhost" - nodes.append(role_to_node_properties(rank, replica_role)) + nodes.append(_role_to_node_properties(rank, replica_role)) req = BatchJob( name=name, diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index 5efd0537d..bab42a865 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -95,7 +95,7 @@ def _terminate_process_handler(signum: int, frame: FrameType) -> None: @dataclass class ReplicaParam: """ - Holds ``LocalScheduler._popen()``parameters for each replica of the role. + Holds ``LocalScheduler._popen()`` parameters for each replica of the role. 
""" args: List[str] @@ -452,7 +452,7 @@ def __repr__(self) -> str: return f"{{app_id:{self.id}, state:{self.state}, pid_map:{role_to_pid}}}" -def join_PATH(*paths: Optional[str]) -> str: +def _join_PATH(*paths: Optional[str]) -> str: """ Joins strings that go in the PATH env var. Deals with empty strings and None-types, making sure no leading @@ -492,7 +492,7 @@ class PopenRequest: role_log_dirs: Dict[RoleName, List[str]] -def register_termination_signals() -> None: +def _register_termination_signals() -> None: """ Register SIGTERM and SIGINT handlers only for the main thread. """ @@ -569,7 +569,7 @@ def __init__( if cache_size <= 0: raise ValueError("cache size must be greater than zero") self._cache_size = cache_size - register_termination_signals() + _register_termination_signals() self._extra_paths: List[str] = extra_paths or [] @@ -679,7 +679,7 @@ def _popen( env.update(replica_params.env) # prepend extra_paths to PATH - env["PATH"] = join_PATH(*self._extra_paths, env.get("PATH")) + env["PATH"] = _join_PATH(*self._extra_paths, env.get("PATH")) cwd = replica_params.cwd if cwd: @@ -688,9 +688,9 @@ def _popen( # otherwise append cwd to PATH so that the binaries in PATH # precede over those in cwd if prepend_cwd: - env["PATH"] = join_PATH(cwd, env.get("PATH")) + env["PATH"] = _join_PATH(cwd, env.get("PATH")) else: - env["PATH"] = join_PATH(env.get("PATH"), cwd) + env["PATH"] = _join_PATH(env.get("PATH"), cwd) # default to unbuffered python for faster responsiveness locally env.setdefault("PYTHONUNBUFFERED", "x") diff --git a/torchx/schedulers/ray_scheduler.py b/torchx/schedulers/ray_scheduler.py index 0ffb22502..3800dbe49 100644 --- a/torchx/schedulers/ray_scheduler.py +++ b/torchx/schedulers/ray_scheduler.py @@ -273,6 +273,11 @@ def _validate(self, app: AppDef, scheduler: SchedulerBackend) -> None: break def wait_until_finish(self, app_id: str, timeout: int = 30) -> None: + """ + ``wait_until_finish`` waits until the specified job has finished + with a given 
timeout. This is intended for testing. Programmatic + usage should use the runner wait method instead. + """ addr, app_id = app_id.split("-") client = JobSubmissionClient(f"http://{addr}") diff --git a/torchx/schedulers/slurm_scheduler.py b/torchx/schedulers/slurm_scheduler.py index cd41f0c84..125da5f70 100644 --- a/torchx/schedulers/slurm_scheduler.py +++ b/torchx/schedulers/slurm_scheduler.py @@ -92,6 +92,10 @@ class SlurmReplicaRequest: def from_role( cls, name: str, role: Role, cfg: Mapping[str, CfgVal] ) -> "SlurmReplicaRequest": + """ + ``from_role`` creates a SlurmReplicaRequest for the specific role and + name. + """ sbatch_opts = {} for k, v in cfg.items(): if v is None: diff --git a/torchx/schedulers/test/local_scheduler_test.py b/torchx/schedulers/test/local_scheduler_test.py index 63cbec443..7bdfbc20d 100644 --- a/torchx/schedulers/test/local_scheduler_test.py +++ b/torchx/schedulers/test/local_scheduler_test.py @@ -31,7 +31,7 @@ LocalScheduler, ReplicaParam, create_cwd_scheduler, - join_PATH, + _join_PATH, make_unique, ) from torchx.specs.api import AppDef, AppState, Role, is_terminal, macros, Resource @@ -973,20 +973,20 @@ def _test_orphan_workflow(self) -> None: class JoinPATHTest(unittest.TestCase): def test_join_PATH(self) -> None: - self.assertEqual("", join_PATH(None)) - self.assertEqual("", join_PATH("")) - self.assertEqual("", join_PATH("", None)) - self.assertEqual("/usr/local/bin", join_PATH("/usr/local/bin", "")) - self.assertEqual("/usr/local/bin", join_PATH("/usr/local/bin", None)) - self.assertEqual("/usr/local/bin", join_PATH("", "/usr/local/bin")) - self.assertEqual("/usr/local/bin", join_PATH(None, "/usr/local/bin")) + self.assertEqual("", _join_PATH(None)) + self.assertEqual("", _join_PATH("")) + self.assertEqual("", _join_PATH("", None)) + self.assertEqual("/usr/local/bin", _join_PATH("/usr/local/bin", "")) + self.assertEqual("/usr/local/bin", _join_PATH("/usr/local/bin", None)) + self.assertEqual("/usr/local/bin", _join_PATH("", 
"/usr/local/bin")) + self.assertEqual("/usr/local/bin", _join_PATH(None, "/usr/local/bin")) path = ":/usr/bin:/bin:" self.assertEqual( - "/usr/local/bin:/usr/bin:/bin", join_PATH("/usr/local/bin", path) + "/usr/local/bin:/usr/bin:/bin", _join_PATH("/usr/local/bin", path) ) self.assertEqual( - "/usr/bin:/bin:/usr/local/bin", join_PATH(path, "/usr/local/bin") + "/usr/bin:/bin:/usr/local/bin", _join_PATH(path, "/usr/local/bin") ) diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py index b26614057..d22fe294c 100644 --- a/torchx/specs/__init__.py +++ b/torchx/specs/__init__.py @@ -93,11 +93,12 @@ def resource( Example: .. code-block:: python - resource(cpu=1) # returns Resource(cpu=1) - resource(named_resource="foobar") # returns registered named resource "foo" - resource(cpu=1, named_resource="foobar") # returns registered named resource "foo" (cpu=1 ignored) - resource() # returns default resource values - resource(cpu=None, gpu=None, memMB=None) # throws + + resource(cpu=1) # returns Resource(cpu=1) + resource(named_resource="foobar") # returns registered named resource "foo" + resource(cpu=1, named_resource="foobar") # returns registered named resource "foo" (cpu=1 ignored) + resource() # returns default resource values + resource(cpu=None, gpu=None, memMB=None) # throws """ if h: diff --git a/torchx/specs/file_linter.py b/torchx/specs/file_linter.py index 8f6ccbba2..061762a71 100644 --- a/torchx/specs/file_linter.py +++ b/torchx/specs/file_linter.py @@ -94,6 +94,9 @@ class LinterMessage: class TorchxFunctionValidator(abc.ABC): @abc.abstractmethod def validate(self, app_specs_func_def: ast.FunctionDef) -> List[LinterMessage]: + """ + Method to call to validate the provided function def. 
+ """ raise NotImplementedError() def _gen_linter_message(self, description: str, lineno: int) -> LinterMessage: @@ -244,11 +247,14 @@ def __init__(self, component_function_name: str) -> None: self.visited_function = False def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + """ + Validates the function def with the child validators. + """ if node.name != self.component_function_name: return self.visited_function = True - for validatior in self.validators: - self.linter_errors += validatior.validate(node) + for validator in self.validators: + self.linter_errors += validator.validate(node) def validate(path: str, component_function: str) -> List[LinterMessage]: diff --git a/torchx/workspace/__init__.py b/torchx/workspace/__init__.py index b871b6a0f..2b48db3b0 100644 --- a/torchx/workspace/__init__.py +++ b/torchx/workspace/__init__.py @@ -4,4 +4,20 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +""" +Status: Beta + +Workspaces are used to apply local changes on top of existing images so you can +execute your code on a remote cluster. This module contains the interfaces used +by workspace implementations. + +These workspaces are defined as an ``fsspec`` path; the directories and +files under that path will be used to generate a patch. 
+ +Example workspace paths: + + * ``file://.`` the current working directory + * ``memory://foo-bar/`` an in-memory workspace for notebook/programmatic usage +""" + from torchx.workspace.api import Workspace # noqa: F401 diff --git a/torchx/workspace/api.py b/torchx/workspace/api.py index 8dd593471..d5bd868fb 100644 --- a/torchx/workspace/api.py +++ b/torchx/workspace/api.py @@ -19,7 +19,7 @@ class Workspace(abc.ABC): automatically rebuild images or generate diff patches that are applied to the ``Role``, allowing the user to make local code changes to the application and having those changes be reflected - (either through a new image or an overlayed patch) at runtime + (either through a new image or an overlaid patch) at runtime without a manual image rebuild. The exact semantics of what the workspace build artifact is, is implementation dependent. """ @@ -29,7 +29,7 @@ def build_workspace_and_update_role(self, role: Role, workspace: str) -> None: """ Builds the specified ``workspace`` with respect to ``img`` and updates the ``role`` to reflect the built workspace artifacts. - In the simplest case, this method builds a new image and udpates + In the simplest case, this method builds a new image and updates the role's image. Certain (more efficient) implementations build incremental diff patches that overlay on top of the role's image. diff --git a/torchx/workspace/docker_workspace.py b/torchx/workspace/docker_workspace.py index 4d605ad77..252abf6f2 100644 --- a/torchx/workspace/docker_workspace.py +++ b/torchx/workspace/docker_workspace.py @@ -25,7 +25,18 @@ class DockerWorkspace(Workspace): """ - DockerWorkspace will build patched docker images from the workspace. + DockerWorkspace will build patched docker images from the workspace. These + patched images are docker images and can be either used locally via the + docker daemon or pushed using the helper methods to a remote repository for + remote jobs. 
+ + This requires a running docker daemon locally and for remote pushing + requires being authenticated to those repositories via ``docker login``. + + See more: + + * https://docs.docker.com/engine/reference/commandline/login/ + * https://docs.docker.com/get-docker/ """ LABEL_VERSION: str = "torchx.pytorch.org/version" @@ -73,6 +84,17 @@ def build_workspace_and_update_role(self, role: Role, workspace: str) -> None: def _update_app_images( self, app: AppDef, cfg: Mapping[str, CfgVal] ) -> Dict[str, Tuple[str, str]]: + """ + _update_app_images replaces the local Docker images (identified via + ``sha256:...``) in the provided ``AppDef`` with the remote path that they will be uploaded to and + returns a mapping of local to remote names. + + ``_push_images`` must be called with the returned mapping before + launching the job. + + Returns: + A dict of [local image name, (remote repo, tag)]. + """ HASH_PREFIX = "sha256:" images_to_push = {} @@ -95,6 +117,15 @@ def _update_app_images( return images_to_push def _push_images(self, images_to_push: Dict[str, Tuple[str, str]]) -> None: + """ + _push_images pushes the specified images to the remote container + repository with the specified tag. The docker daemon must be + authenticated to the remote repository using ``docker login``. + + Args: + images_to_push: A dict of [local image name, (remote repo, tag)]. + """ + if len(images_to_push) == 0: return