From f479809f39c0ea05c4e6eba68db979e8d9fead8e Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Thu, 13 May 2021 12:19:51 -0700 Subject: [PATCH] add support for jetter to Role (base_image) for mast launches Summary: 1. Adds `ml_image` buck macro 2. Adds `--run_path` option to `torch.distributed.run` 3. Adds `tsm/driver/fb/test/patched/foo` (for unittesting) 4. Changes to `distributed_sum` to use `ml_image` (see Test plan for how this was tested in local and mast) NOTE: need to enable jetter for flow and local schedulers (will do this on a separate diff since this diff is already really big) Differential Revision: D28421033 fbshipit-source-id: ef1e4e53a337114276d35af66b92b95dd40b11f5 --- torchelastic/tsm/driver/api.py | 42 ++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/torchelastic/tsm/driver/api.py b/torchelastic/tsm/driver/api.py index d16644a8..7cbf4c5b 100644 --- a/torchelastic/tsm/driver/api.py +++ b/torchelastic/tsm/driver/api.py @@ -92,6 +92,17 @@ class Container: A ``Resource`` can be bound to a specific scheduler backend or ``SchedulerBackend.ALL`` (default) to specify that the same ``Resource`` is to be used for all schedulers. + An optional ``base_image`` can be specified if the scheduler supports a + concept of base images. For schedulers that run Docker containers the + base image is not useful since the application image itself can be + built from a base image (using the ``FROM base/image:latest`` construct in + the Dockerfile). However the base image is useful for schedulers that + work with simple image artifacts (e.g. ``*.tar.gz``) that do not have a built-in + concept of base images. For these schedulers, specifying a base image that + includes dependencies while the main image is the actual application code + makes it possible to make changes to the application code without incurring + the cost of re-building the uber artifact. 
+ Usage: :: @@ -106,9 +117,13 @@ class Container: .require(Resource(cpu=1, gpu=1, memMB=500), "custom_scheduler") .ports(tcp_store=8080, tensorboard=8081) + # for schedulers that support base_images + my_container = Container(image="my/trainer:1", base_image="common/ml-tools:latest") + .require(...) """ image: str + base_image: Optional[str] = None resources: Resource = NULL_RESOURCE port_map: Dict[str, int] = field(default_factory=dict) @@ -130,6 +145,9 @@ def ports(self, **kwargs: int) -> "Container": # sentinel value used to represent missing string attributes, such as image or entrypoint MISSING: str = "" +# sentinel value used to represent "unset" optional string attributes +NONE: str = "" + # sentinel value used as the "zero" element in the container group NULL_CONTAINER: Container = Container(image=MISSING) @@ -141,9 +159,11 @@ class macros: Available macros: - 1. ``img_root`` - root directory of the pulled image on the container - 2. ``app_id`` - application id as assigned by the scheduler - 3. ``replica_id`` - unique id for each instance of a replica of a Role, + 1. ``img_root`` - root directory of the pulled container.image + 2. ``base_img_root`` - root directory of the pulled container.base_image + (resolves to "" if no base_image set) + 3. ``app_id`` - application id as assigned by the scheduler + 4. ``replica_id`` - unique id for each instance of a replica of a Role, for instance a role with 3 replicas could have the 0, 1, 2 as replica ids. 
Note that when the container fails and is replaced, the new container will have the same ``replica_id`` @@ -163,15 +183,25 @@ class macros: """ img_root = "${img_root}" + base_img_root = "${base_img_root}" app_id = "${app_id}" replica_id = "${replica_id}" @staticmethod - def substitute(args: List[str], img_root: str, app_id: str, replica_id: str): + def substitute( + args: List[str], + img_root: str, + app_id: str, + replica_id: str, + base_img_root: str = NONE, + ): args_sub = [] for arg in args: sub = Template(arg).safe_substitute( - img_root=img_root, app_id=app_id, replica_id=replica_id + img_root=img_root, + app_id=app_id, + replica_id=replica_id, + base_img_root=base_img_root, ) args_sub.append(sub) return args_sub @@ -419,8 +449,6 @@ def is_terminal(state: AppState) -> bool: return state in _TERMINAL_STATES -NONE: str = "" - # ======================= # ==== Status API ======= # =======================