From 3f6cf1e39ff4563e4bb6255d955fa822eee7d84c Mon Sep 17 00:00:00 2001 From: Kevin Date: Fri, 3 Mar 2023 14:07:22 -0500 Subject: [PATCH 1/4] create classes for submitting and watching DDP jobs Signed-off-by: Kevin --- src/codeflare_sdk/cluster/cluster.py | 16 +++- src/codeflare_sdk/job/__init__.py | 0 src/codeflare_sdk/job/jobs.py | 124 +++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/codeflare_sdk/job/__init__.py create mode 100644 src/codeflare_sdk/job/jobs.py diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index b98eeb54..80fcd869 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -20,7 +20,7 @@ from os import stat from time import sleep -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict import openshift as oc from ray.job_submission import JobSubmissionClient @@ -45,6 +45,8 @@ class Cluster: Note that currently, the underlying implementation is a Ray cluster. """ + torchx_scheduler = "ray" + def __init__(self, config: ClusterConfiguration): """ Create the resource cluster object by passing in a ClusterConfiguration @@ -268,6 +270,18 @@ def job_logs(self, job_id: str) -> str: client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id) + def torchx_config(self, working_dir: str = None, requirements: str = None) -> Dict[str, str]: + dashboard_address = f"{self.cluster_dashboard_uri().lstrip('http://')}" + to_return = { + "cluster_name": self.config.name, + "dashboard_address": dashboard_address, + } + if working_dir: + to_return["working_dir"] = working_dir + if requirements: + to_return["requirements"] = requirements + return to_return + def get_current_namespace() -> str: """ diff --git a/src/codeflare_sdk/job/__init__.py b/src/codeflare_sdk/job/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py new file mode 100644 index 00000000..6fa12e94 --- /dev/null +++ b/src/codeflare_sdk/job/jobs.py @@ -0,0 +1,124 @@ +# Copyright 2023 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import abc +from typing import TYPE_CHECKING, Optional, Dict, List +from pathlib import Path + +from torchx.components.dist import ddp +from torchx.runner import get_runner +from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo + +if TYPE_CHECKING: + from ..cluster.cluster import Cluster + +all_jobs: List["Job"] = [] +torchx_runner = get_runner() + +class JobDefinition(metaclass=abc.ABCMeta): + def _dry_run(self, cluster: "Cluster"): + pass + + def submit(self, cluster: "Cluster"): + pass + + +class Job(metaclass=abc.ABCMeta): + def status(self): + pass + + def logs(self): + pass + + +class DDPJobDefinition(JobDefinition): + + def __init__( + self, + script: Optional[str] = None, + m: Optional[str]=None, + script_args: Optional[List[str]] = None, + name: Optional[str] = None, + cpu: Optional[int] = None, + gpu: Optional[int] = None, + memMB: Optional[int] = None, + h: Optional[str] = None, + j: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + max_retries: int = 0, + mounts: Optional[List[str]] = None, + rdzv_port: int = 29500, + scheduler_args: Optional[Dict[str, str]] = None, + ): + if bool(script) == bool(m): # logical XOR + raise ValueError("Exactly one of the following arguments must be defined: [script, m].") + self.script = script + self.m=m + self.script_args: List[str] = script_args if script_args is not None else [] + self.name = name + self.cpu = cpu + self.gpu = gpu + self.memMB = memMB + self.h = h + self.j = j + self.env: Dict[str, str] = env if env is not None else dict() + self.max_retries = max_retries + self.mounts: List[str] = mounts if mounts is not None else [] + self.rdzv_port = rdzv_port + self.scheduler_args: Dict[str, str] = scheduler_args if scheduler_args is not None else dict() + + def _dry_run(self, cluster: "Cluster"): + j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. 
= # of gpus + return torchx_runner.dryrun( + app=ddp( + *self.script_args, + script=self.script, + m=self.m, + name=self.name, + h=self.h, + cpu=self.cpu if self.cpu is not None else cluster.config.max_cpus, + gpu=self.gpu if self.gpu is not None else cluster.config.gpu, + memMB=self.memMB if self.memMB is not None else cluster.config.max_memory * 1024, + j=self.j if self.j is not None else j, + env=self.env, + max_retries=self.max_retries, + rdzv_port=self.rdzv_port, + mounts=self.mounts, + ), + scheduler=cluster.torchx_scheduler, + cfg=cluster.torchx_config(**self.scheduler_args), + workspace=f"file://{Path.cwd()}" + ) + + def submit(self, cluster: "Cluster") -> "Job": + return DDPJob(self, cluster) + + +class DDPJob(Job): + def __init__( + self, + job_definition: "DDPJobDefinition", + cluster: "Cluster" + ): + self.job_definition = job_definition + self.cluster = cluster + self._app_handle = torchx_runner.schedule(job_definition._dry_run(cluster)) + all_jobs.append(self) + + def status(self) -> str: + return torchx_runner.status(self._app_handle) + + def logs(self) -> str: + return "".join(torchx_runner.log_lines(self._app_handle, None)) From 60ef21d3a641dc6b42e214c5b2dd3d8f208cd333 Mon Sep 17 00:00:00 2001 From: Kevin Date: Fri, 3 Mar 2023 16:10:58 -0500 Subject: [PATCH 2/4] update demo notebook to use new job class Signed-off-by: Kevin --- demo-notebooks/batch-job/batch_mnist.ipynb | 1873 +------------------- src/codeflare_sdk/cluster/cluster.py | 4 +- src/codeflare_sdk/job/jobs.py | 26 +- 3 files changed, 95 insertions(+), 1808 deletions(-) diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb index 0b0c25f6..4d434640 100644 --- a/demo-notebooks/batch-job/batch_mnist.ipynb +++ b/demo-notebooks/batch-job/batch_mnist.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "614daa0c", "metadata": {}, "outputs": [], @@ -38,10 +38,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "0f4bc870-091f-4e11-9642-cba145710159", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to: mnisttest.yaml\n" + ] + } + ], "source": [ "# Create our cluster and submit appwrapper\n", "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" @@ -57,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", "metadata": {}, "outputs": [], @@ -77,50 +85,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "id": "3c1b4311-2e61-44c9-8225-87c2db11363d", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
╭─────────────────────────╮\n",
-       "│   🚀 List of CodeFlare  │\n",
-       "│   clusters in queue🚀   │\n",
-       "│ +-----------+---------+ │\n",
-       "│ | Name      | Status  | │\n",
-       "│ +===========+=========+ │\n",
-       "│ | mnisttest | pending | │\n",
-       "│ |           |         | │\n",
-       "│ +-----------+---------+ │\n",
-       "╰─────────────────────────╯\n",
-       "
\n" - ], - "text/plain": [ - "╭─────────────────────────╮\n", - "│ \u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare\u001b[0m\u001b[3m \u001b[0m │\n", - "│ \u001b[3m \u001b[0m\u001b[1;3mclusters in queue🚀\u001b[0m\u001b[3m \u001b[0m │\n", - "│ +-----------+---------+ │\n", - "│ |\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0m|\u001b[1m \u001b[0m\u001b[1mStatus \u001b[0m\u001b[1m \u001b[0m| │\n", - "│ +===========+=========+ │\n", - "│ |\u001b[36m \u001b[0m\u001b[36mmnisttest\u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35mpending\u001b[0m\u001b[35m \u001b[0m| │\n", - "│ |\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m|\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m| │\n", - "│ +-----------+---------+ │\n", - "╰─────────────────────────╯\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "(False, )" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "'Cluster' object has no attribute 'is_ready'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcluster\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_ready\u001b[49m()\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Cluster' object has no attribute 'is_ready'" + ] } ], "source": [ @@ -158,51 +136,37 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
                  🚀 List of CodeFlare clusters 🚀                  \n",
+       "
                   🚀 CodeFlare Cluster Status 🚀                   \n",
        "                                                                    \n",
        " ╭────────────────────────────────────────────────────────────────╮ \n",
-       " │   Owner                                                        │ \n",
+       " │   Name                                                         │ \n",
        " │   mnisttest                                        Active ✅   │ \n",
        " │                                                                │ \n",
        " │   URI: ray://mnisttest-head-svc.default.svc:10001              │ \n",
        " │                                                                │ \n",
-       " │   Dashboard🔗                                                  │ \n",
+       " │   Dashboard🔗                                                  │ \n",
        " │                                                                │ \n",
-       " │                      Cluster Resources                         │ \n",
-       " │   ╭─ Workers ──╮  ╭───────── Worker specs(each) ─────────╮     │ \n",
-       " │   │  Min  Max  │  │  Memory      CPU         GPU         │     │ \n",
-       " │   │            │  │                                      │     │ \n",
-       " │   │  2    2    │  │  16G~16G     8           4           │     │ \n",
-       " │   │            │  │                                      │     │ \n",
-       " │   ╰────────────╯  ╰──────────────────────────────────────╯     │ \n",
        " ╰────────────────────────────────────────────────────────────────╯ \n",
        "
\n" ], "text/plain": [ - "\u001b[3m \u001b[0m\u001b[1;3m 🚀 List of CodeFlare clusters 🚀\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[3m \u001b[0m\u001b[1;3m 🚀 CodeFlare Cluster Status 🚀\u001b[0m\u001b[3m \u001b[0m\n", "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", " ╭────────────────────────────────────────────────────────────────╮ \n", - " │ \u001b[1;37;42mOwner\u001b[0m │ \n", + " │ \u001b[1;37;42mName\u001b[0m │ \n", " │ \u001b[1;4mmnisttest\u001b[0m Active ✅ │ \n", " │ │ \n", " │ \u001b[1mURI:\u001b[0m ray://mnisttest-head-svc.default.svc:10001 │ \n", " │ │ \n", - " │ \u001b]8;id=309861;ray-dashboard-mnisttest-default.apps.prepfullinstall.psap.aws.rhperfscale.org\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", + " │ \u001b]8;id=464037;ray-dashboard-mnisttest-default.apps.kpostoffice.dev.datahub.redhat.com\u001b\\\u001b[4;34mDashboard🔗\u001b[0m\u001b]8;;\u001b\\ │ \n", " │ │ \n", - " │ \u001b[3m Cluster Resources \u001b[0m │ \n", - " │ ╭─ Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n", - " │ │ \u001b[1m \u001b[0m\u001b[1mMin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mMax\u001b[0m\u001b[1m \u001b[0m │ │ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m16G~16G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m4 \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m │ │ \n", - " │ ╰────────────╯ ╰──────────────────────────────────────╯ │ \n", " ╰────────────────────────────────────────────────────────────────╯ \n" ] }, @@ -212,10 +176,10 @@ { "data": { "text/plain": [ - "" + "(, True)" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -234,46 +198,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "id": "3cc6183a-8f6e-4347-af91-d088ed422544", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "environemnt before exec ddp from torchx {'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Checking for changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs`...\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m To disable workspaces pass: --workspace=\"\" from CLI or workspace=None programmatically.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Built new image `/tmp/torchx_workspace3c_d437b` based on original image `ghcr.io/pytorch/torchx:0.3.0dev0` and changes in workspace 
`file:///opt/app-root/src/codeflare/notebooks/jobs` for role[0]=mnist.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 WARNING \u001b[0m The Ray scheduler does not support port mapping.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Uploading package gcs://_ray_pkg_ce2c3e935774455d.zip.\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Creating a file package for local directory '/tmp/torchx_workspace3c_d437b'.\n", - "ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Launched app: ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m AppStatus:\n", - " msg: PENDING\n", - " num_restarts: -1\n", - " roles:\n", - " - replicas:\n", - " - hostname: \n", - " id: 0\n", - " role: ray\n", - " state: !!python/object/apply:torchx.specs.api.AppState\n", - " - 2\n", - " structured_error_msg: \n", - " role: ray\n", - " state: PENDING (2)\n", - " structured_error_msg: \n", - " ui_url: null\n", - "\n", - "\u001b[34mtorchx\u001b[0m \u001b[2m2022-11-04 15:04:31 INFO \u001b[0m Job URL: None\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ - "! torchx run -s ray -cfg dashboard_address=mnisttest-head-svc.default.svc:8265,requirements=requirements.txt dist.ddp -j 2x4 --gpu 4 --script mnist.py" + "from codeflare_sdk.job.jobs import DDPJobDefinition" ] }, { @@ -286,1726 +216,79 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 6, "id": "ced6ccd6-a17e-413a-a0e4-65004fc35463", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "{'mnist-jlm13hx5g53mk': JobInfo(status='SUCCEEDED', entrypoint='python3 ray_driver.py', message='Job finished successfully.', error_type=None, start_time=1667574271415, end_time=1667574616127, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_ce2c3e935774455d.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'e4ce38d001dbbe09cd21c497fedd03d692b2be3e'})}\n", - "\u001b[0m" + "The Ray scheduler does not support port mapping.\n" ] } ], "source": [ - "cluster.list_jobs()" + "job = DDPJobDefinition(script=\"mnist.py\", scheduler_args={\"requirements\": \"requirements.txt\"}).submit(cluster)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 7, "id": "e5c0b0da-c22e-4142-b096-407ac8aebe5e", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "\n", - "\u001b[32m-----------------------------------\u001b[39m\n", - "\u001b[32mJob 'mnist-jlm13hx5g53mk' succeeded\u001b[39m\n", - "\u001b[32m-----------------------------------\u001b[39m\n", - "\n", - "\u001b[0m" - ] + "data": { + "text/plain": [ + "AppStatus:\n", + " msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus\n", + " - FAILED\n", + " num_restarts: -1\n", + " roles:\n", + " - replicas:\n", + " - hostname: \n", + " id: 0\n", + " role: ray\n", + " state: !!python/object/apply:torchx.specs.api.AppState\n", + " - 5\n", + " 
structured_error_msg: \n", + " role: ray\n", + " state: FAILED (5)\n", + " structured_error_msg: \n", + " ui_url: null" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "cluster.job_status(\"mnist-jlm13hx5g53mk\")" + "job.status()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 8, "id": "264c1809-de72-4acf-b0f6-e67d345640f6", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[37mJob submission server address\u001b[39m: \u001b[1mhttp://mnisttest-head-svc.default.svc:8265\u001b[22m\n", - "acrtors: [RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4), RayActor(name='mnist', command=['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4)]\n", - "Waiting for placement group to start.\n", - "here and rank is 0 and 10.131.66.16 49782\n", - "finally setting actor remote address and port 10.131.66.16 49782\n", - "here and rank is 1 and 10.131.66.16 49782\n", - "setting actor remote address and port 10.131.66.16 49782\n", - "finally setting actor remote address and port 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 42903\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 42903\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port before: 10.131.66.16 53621\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m get_actor_address_and_port: 10.131.66.16 53621\n", - "running ray.wait on [ObjectRef(32b0eec39cfa87ac523554acce28b667f9bc98bb0200000001000000), ObjectRef(80b655a2d9b04d4074fb8e3cef07ab2b3516f40e0200000001000000)]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py\"]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-wzz2l', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 'tcp://172.30.163.155:8265', 
'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-d3e8af45-f80b-98a8-dcd8-d3b428c4a4c2,GPU-15e57e64-c38b-9923-8f4a-6c098fdbc062,GPU-d14042c5-219c-5419-9511-ac62c72f90d1,GPU-b0d6ba11-ccb2-c4fb-89ad-01c50e6d393c', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': '11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 'tcp://172.30.163.155:10001', 'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.131.66.16', 'SHLVL': '1', 'PYTHONPATH': ':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 
'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_RANK0_HOST': '10.131.66.16'}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m set_address_and_port: 10.131.66.16 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m cmd: ['bash', '-c', \"python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py\"]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m worker env: {'NV_LIBCUBLAS_DEV_VERSION': '11.3.1.68-1', 'NV_CUDA_COMPAT_PACKAGE': 'cuda-compat-11-2', 'RAY_IP': 'mnisttest-head-svc', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_GCS': '6379', 'NV_CUDNN_PACKAGE_DEV': 'libcudnn8-dev=8.1.1.33-1+cuda11.2', 'LC_ALL': 'C.UTF-8', 'LD_LIBRARY_PATH': '/usr/local/nvidia/lib:/usr/local/nvidia/lib64', 'NV_LIBNCCL_DEV_PACKAGE': 'libnccl-dev=2.8.4-1+cuda11.2', 'REDIS_PASSWORD': '', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PORT': '8265', 'RAY_USAGE_STATS_ENABLED': '0', 'LANG': 'C.UTF-8', 'TZ': 'America/Los_Angeles', 'NV_LIBNPP_DEV_PACKAGE': 'libnpp-dev-11-2=11.2.1.68-1', 'HOSTNAME': 'mnisttest-worker-small-group-mnisttest-hfm8l', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP': 'tcp://172.30.163.155:8265', 'OLDPWD': '/home/ray/workspace', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_ADDR': '172.30.163.155', 'RAY_CLIENT_MODE': '0', 'RAY_JOB_ID': '02000000', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_ADDR': '172.30.163.155', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_ADDR': '172.30.163.155', 'NV_LIBNPP_VERSION': '11.2.1.68-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PROTO': 'tcp', 'NVIDIA_VISIBLE_DEVICES': 'GPU-48fae530-6bda-e366-3423-864fe847ff3b,GPU-5d8d79bb-5c38-4ef7-0ea8-c91297cbc59f,GPU-8c8b3c0b-ccf8-c06c-f253-0bb90285c4cb,GPU-a8a4e808-841d-c212-2686-a2bd227279b3', 'VIRTUAL_ENV': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv', 'NV_LIBCUSPARSE_VERSION': '11.3.1.68-1', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_DASHBOARD': '8265', 'MNISTTEST_HEAD_SVC_SERVICE_PORT_CLIENT': '10001', 'KUBERNETES_PORT_443_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP_PORT': '6379', 'KUBERNETES_PORT_443_TCP_ADDR': '172.30.0.1', 'NV_LIBCUBLAS_DEV_PACKAGE': 'libcublas-dev-11-2=11.3.1.68-1', 'NCCL_VERSION': '2.8.4-1', 'KUBERNETES_PORT': 'tcp://172.30.0.1:443', 'PWD': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d', 'NVARCH': 'x86_64', 'NV_LIBCUSPARSE_DEV_VERSION': '11.3.1.68-1', 'HOME': '/home/ray', 'RAY_RAYLET_PID': '19', 'NV_ML_REPO_URL': 'https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64', 'NV_LIBNCCL_PACKAGE_VERSION': '2.8.4-1', 'SPT_NOENV': '1', 'KUBERNETES_SERVICE_PORT_HTTPS': '443', 'NV_LIBNCCL_PACKAGE': 'libnccl2=2.8.4-1+cuda11.2', 'NV_LIBNCCL_DEV_PACKAGE_NAME': 'libnccl-dev', 'KUBERNETES_PORT_443_TCP_PORT': '443', 'NV_CUDA_LIB_VERSION': '11.2.0-1', 'NV_ML_REPO_ENABLED': '1', 'NV_LIBNPP_PACKAGE': 'libnpp-11-2=11.2.1.68-1', 'NV_LIBNCCL_PACKAGE_NAME': 'libnccl2', 'LIBRARY_PATH': '/usr/local/cuda/lib64/stubs', 'NV_NVTX_VERSION': '11.2.67-1', 
'MNISTTEST_HEAD_SVC_PORT_10001_TCP': 'tcp://172.30.163.155:10001', 'NV_LIBCUBLAS_VERSION': '11.3.1.68-1', 'RAY_ADDRESS': 'mnisttest-head-svc:6379', 'NV_LIBCUBLAS_PACKAGE': 'libcublas-11-2=11.3.1.68-1', 'KUBERNETES_PORT_443_TCP': 'tcp://172.30.0.1:443', 'NV_CUDNN_VERSION': '8.1.1.33', 'RAY_PORT': '6379', 'NV_CUDA_CUDART_DEV_VERSION': '11.2.72-1', 'MNISTTEST_HEAD_SVC_PORT_6379_TCP': 'tcp://172.30.163.155:6379', 'MNISTTEST_HEAD_SVC_PORT_8265_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PORT': '10001', 'TERM': 'xterm', 'MNISTTEST_HEAD_SVC_SERVICE_PORT': '6379', 'NV_NVML_DEV_VERSION': '11.2.67-1', 'CUDA_VERSION': '11.2.0', 'NV_LIBCUBLAS_PACKAGE_NAME': 'libcublas-11-2', 'NSS_SDB_USE_CACHE': 'no', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'MY_POD_IP': '10.128.68.15', 'SHLVL': '1', 'PYTHONPATH': ':/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/working_dir_files/_ray_pkg_ce2c3e935774455d:/home/ray/workspace::/home/ray/workspace:', 'NV_LIBCUBLAS_DEV_PACKAGE_NAME': 'libcublas-dev-11-2', 'NVIDIA_REQUIRE_CUDA': 'cuda>=11.2 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450', 'NV_LIBNPP_DEV_VERSION': '11.2.1.68-1', 'KUBERNETES_SERVICE_PORT': '443', 'NV_CUDA_CUDART_VERSION': '11.2.72-1', 'NV_CUDNN_PACKAGE_NAME': 'libcudnn8', 'PATH': '/tmp/ray/session_2022-11-04_08-02-48_207951_7/runtime_resources/pip/3510e0c008a5c3627e4d2408c8b93ed71be6c3e1/virtualenv/bin:/home/ray/anaconda3/bin:/home/ray/anaconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'NV_LIBNCCL_DEV_PACKAGE_VERSION': '2.8.4-1', 'MNISTTEST_HEAD_SVC_PORT': 'tcp://172.30.163.155:6379', 'PS1': '(virtualenv) ', 'MNISTTEST_HEAD_SVC_PORT_10001_TCP_PROTO': 'tcp', 'MNISTTEST_HEAD_SVC_SERVICE_HOST': '172.30.163.155', 'KUBERNETES_SERVICE_HOST': '172.30.0.1', 'NV_CUDNN_PACKAGE': 'libcudnn8=8.1.1.33-1+cuda11.2', 'OMP_NUM_THREADS': '1', 'PYTHONBREAKPOINT': 'ray.util.rpdb.set_trace', 'CUDA_VISIBLE_DEVICES': '0,1,2,3', 'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_RANK0_HOST': '10.131.66.16'}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m entrypoint : mnist.py\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m min_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m nproc_per_node : 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_backend : static\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m rdzv_configs : {'rank': 1, 'timeout': 900}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m max_restarts : 0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m monitor_interval : 5\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m log_dir : None\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m metrics_cfg : {}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, 
ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.launcher.api:Starting elastic_operator with launch configs:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m entrypoint : mnist.py\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m min_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_nodes : 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m nproc_per_node : 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m run_id : mnist-jlm13hx5g53mk\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_backend : static\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_endpoint : 10.131.66.16:49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m rdzv_configs : {'rank': 0, 'timeout': 900}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m max_restarts : 0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m monitor_interval : 5\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m log_dir : None\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m metrics_cfg : {}\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.local_elastic_agent:log directory set to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] starting workers for entrypoint: python\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous'ing worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. 
Result:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m restart_count=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_addr=10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m master_port=49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_rank=1\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m group_world_size=2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m local_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_ranks=[4, 5, 6, 7]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_ranks=[4, 5, 6, 7]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/0/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/1/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/2/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.128.68.15)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d2kdqlka/mnist-jlm13hx5g53mk_r9bujvap/attempt_0/3/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Rendezvous complete for workers. 
Result:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m restart_count=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_addr=10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m master_port=49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_rank=0\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m group_world_size=2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m local_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_ranks=[0, 1, 2, 3]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m role_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m global_world_sizes=[8, 8, 8, 8]\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m \n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.agent.server.api:[] Starting worker group\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/0/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/1/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/2/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_d4z71nty/mnist-jlm13hx5g53mk_nxz_och1/attempt_0/3/error.json\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [3]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading MNIST dataset...\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, 
ip=10.131.66.16)\u001b[0m [2]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [2]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:prior to running the trainer\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_ADDR: is 10.131.66.16\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:MASTER_PORT: is 49782\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:GROUP: 2\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [1]:LOCAL: 4\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0it [00:00, ?it/s][0]:\n", - "\u001b[2m\u001b[36m(CommandActor pid=123, ip=10.131.66.16)\u001b[0m [0]:Validation sanity check: 0%| | 0/2 [00:00\\n main()\\n File \"ray_driver.py\", line 308, in main\\n driver.run()\\n File \"ray_driver.py\", line 293, in run\\n terminal = self._step()\\n File \"ray_driver.py\", line 245, in _step\\n result = ray.get(object_ref)\\n File 
\"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/client_mode_hook.py\", line 105, in wrapper\\n return func(*args, **kwargs)\\n File \"/home/ray/anaconda3/lib/python3.8/site-packages/ray/_private/worker.py\", line 2289, in get\\n raise value.as_instanceof_cause()\\nray.exceptions.RayTaskError(RuntimeError): \\x1b[36mray::CommandActor.exec_module()\\x1b[39m (pid=3315, ip=10.129.2.222, repr=)\\n File \"ray_driver.py\", line 76, in exec_module\\n raise RuntimeError(\\nRuntimeError: Either MASTER_ADDR or MASTER_PORT are not set. This is most likely bug in torchxOpen issue at https://github.com/pytorch/torchx\\n'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "cluster.job_logs(\"mnist-jlm13hx5g53mk\")" + "print(job.logs())" ] }, { @@ -2039,7 +322,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.7 64-bit", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 80fcd869..b727e0cb 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -270,7 +270,9 @@ def job_logs(self, job_id: str) -> str: client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id) - def torchx_config(self, working_dir: str = None, requirements: str = None) -> Dict[str, str]: + def torchx_config( + self, working_dir: str = None, requirements: str = None + ) -> Dict[str, str]: dashboard_address = f"{self.cluster_dashboard_uri().lstrip('http://')}" to_return = { "cluster_name": self.config.name, diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py index 6fa12e94..b95a9ba0 100644 --- a/src/codeflare_sdk/job/jobs.py +++ b/src/codeflare_sdk/job/jobs.py @@ -27,6 +27,7 @@ all_jobs: List["Job"] = [] torchx_runner = get_runner() + class JobDefinition(metaclass=abc.ABCMeta): def _dry_run(self, cluster: "Cluster"): pass @@ -44,11 +45,10 @@ def logs(self): class DDPJobDefinition(JobDefinition): - def __init__( self, script: Optional[str] = None, - m: Optional[str]=None, + m: Optional[str] = None, script_args: Optional[List[str]] = None, name: Optional[str] = None, cpu: Optional[int] = None, @@ -63,9 +63,11 @@ def __init__( scheduler_args: Optional[Dict[str, str]] = None, ): if bool(script) == bool(m): # logical XOR - raise ValueError("Exactly one of the following arguments must be defined: [script, m].") + raise ValueError( + "Exactly one of the following arguments must be defined: [script, m]." + ) self.script = script - self.m=m + self.m = m self.script_args: List[str] = script_args if script_args is not None else [] self.name = name self.cpu = cpu @@ -77,7 +79,9 @@ def __init__( self.max_retries = max_retries self.mounts: List[str] = mounts if mounts is not None else [] self.rdzv_port = rdzv_port - self.scheduler_args: Dict[str, str] = scheduler_args if scheduler_args is not None else dict() + self.scheduler_args: Dict[str, str] = ( + scheduler_args if scheduler_args is not None else dict() + ) def _dry_run(self, cluster: "Cluster"): j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}" # # of proc. 
= # of gpus @@ -90,7 +94,9 @@ def _dry_run(self, cluster: "Cluster"): h=self.h, cpu=self.cpu if self.cpu is not None else cluster.config.max_cpus, gpu=self.gpu if self.gpu is not None else cluster.config.gpu, - memMB=self.memMB if self.memMB is not None else cluster.config.max_memory * 1024, + memMB=self.memMB + if self.memMB is not None + else cluster.config.max_memory * 1024, j=self.j if self.j is not None else j, env=self.env, max_retries=self.max_retries, @@ -99,7 +105,7 @@ def _dry_run(self, cluster: "Cluster"): ), scheduler=cluster.torchx_scheduler, cfg=cluster.torchx_config(**self.scheduler_args), - workspace=f"file://{Path.cwd()}" + workspace=f"file://{Path.cwd()}", ) def submit(self, cluster: "Cluster") -> "Job": @@ -107,11 +113,7 @@ def submit(self, cluster: "Cluster") -> "Job": class DDPJob(Job): - def __init__( - self, - job_definition: "DDPJobDefinition", - cluster: "Cluster" - ): + def __init__(self, job_definition: "DDPJobDefinition", cluster: "Cluster"): self.job_definition = job_definition self.cluster = cluster self._app_handle = torchx_runner.schedule(job_definition._dry_run(cluster)) From 022bc761f5f95619bc01d4791e41323db1f1c78a Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Thu, 23 Mar 2023 13:57:43 -0400 Subject: [PATCH 3/4] added job tests (#1) * WIP job tests * added unit tests for Jobs * add more specificity to tests --- tests/unit_test.py | 195 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/tests/unit_test.py b/tests/unit_test.py index 95051400..bd9261c4 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -16,6 +16,7 @@ import sys import filecmp import os +import re parent = Path(__file__).resolve().parents[1] sys.path.append(str(parent) + "/src") @@ -46,10 +47,20 @@ RayClusterStatus, CodeFlareClusterStatus, ) +from codeflare_sdk.job.jobs import ( + JobDefinition, + Job, + DDPJobDefinition, + DDPJob, + torchx_runner, +) import openshift from openshift import OpenShiftPythonException from openshift.selector import Selector import ray +from torchx.specs import AppDryRunInfo, AppDef +from torchx.runner import get_runner, Runner +from torchx.schedulers.ray_scheduler import RayJob import pytest @@ -1535,6 +1546,7 @@ def test_cluster_status(mocker): mocker.patch( "codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=fake_ray ) + status, ready = cf.status() assert status == CodeFlareClusterStatus.STARTING assert ready == False @@ -1594,3 +1606,186 @@ def test_cmd_line_generation(): def test_cleanup(): os.remove("test.yaml") os.remove("raytest2.yaml") + + +def test_jobdefinition_coverage(): + abstract = JobDefinition() + cluster = Cluster(test_config_creation()) + abstract._dry_run(cluster) + abstract.submit(cluster) + + +def test_job_coverage(): + abstract = Job() + abstract.status() + abstract.logs() + + +def test_DDPJobDefinition_creation(): + ddp = DDPJobDefinition( + script="test.py", + m=None, + script_args=["test"], + name="test", + cpu=1, + gpu=0, + memMB=1024, + h=None, + j="2x1", + env={"test": "test"}, + max_retries=0, + mounts=[], + rdzv_port=29500, + scheduler_args={"requirements": "test"}, + ) + assert ddp.script == "test.py" + assert ddp.m == None + assert ddp.script_args == ["test"] + assert ddp.name == "test" + assert ddp.cpu == 1 + assert ddp.gpu == 0 + assert ddp.memMB == 1024 + assert ddp.h == None + assert ddp.j == "2x1" + assert ddp.env == {"test": "test"} + assert ddp.max_retries == 0 + assert ddp.mounts == [] + assert ddp.rdzv_port == 29500 + assert ddp.scheduler_args == 
{"requirements": "test"} + return ddp + + +def test_DDPJobDefinition_dry_run(): + """ + Test that the dry run method returns the correct type: AppDryRunInfo, + that the attributes of the returned object are of the correct type, + and that the values from cluster and job definition are correctly passed. + """ + ddp = test_DDPJobDefinition_creation() + cluster = Cluster(test_config_creation()) + ddp_job = ddp._dry_run(cluster) + assert type(ddp_job) == AppDryRunInfo + assert ddp_job._fmt is not None + assert type(ddp_job.request) == RayJob + assert type(ddp_job._app) == AppDef + assert type(ddp_job._cfg) == type(dict()) + assert type(ddp_job._scheduler) == type(str()) + + assert ddp_job.request.app_id.startswith("test") + assert ddp_job.request.working_dir.startswith("/tmp/torchx_workspace") + assert ddp_job.request.cluster_name == "unit-test-cluster" + assert ddp_job.request.requirements == "test" + + assert ddp_job._app.roles[0].resource.cpu == 1 + assert ddp_job._app.roles[0].resource.gpu == 0 + assert ddp_job._app.roles[0].resource.memMB == 1024 + + assert ddp_job._cfg["cluster_name"] == "unit-test-cluster" + assert ddp_job._cfg["requirements"] == "test" + + assert ddp_job._scheduler == "ray" + + +def test_DDPJobDefinition_dry_run_no_resource_args(): + """ + Test that the dry run correctly gets resources from the cluster object + when the job definition does not specify resources. + """ + cluster = Cluster(test_config_creation()) + ddp = DDPJobDefinition( + script="test.py", + m=None, + script_args=["test"], + name="test", + h=None, + env={"test": "test"}, + max_retries=0, + mounts=[], + rdzv_port=29500, + scheduler_args={"requirements": "test"}, + ) + ddp_job = ddp._dry_run(cluster) + + assert ddp_job._app.roles[0].resource.cpu == cluster.config.max_cpus + assert ddp_job._app.roles[0].resource.gpu == cluster.config.gpu + assert ddp_job._app.roles[0].resource.memMB == cluster.config.max_memory * 1024 + assert ( + parse_j(ddp_job._app.roles[0].args[1]) + == f"{cluster.config.max_worker}x{cluster.config.gpu}" + ) + + +def test_DDPJobDefinition_submit(mocker): + """ + Tests that the submit method returns the correct type: DDPJob + And that the attributes of the returned object are of the correct type + """ + ddp_def = test_DDPJobDefinition_creation() + cluster = Cluster(test_config_creation()) + mocker.patch( + "codeflare_sdk.job.jobs.torchx_runner.schedule", + return_value="fake-dashboard-url", + ) # a fake app_handle + ddp_job = ddp_def.submit(cluster) + assert type(ddp_job) == DDPJob + assert type(ddp_job.job_definition) == DDPJobDefinition + assert type(ddp_job.cluster) == Cluster + assert type(ddp_job._app_handle) == str + assert ddp_job._app_handle == "fake-dashboard-url" + + +def test_DDPJob_creation(mocker): + ddp_def = test_DDPJobDefinition_creation() + cluster = Cluster(test_config_creation()) + mocker.patch( + "codeflare_sdk.job.jobs.torchx_runner.schedule", + return_value="fake-dashboard-url", + ) # a fake app_handle + ddp_job = DDPJob(ddp_def, cluster) + assert type(ddp_job) == DDPJob + assert type(ddp_job.job_definition) == DDPJobDefinition + assert type(ddp_job.cluster) == Cluster + assert type(ddp_job._app_handle) == str + assert ddp_job._app_handle == "fake-dashboard-url" + _, args, kwargs = torchx_runner.schedule.mock_calls[0] + assert type(args[0]) == AppDryRunInfo + job_info = args[0] + assert type(job_info.request) == RayJob + assert type(job_info._app) == AppDef + assert type(job_info._cfg) == type(dict()) + assert type(job_info._scheduler) == type(str()) + return 
ddp_job + + +def test_DDPJob_status(mocker): + ddp_job = test_DDPJob_creation(mocker) + mocker.patch( + "codeflare_sdk.job.jobs.torchx_runner.status", return_value="fake-status" + ) + assert ddp_job.status() == "fake-status" + _, args, kwargs = torchx_runner.status.mock_calls[0] + assert args[0] == "fake-dashboard-url" + + +def test_DDPJob_logs(mocker): + ddp_job = test_DDPJob_creation(mocker) + mocker.patch( + "codeflare_sdk.job.jobs.torchx_runner.log_lines", return_value="fake-logs" + ) + assert ddp_job.logs() == "fake-logs" + _, args, kwargs = torchx_runner.log_lines.mock_calls[0] + assert args[0] == "fake-dashboard-url" + + +def parse_j(cmd): + + pattern = r"--nnodes\s+\d+\s+--nproc_per_node\s+\d+" + match = re.search(pattern, cmd) + if match: + substring = match.group(0) + else: + return None + args = substring.split() + max_worker = args[1] + gpu = args[3] + return f"{max_worker}x{gpu}" From 103be3cba8ea22b022eece611e9eb5f49037e086 Mon Sep 17 00:00:00 2001 From: Kevin Date: Thu, 23 Mar 2023 14:45:53 -0400 Subject: [PATCH 4/4] add torchx to requirements file Signed-off-by: Kevin --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d0bbd2c9..59f8082d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ openshift-client==1.0.18 rich==12.5.1 ray[default]==2.1.0 +git+https://github.com/project-codeflare/torchx@6517d5b060e4fe32b9ad41019c3bef647095c35f#egg=torchx \ No newline at end of file
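
The series above replaces the notebook's `torchx run` CLI cell with the new in-SDK job API. Below is a minimal end-to-end sketch of the resulting workflow, pieced together from the updated demo notebook and the unit tests; the Cluster/ClusterConfiguration import paths and the up()/down() lifecycle calls are the SDK's pre-existing API and are assumed here rather than introduced by these patches.

    from time import sleep

    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration
    from codeflare_sdk.job.jobs import DDPJobDefinition

    # Request a 2-worker Ray cluster (4 GPUs each) via an AppWrapper,
    # matching the demo notebook's configuration.
    cluster = Cluster(ClusterConfiguration(
        name="mnisttest",
        min_worker=2, max_worker=2,
        min_cpus=8, max_cpus=8,
        min_memory=16, max_memory=16,
        gpu=4,
        instascale=True, machine_types=["m5.xlarge", "p3.8xlarge"],
    ))
    cluster.up()

    # Cluster.status() returns a (CodeFlareClusterStatus, ready) pair.
    status, ready = cluster.status()
    while not ready:
        sleep(10)
        status, ready = cluster.status()

    # Define and submit the DDP job. cpu/gpu/memMB/j are left unset, so
    # DDPJobDefinition._dry_run() fills them in from the cluster config;
    # scheduler_args are forwarded to Cluster.torchx_config() and merged
    # into the Ray scheduler cfg (cluster_name, dashboard_address,
    # requirements).
    job = DDPJobDefinition(
        script="mnist.py",
        scheduler_args={"requirements": "requirements.txt"},
    ).submit(cluster)

    # Status and logs are proxied through the module-level TorchX runner.
    print(job.status())
    print(job.logs())

    cluster.down()

With this configuration the derived worker spec is f"{max_worker}x{max(gpu, 1)}" = "2x4", so the submission above is equivalent to the `torchx run -s ray -cfg dashboard_address=...,requirements=requirements.txt dist.ddp -j 2x4 --gpu 4 --script mnist.py` cell that the notebook previously shelled out to.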
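
The parse_j helper at the end of the new tests inverts that mapping: it recovers the "{nnodes}x{nproc_per_node}" spec from a generated torchrun command line. For example, fed an abridged version of the node-rank-0 command captured in the old notebook logs, it returns the original `-j` value:

    cmd = ("python -m torch.distributed.run --rdzv_backend static "
           "--rdzv_endpoint $TORCHX_RANK0_HOST:49782 "
           "--nnodes 2 --nproc_per_node 4 --node_rank '0' mnist.py")
    assert parse_j(cmd) == "2x4"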