In [1]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.job.jobs import DDPJobDefinition

In [None]:
# Build the token-based authentication object that carries our oc user
# permissions, then log in. Replace the "XXXXX" placeholders with your own
# token and server values before running this cell.
auth = TokenAuthentication(token="XXXXX", server="XXXXX", skip_tls=False)
auth.login()

Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).

In [4]:
# Describe the resources we want for the cluster (reduce specs as desired)
cluster_config = ClusterConfiguration(
    name="mnisttest",
    namespace="default",
    min_worker=2,
    max_worker=2,
    min_cpus=8,
    max_cpus=8,
    min_memory=16,
    max_memory=16,
    gpu=4,
    # instascale can be set to False if scaling is not needed,
    # in which case machine_types can be removed as well
    instascale=True,
    machine_types=["m5.xlarge", "g4dn.xlarge"],
)

# Create the cluster object, which generates the corresponding AppWrapper yaml
cluster = Cluster(cluster_config)

Written to: mnisttest.yaml


Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster AppWrapper YAML to the MCAD queue and begin the process of obtaining our resource cluster.

In [5]:
# Bring up the cluster: submits the cluster AppWrapper yaml to the MCAD queue
cluster.up()

Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use.

In [6]:
# Check the current state of the cluster (e.g. QUEUED while waiting for resources)
cluster.status()

(<CodeFlareClusterStatus.QUEUED: 3>, False)

In [6]:
# Wait until the requested resources are set up and the cluster is running
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster up and running!


In [7]:
# Check the status again now that wait_ready() has returned; it should report READY
cluster.status()

(<CodeFlareClusterStatus.READY: 1>, True)

Let's quickly verify that the specs of the cluster are as expected.

In [8]:
# Display the cluster's specs (workers, memory, CPU, GPU, namespace, dashboard URL)
cluster.details()

RayCluster(name='mnisttest', status=<RayClusterStatus.READY: 'ready'>, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=0, namespace='default', dashboard='http://ray-dashboard-mnisttest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org')

Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four GPUs each) to the cluster via TorchX.

In [10]:
# Define the batch job: run mnist.py, passing requirements.txt to the
# scheduler as its "requirements" argument
jobdef = DDPJobDefinition(
    name="mnisttest",
    script="mnist.py",
    scheduler_args=dict(requirements="requirements.txt"),
)

# Submit the job to the running cluster and keep the handle for status/logs
job = jobdef.submit(cluster)

The Ray scheduler does not support port mapping.


Now we can go ahead and look at the status and logs of our batch job.

In [20]:
# Check the current status of the submitted batch job
job.status()

AppStatus:
  msg: !!python/object/apply:ray.dashboard.modules.job.common.JobStatus
  - RUNNING
  num_restarts: -1
  roles:
  - replicas:
    - hostname: <NONE>
      id: 0
      role: ray
      state: !!python/object/apply:torchx.specs.api.AppState
      - 3
      structured_error_msg: <NONE>
    role: ray
  state: RUNNING (3)
  structured_error_msg: <NONE>
  ui_url: null

In [25]:
# Print the logs produced by the batch job so far
print(job.logs())

[RayActor(name='mnist', command=['bash', '-c', "python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '0' --tee 3 --role '' mnist.py"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2), RayActor(name='mnist', command=['bash', '-c', "python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-zvm96dmvgkq5hc' --nnodes 2 --nproc_per_node 1 --node_rank '1' --tee 3 --role '' mnist.py"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL', 'TORCHX_JOB_ID': 'ray://torchx/mnist-zvm96dmvgkq5hc'}, num_cpus=2, num_gpus=0, min_replicas=2)]
2023-04-03 14:55:18,399	INFO worker.py:1230 -- Using address 10.129.0.91:6379 set in the environment variable RAY_ADDRESS
2023-04-03 14:55:18,399	INFO worker.py:1342 -- Connecting

Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up.

In [7]:
# Bring the cluster down and release/terminate its associated resources
cluster.down()

In [None]:
# Log out using the auth object that was logged in at the top of the notebook
auth.logout()