In [12]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

In [None]:
# Authenticate with an oc user token so the SDK can act with that user's permissions.
# Replace both "XXXX" placeholders with your own token and server address before running.
# NOTE(review): skip_tls=True presumably disables TLS verification — confirm before using outside a demo.
auth = TokenAuthentication(token="XXXX", server="XXXX", skip_tls=True)
auth.login()

Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding AppWrapper).

In [None]:
# Define the cluster object from the resources the batch workload needs.
# Building it generates the corresponding AppWrapper; nothing is submitted
# until up() is called on the cluster.
cluster = Cluster(
    ClusterConfiguration(
        name='mnisttest',
        min_worker=2,
        max_worker=2,
        min_cpus=8,
        max_cpus=8,
        min_memory=16,
        max_memory=16,
        gpu=4,
        instascale=True,
        machine_types=["m5.xlarge", "p3.8xlarge"],
    )
)

Next, we want to bring our cluster up, so we call the `up()` function below to submit our cluster's AppWrapper YAML to the MCAD queue and begin the process of obtaining our resource cluster.

In [None]:
# Bring up the cluster: submits the cluster's AppWrapper yaml onto the MCAD queue
cluster.up()

Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use.

In [17]:
# Query the current status of the cluster (still QUEUED in the recorded run)
cluster.status()

(False, <CodeFlareClusterStatus.QUEUED: 2>)

In [None]:
# Wait for the cluster to become ready for use before continuing
cluster.wait_ready()

In [None]:
# Check the status again now that the wait has finished
cluster.status()

Let's quickly verify that the specs of the cluster are as expected.

In [18]:
# Show the cluster's details so its specs can be checked against what was requested
cluster.details()

<RayClusterStatus.READY: 'ready'>

Now that our resource cluster is ready, we can directly submit our batch job (model training on two workers with four GPUs each) to the cluster via TorchX.

In [19]:
# Submit mnist.py as a distributed (dist.ddp) job through the TorchX CLI, using the ray scheduler (-s ray):
#   dashboard_address  -> head service of the 'mnisttest' cluster, port 8265
#   requirements       -> requirements.txt (presumably the pip packages installed for the job — confirm)
#   -j 2x4 --gpu 4     -> 2 nodes x 4 processes per node, 4 GPUs per node (see --nnodes/--nproc_per_node in the job logs)
! torchx run -s ray -cfg dashboard_address=mnisttest-head-svc.default.svc:8265,requirements=requirements.txt dist.ddp -j 2x4 --gpu 4 --script mnist.py

environemnt before exec ddp from torchx {'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m Checking for changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs`...
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m Built new image `/tmp/torchx_workspace3c_d437b` based on original image `ghcr.io/pytorch/torchx:0.3.0dev0` and changes in workspace `file:///opt/app-root/src/codeflare/notebooks/jobs` for role[0]=mnist.
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m Uploading package gcs://_ray_pkg_ce2c3e935774455d.zip.
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m Creating a file package for local directory '/tmp/torchx_workspace3c_d437b'.
ray://torchx/mnisttest-head-svc.default.svc:8265-mnist-jlm13hx5g53mk
[34mtorchx[0m [2m2022-11-04 15:04:31 INFO    [0m La

Now we can go ahead and look at the status and logs of our batch job.

In [31]:
# List the jobs on the cluster; the recorded output maps each job id to its JobInfo
cluster.list_jobs()

[37mJob submission server address[39m: [1mhttp://mnisttest-head-svc.default.svc:8265[22m
{'mnist-jlm13hx5g53mk': JobInfo(status='SUCCEEDED', entrypoint='python3 ray_driver.py', message='Job finished successfully.', error_type=None, start_time=1667574271415, end_time=1667574616127, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_ce2c3e935774455d.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'e4ce38d001dbbe09cd21c497fedd03d692b2be3e'})}
[0m

In [30]:
# Check the status of one job by id. This id comes from the recorded torchx submission;
# replace it with the id printed by your own `torchx run`.
cluster.job_status("mnist-jlm13hx5g53mk")

[37mJob submission server address[39m: [1mhttp://mnisttest-head-svc.default.svc:8265[22m

[32m-----------------------------------[39m
[32mJob 'mnist-jlm13hx5g53mk' succeeded[39m
[32m-----------------------------------[39m

[0m

In [29]:
# Fetch the logs of one job by id. This id comes from the recorded torchx submission;
# replace it with the id printed by your own `torchx run`.
cluster.job_logs("mnist-jlm13hx5g53mk")

[37mJob submission server address[39m: [1mhttp://mnisttest-head-svc.default.svc:8265[22m
acrtors: [RayActor(name='mnist', command=['bash', '-c', "python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '0' --tee 3 --role '' mnist.py"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4), RayActor(name='mnist', command=['bash', '-c', "python -m torch.distributed.run --rdzv_backend static --rdzv_endpoint $TORCHX_RANK0_HOST:49782 --rdzv_id 'mnist-jlm13hx5g53mk' --nnodes 2 --nproc_per_node 4 --node_rank '1' --tee 3 --role '' mnist.py"], env={'LOGLEVEL': 'DEBUG', 'TORCH_DISTRIBUTED_DEBUG': 'DETAIL'}, num_cpus=2, num_gpus=4)]
Waiting for placement group to start.
here and rank is 0 and 10.131.66.16 49782
finally setting actor remote address and port 10.131.66.16 49782
here and rank is 1 and 10.131.66.16 49782
setting actor remote address and

Finally, we bring our resource cluster down and release/terminate the associated resources, bringing everything back to the way it was before our cluster was brought up.

In [32]:
# Bring the cluster down and release/terminate its associated resources
cluster.down()

In [None]:
# Log out, presumably ending the session opened by auth.login() — confirm
auth.logout()