In this third demo, we will go over the basics of the Ray Job Submission Client (`RayJobClient`) in the CodeFlare SDK.

In [1]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient

In [None]:
# Build the authentication object that carries the user's permissions.
# If this cell is not used, the SDK will automatically check for a default kubeconfig, then in-cluster config.
# KubeConfigFileAuthentication can be used instead to specify a kubeconfig path manually.

# "XXXXX" values are placeholders: fill in your own token and server, and do not commit real credentials.
auth_token = "XXXXX"  # kept in a variable because the RayJobClient bearer header reuses it later
auth = TokenAuthentication(token=auth_token, server="XXXXX", skip_tls=False)
auth.login()

In [2]:
# Describe the cluster we want first, then wrap that description in a Cluster object
cluster_config = ClusterConfiguration(
    name="jobtest",
    namespace="default",
    num_workers=2,
    min_cpus=1,
    max_cpus=1,
    min_memory=4,
    max_memory=4,
    num_gpus=0,
    image="quay.io/project-codeflare/ray:latest-py39-cu118",
)
cluster = Cluster(cluster_config)

Yaml resources loaded for jobtest


In [3]:
# Bring up the cluster
cluster.up()
# Wait here until the requested resources are up and the dashboard is ready
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster is up and running!
Dashboard is ready!


In [4]:
# Show a summary of the cluster: status, head/worker resources, namespace and dashboard URL
cluster.details()

RayCluster(name='jobtest', status=<RayClusterStatus.READY: 'ready'>, head_cpus=2, head_mem=8, head_gpu=0, workers=2, worker_mem_min=4, worker_mem_max=4, worker_cpu=1, worker_gpu=0, namespace='default', dashboard='https://ray-dashboard-jobtest-default.apps.rosa.mcampbel.af68.p3.openshiftapps.com')

### Ray Job Submission - Authorized Ray Cluster

* Submit a job using an authorized Ray dashboard and the Job Submission Client
* Provide an entrypoint command directed to your job script
* Set up your runtime environment

In [6]:
# Look up the dashboard URL of the cluster created above
ray_dashboard = cluster.cluster_dashboard_uri()

# Pass the bearer token in an Authorization header
# (auth_token is the variable defined in the authentication cell)
header = {"Authorization": f"Bearer {auth_token}"}

# Create the RayJobClient for that dashboard, with verify=True
client = RayJobClient(address=ray_dashboard, headers=header, verify=True)

In [7]:
# Submit the example mnist job through the RayJobClient.
# The local working directory is packaged and uploaded for the job, and the
# pip requirements are taken from requirements.txt.
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "working_dir": "./",
        "pip": "requirements.txt",
    },
)
# Print the submission ID; the cells below use it to refer to this job
print(submission_id)

2024-04-03 12:16:07,112	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_431abdedbcc7e123.zip.
2024-04-03 12:16:07,115	INFO packaging.py:518 -- Creating a file package for local directory './'.


raysubmit_NvXkkh1QP1kdq4LG


In [8]:
# Get the job's logs collected so far
# (an empty string in the recorded run, where the job was still pending)
client.get_job_logs(submission_id)

''

In [9]:
# Get the job's current status (PENDING in the recorded run, as the job had not started yet)
client.get_job_status(submission_id)

<JobStatus.PENDING: 'PENDING'>

In [10]:
# Get detailed information about the job: a JobDetails record that includes
# its status, entrypoint, status message and resolved runtime environment
client.get_job_info(submission_id)

JobDetails(type=<JobType.SUBMISSION: 'SUBMISSION'>, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=<JobStatus.PENDING: 'PENDING'>, entrypoint='python mnist.py', message='Job has not started yet. It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)

In [11]:
# List all existing jobs (returns a list of JobDetails records, one per job)
client.list_jobs()

[JobDetails(type=<JobType.SUBMISSION: 'SUBMISSION'>, job_id=None, submission_id='raysubmit_NvXkkh1QP1kdq4LG', driver_info=None, status=<JobStatus.PENDING: 'PENDING'>, entrypoint='python mnist.py', message='Job has not started yet. It may be waiting for the runtime environment to be set up.', error_type=None, start_time=1712142968879, end_time=None, metadata={}, runtime_env={'working_dir': 'gcs://_ray_pkg_431abdedbcc7e123.zip', 'pip': {'packages': ['pytorch_lightning==1.5.10', 'ray_lightning', 'torchmetrics==0.9.1', 'torchvision==0.12.0'], 'pip_check': False}, '_ray_commit': 'b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc'}, driver_agent_http_address=None, driver_node_id=None)]

In [None]:
# Iterate through the logs of a job
# NOTE: a top-level `async for` relies on the notebook's support for top-level async code;
# in a plain Python script this loop would have to live inside an `async def`.
# NOTE(review): presumably this keeps streaming until the job finishes — confirm.
async for lines in client.tail_job_logs(submission_id):
    # end="" avoids adding an extra newline after each piece of log output
    print(lines, end="")

In [12]:
# Delete a job
# Can run client.stop_job(submission_id) first if job is still running
client.delete_job(submission_id)

(True, 'Successfully deleted Job raysubmit_NvXkkh1QP1kdq4LG')

### Ray Job Submission - Unauthorized Ray Cluster

In [None]:
"""
Initialise the RayJobClient with the Ray Dashboard
"""
ray_dashboard = cluster.cluster_dashboard_uri()
client = RayJobClient(address=ray_dashboard, verify=False)

In [None]:
# Submit the example mnist job through the RayJobClient, using the current
# directory as the working directory and requirements.txt for pip dependencies
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "working_dir": "./",
        "pip": "requirements.txt",
    },
)
# Print the submission ID; the stop and delete cells below use it
print(submission_id)

In [None]:
# Stop the job identified by submission_id
client.stop_job(submission_id)

In [None]:
# Delete the job identified by submission_id (it was stopped in the previous cell)
client.delete_job(submission_id)

In [13]:
# Bring the cluster back down now that the demo is finished
cluster.down()

In [None]:
# Log out of the session opened with auth.login()
# (only applicable if the authentication cell above was run, since `auth` is defined there)
auth.logout()