In this notebook, we will go over how to use the SDK to work interactively with a Ray cluster during development.

In [1]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

In [None]:
# Create authentication object for user permissions.
# If unused, the SDK will automatically check for the default kubeconfig, then the in-cluster config.
# KubeConfigFileAuthentication can also be used to specify the kubeconfig path manually.
# Replace the "XXXXX" placeholders with your own token and server address,
# and avoid committing real credentials together with the notebook.
auth = TokenAuthentication(
    token = "XXXXX",
    server = "XXXXX",
    skip_tls=False
)
auth.login()

Once again, let's start by running through the same cluster setup as before:

NOTE: `quay.io/rhoai/ray:2.23.0-py39-cu121` is the default community image used by the CodeFlare SDK for creating a RayCluster resource.
If you have your own Ray image that better suits your purposes, specify it in the `image` field to override the default image.

In [2]:
# Describe the Ray cluster we want, then wrap that description in a Cluster object.
# Unless `local_queue` is set below, the SDK will try to find the name of your default
# local queue based on the annotation "kueue.x-k8s.io/default-queue": "true".
cluster_name = "interactivetest"
cluster_config = ClusterConfiguration(
    name=cluster_name,
    # For GPU enabled workloads set the head_gpus and num_gpus
    head_gpus=1,
    num_gpus=1,
    num_workers=2,
    min_cpus=2,
    max_cpus=2,
    min_memory=8,
    max_memory=8,
    # image="",  # Optional field
    write_to_file=False,  # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources
    # local_queue="local-queue-name",  # Specify the local queue manually
)
cluster = Cluster(cluster_config)

Written to: interactivetest.yaml


In [3]:
# Bring up the cluster
cluster.up()
# Wait here until the requested resources are set up and the cluster reports ready
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster up and running!


In [4]:
# Show the cluster summary (status, worker count and resources, namespace, dashboard URL)
cluster.details()

RayCluster(name='interactivetest', status=<RayClusterStatus.READY: 'ready'>, workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-interactivetest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org')

This time we will demonstrate another potential method of use: working with the Ray cluster interactively.

Using the SDK, we can get both the Ray cluster URI and dashboard URI:

In [5]:
# Ask the SDK for both endpoints: the dashboard URI and the URI used by the Ray client.
ray_dashboard_uri = cluster.cluster_dashboard_uri()
ray_cluster_uri = cluster.cluster_uri()

# Print one URI per line, dashboard first.
print(ray_dashboard_uri, ray_cluster_uri, sep="\n")

http://ray-dashboard-interactivetest-default.apps.meyceoz-07122023.psap.aws.rhperfscale.org
ray://interactivetest-head-svc.default.svc:10001


Now we can connect directly to our Ray cluster via the Ray python client:

In [None]:
from codeflare_sdk import generate_cert

# Both helpers below take the same (cluster name, namespace) pair, so look the namespace up once.
cluster_namespace = cluster.config.namespace

# Create the required TLS cert, then export the environment variables that enable TLS
generate_cert.generate_tls_cert(cluster_name, cluster_namespace)
generate_cert.export_env(cluster_name, cluster_namespace)

In [6]:
# Before proceeding, make sure the cluster exists and the URI is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray

# Reset the Ray context in case there is already one.
ray.shutdown()

# Additional libraries that will be required for model training;
# they are passed to the cluster through the runtime environment.
pip_requirements = [
    "transformers==4.41.2",
    "datasets==2.17.0",
    "accelerate==0.31.0",
    "scikit-learn==1.5.0",
]
runtime_env = {"pip": pip_requirements}

# Establish the connection to the Ray cluster.
# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines
# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb
ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

Ray cluster is up and running:  True


Now that we are connected (and have passed in some package requirements), let's try writing some training code:

In [7]:
@ray.remote
def train_fn():
    """Fine-tune DistilBERT on a small IMDB subset with Ray Train.

    Runs as a Ray remote task. It defines the per-worker training loop,
    hands it to a ``TorchTrainer`` and blocks until ``fit()`` returns.
    Nothing is returned to the caller.
    """
    # Imports live inside the task so they are resolved on the cluster, where the
    # packages listed in the runtime environment are installed.
    import numpy as np
    from datasets import load_dataset, load_metric
    from transformers import (
        Trainer,
        TrainingArguments,
        AutoTokenizer,
        AutoModelForSequenceClassification,
    )
    import ray.train.huggingface.transformers
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer

    # When running in a multi-node cluster you will need persistent storage that is accessible across all worker nodes.
    # See www.github.com/project-codeflare/codeflare-sdk/tree/main/docs/s3-compatible-storage.md for more information.

    def train_func():
        """Training loop executed by each Ray Train worker."""
        # Datasets
        dataset = load_dataset("imdb")
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        def tokenize_function(examples):
            """Tokenize the "text" column, padding/truncating to the model's max length."""
            return tokenizer(examples["text"], padding="max_length", truncation=True)

        # Only the first 100 examples of each split are used to keep the demo short.
        small_train_dataset = (
            dataset["train"].select(range(100)).map(tokenize_function, batched=True)
        )
        small_eval_dataset = (
            dataset["test"].select(range(100)).map(tokenize_function, batched=True)
        )

        # Model
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=2
        )

        # Load the metric once per worker instead of on every evaluation pass.
        metric = load_metric("accuracy")

        def compute_metrics(eval_pred):
            """Turn (logits, labels) into accuracy by taking the argmax over the last axis."""
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return metric.compute(predictions=predictions, references=labels)

        # Hugging Face Trainer
        training_args = TrainingArguments(
            output_dir="test_trainer",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            report_to="none",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=small_train_dataset,
            eval_dataset=small_eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Hook the Hugging Face trainer into Ray Train: add the report callback,
        # then let Ray prepare the trainer before training starts.
        callback = ray.train.huggingface.transformers.RayTrainReportCallback()
        trainer.add_callback(callback)

        trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)

        trainer.train()

    ray_trainer = TorchTrainer(
        train_func,
        # 3 GPU workers matches the cluster configured earlier in this notebook
        # (1 head GPU + 2 workers with 1 GPU each); adjust if you change that configuration.
        scaling_config=ScalingConfig(num_workers=3, use_gpu=True),
        # Configure persistent storage that is accessible across
        # all worker nodes.
        # Uncomment and update the RunConfig below to include your storage details.
        # run_config=ray.train.RunConfig(storage_path="storage path"),
    )
    # The result is kept in a local for inspection while debugging; it is not returned.
    result: ray.train.Result = ray_trainer.fit()

When we are ready to test our code, we can run the training function we defined above remotely on our Ray cluster:

In [8]:
# Call the function defined in the cell above as a remote Ray task and wait for its result
ray.get(train_fn.remote())

Downloading builder script: 100%|██████████| 4.31k/4.31k [00:00<00:00, 20.9MB/s]
Downloading metadata: 100%|██████████| 2.17k/2.17k [00:00<00:00, 14.1MB/s]
Downloading readme: 100%|██████████| 7.59k/7.59k [00:00<00:00, 22.9MB/s]


[2m[36m(train_fn pid=425)[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]
Downloading data:   1%|          | 738k/84.1M [00:00<00:11, 7.34MB/s]
Downloading data:   6%|▌         | 4.88M/84.1M [00:00<00:02, 27.1MB/s]
Downloading data:  14%|█▍        | 11.8M/84.1M [00:00<00:01, 46.2MB/s]
Downloading data:  23%|██▎       | 19.5M/84.1M [00:00<00:01, 58.3MB/s]
Downloading data:  32%|███▏      | 27.3M/84.1M [00:00<00:00, 65.5MB/s]
Downloading data:  42%|████▏     | 35.2M/84.1M [00:00<00:00, 70.0MB/s]
Downloading data:  51%|█████     | 43.1M/84.1M [00:00<00:00, 72.9MB/s]
Downloading data:  61%|██████    | 51.0M/84.1M [00:00<00:00, 74.8MB/s]
Downloading data:  70%|███████   | 59.0M/84.1M [00:00<00:00, 76.5MB/s]
Downloading data:  80%|███████▉  | 66.9M/84.1M [00:01<00:00, 77.4MB/s]
Downloading data:  89%|████████▉ | 75.0M/84.1M [00:01<00:00, 78.2MB/s]
Downloading data: 100%|██████████| 84.1M/84.1M [00:01<00:00, 69.2MB/s]
Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]
Generating tra

[2m[36m(train_fn pid=425)[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 599.79it/s]                                                
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 8.59kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 163kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.72MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 44.8MB/s]
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]
Map:   4%|▍         | 1000/25000 [00:00<00:13, 1733.22 examples/s]
Map:   8%|▊         | 2000/25000 [00:01<00:12, 1866.13 examples/s]
Map:  12%|█▏        | 3000/25000 [00:01<00:11, 1887.41 examples/s]
Map:  16%|█▌        | 4000/25000 [00:02<00:11, 1898.51 examples/s]
Map:  20%|██        | 5000/25000 [00:02<00:10, 1828.14 examples/s]
Map:  24%|██▍       | 6000/25000 [00:03<00:10, 1841.43 examples/s]
Map:  28%|██▊       | 7000/25000 [00:03<00:09, 1849.60 examples/s]
Map:  32%|███▏      | 8000/25000 

[2m[36m(train_fn pid=425)[0m len of train Dataset({
[2m[36m(train_fn pid=425)[0m     features: ['text', 'label', 'input_ids', 'attention_mask'],
[2m[36m(train_fn pid=425)[0m     num_rows: 100
[2m[36m(train_fn pid=425)[0m }) and test Dataset({
[2m[36m(train_fn pid=425)[0m     features: ['text', 'label', 'input_ids', 'attention_mask'],
[2m[36m(train_fn pid=425)[0m     num_rows: 100
[2m[36m(train_fn pid=425)[0m })


                                                                   
[2m[36m(train_fn pid=425)[0m 
[2m[36m(train_fn pid=425)[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


[2m[36m(train_fn pid=425)[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
[2m[36m(train_fn pid=425)[0m 	- Avoid using `tokenizers` before the fork if possible
[2m[36m(train_fn pid=425)[0m 	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[2m[36m(train_fn pid=425)[0m == Status ==
[2m[36m(train_fn pid=425)[0m Current time: 2023-08-09 14:51:51 (running for 00:00:00.12)
[2m[36m(train_fn pid=425)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=425)[0m Logical resource usage: 0/6 CPUs, 0/2 GPUs
[2m[36m(train_fn pid=425)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51
[2m[36m(train_fn pid=425)[0m Number of trials: 1/1 (1 PENDING)
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-------+
[2m[36m(train_fn pid=425)[0m | Trial name                     | status   | loc   

[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m 
[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode[0m


[2m[36m(train_fn pid=425)[0m == Status ==
[2m[36m(train_fn pid=425)[0m Current time: 2023-08-09 14:51:56 (running for 00:00:05.16)
[2m[36m(train_fn pid=425)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=425)[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs
[2m[36m(train_fn pid=425)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51
[2m[36m(train_fn pid=425)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m | Trial name                     | status   | loc             |
[2m[36m(train_fn pid=425)[0m |--------------------------------+----------+-----------------|
[2m[36m(train_fn pid=425)[0m | HuggingFaceTrainer_f2621_00000 | RUNNING  | 10.130.4.19:196 |
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m 
[2m[36m(train_fn pid=425)[0m 

[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m 2023-08-09 14:51:57,260	INFO backend_executor.py:137 -- Starting distributed worker processes: ['235 (10.130.4.19)', '232 (10.129.4.19)']
[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m 2023-08-09 14:51:58,957	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m 2023-08-09 14:51:58,957	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(HuggingFaceTrainer pid=196, ip=10.130.4.19)[0m 2023-08-09 14:51:58,958	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(HuggingFaceTrainer pid=196, ip

[2m[36m(train_fn pid=425)[0m == Status ==
[2m[36m(train_fn pid=425)[0m Current time: 2023-08-09 14:52:01 (running for 00:00:10.18)
[2m[36m(train_fn pid=425)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=425)[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs
[2m[36m(train_fn pid=425)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51
[2m[36m(train_fn pid=425)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m | Trial name                     | status   | loc             |
[2m[36m(train_fn pid=425)[0m |--------------------------------+----------+-----------------|
[2m[36m(train_fn pid=425)[0m | HuggingFaceTrainer_f2621_00000 | RUNNING  | 10.130.4.19:196 |
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m 
[2m[36m(train_fn pid=425)[0m 

[2m[36m(RayTrainWorker pid=235, ip=10.130.4.19)[0m 2023-08-09 14:52:01,262	INFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[RandomizeBlockOrder]
[2m[36m(RayTrainWorker pid=235, ip=10.130.4.19)[0m 2023-08-09 14:52:01,262	INFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(RayTrainWorker pid=235, ip=10.130.4.19)[0m 2023-08-09 14:52:01,262	INFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=235, ip=10.130.4.19)[0m 2023-08-09 14:52:01,274	INFO streaming_executor.py:149 -- Shutting down <StreamingExecutor(Thread-3, stopped daemon 140636397225728)>.
[2m[36m(RayTrainWorker pid=232, ip=10.129.4.19)[0m 2023-

[2m[36m(train_fn pid=425)[0m == Status ==
[2m[36m(train_fn pid=425)[0m Current time: 2023-08-09 14:52:06 (running for 00:00:15.20)
[2m[36m(train_fn pid=425)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=425)[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs
[2m[36m(train_fn pid=425)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51
[2m[36m(train_fn pid=425)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m | Trial name                     | status   | loc             |
[2m[36m(train_fn pid=425)[0m |--------------------------------+----------+-----------------|
[2m[36m(train_fn pid=425)[0m | HuggingFaceTrainer_f2621_00000 | RUNNING  | 10.130.4.19:196 |
[2m[36m(train_fn pid=425)[0m +--------------------------------+----------+-----------------+
[2m[36m(train_fn pid=425)[0m 
[2m[36m(train_fn pid=425)[0m 



[2m[36m(train_fn pid=425)[0m Trial HuggingFaceTrainer_f2621_00000 completed.
[2m[36m(train_fn pid=425)[0m == Status ==
[2m[36m(train_fn pid=425)[0m Current time: 2023-08-09 14:57:50 (running for 00:05:59.21)
[2m[36m(train_fn pid=425)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=425)[0m Logical resource usage: 1.0/6 CPUs, 2.0/2 GPUs
[2m[36m(train_fn pid=425)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-08-09_14-51-51
[2m[36m(train_fn pid=425)[0m Number of trials: 1/1 (1 TERMINATED)
[2m[36m(train_fn pid=425)[0m +--------------------------------+------------+-----------------+--------+------------------+--------+-----------------+---------+
[2m[36m(train_fn pid=425)[0m | Trial name                     | status     | loc             |   iter |   total time (s) |   loss |   learning_rate |   epoch |
[2m[36m(train_fn pid=425)[0m |--------------------------------+------------+-----------------+--------+------------------+--------+--

Once complete, we can bring our Ray cluster down and clean up:

In [9]:
# Bring the Ray cluster down
cluster.down()

In [None]:
# Log out of the session opened with auth.login() at the top of the notebook
auth.logout()