In this fourth and final notebook, we will go over how to use the SDK to work interactively with a Ray cluster during development.

In [1]:
# Import pieces from codeflare-sdk
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication

In [None]:
# Create authentication object for oc user permissions.
# The token and server are read from environment variables so that real
# credentials do not get saved into the notebook. The variable names below are
# specific to this notebook (not an SDK convention); when a variable is unset
# the original "XXXXX" placeholder is used, exactly as before.
import os

auth = TokenAuthentication(
    token=os.environ.get("OC_TOKEN", "XXXXX"),
    server=os.environ.get("OC_SERVER", "XXXXX"),
    skip_tls=False,
)
auth.login()

Once again, let's start by running through the same cluster setup as before:

In [3]:
# Describe the cluster we want, then build the cluster object (and appwrapper)
cluster_config = ClusterConfiguration(
    name="interactivetest",
    namespace="default",
    # min == max for workers, CPUs and memory, so the requested size is fixed
    min_worker=2,
    max_worker=2,
    min_cpus=2,
    max_cpus=2,
    min_memory=8,
    max_memory=8,
    gpu=1,
    instascale=True,
    machine_types=["m5.xlarge", "g4dn.xlarge"],
)
cluster = Cluster(cluster_config)

Written to: interactivetest.yaml


In [4]:
# Bring up the cluster
cluster.up()
# Wait for the requested resources before moving on (progress is printed below)
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster up and running!


In [5]:
# Display the cluster's status and resource settings as the cell output
cluster.details()

RayCluster(name='interactivetest', status=<RayClusterStatus.READY: 'ready'>, min_workers=2, max_workers=2, worker_mem_min=8, worker_mem_max=8, worker_cpu=2, worker_gpu=1, namespace='default', dashboard='http://ray-dashboard-interactivetest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org')

This time we will demonstrate another potential method of use: working with the Ray cluster interactively.

Using the SDK, we can get both the Ray cluster URI and dashboard URI:

In [6]:
# Ask the SDK for the dashboard URI and the Ray client URI of our cluster;
# ray_cluster_uri is what we hand to ray.init() in the next step.
ray_dashboard_uri = cluster.cluster_dashboard_uri()
ray_cluster_uri = cluster.cluster_uri()
print(ray_dashboard_uri, ray_cluster_uri, sep="\n")

http://ray-dashboard-interactivetest-default.apps.meyceoz-032023.psap.aws.rhperfscale.org
ray://interactivetest-head-svc.default.svc:10001


Now we can connect directly to our Ray cluster via the Ray Python client:

In [7]:
# Before proceeding, make sure the cluster exists and the URI is not empty
assert ray_cluster_uri, "Ray cluster needs to be started and set before proceeding"

import ray
from ray.air.config import ScalingConfig  # used later by the training function

# Reset the Ray context in case there's already one
ray.shutdown()

# Install additional libraries that will be required for model training
runtime_env = {"pip": ["transformers", "datasets", "evaluate", "pyarrow<7.0.0", "accelerate"]}

# Establish the connection to the Ray cluster, passing the package requirements along
ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

Ray cluster is up and running:  True


Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace, using the IMDB dataset:

In [8]:
@ray.remote
def train_fn():
    """Fine-tune DistilBERT on a small IMDB subset with Ray's HuggingFaceTrainer.

    Runs as a Ray remote task. It downloads and tokenizes the IMDB dataset,
    keeps 100 shuffled training and 100 shuffled test examples, converts them
    to Ray datasets and fits a HuggingFaceTrainer with two GPU workers.

    Returns:
        None. Progress and results are reported through the task's log output.
    """
    # Everything is imported inside the task so the function is self-contained
    # when it executes remotely; the HuggingFace packages are requested for the
    # cluster through the runtime_env and may not exist where the notebook runs.
    import numpy as np
    import ray
    import transformers
    from datasets import load_dataset, load_metric
    from ray.air.config import ScalingConfig
    from ray.train.huggingface import HuggingFaceTrainer
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer, TrainingArguments

    dataset = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        """Tokenize the "text" field, padding/truncating to the model's max length."""
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Using a fraction of the dataset, but you can run with the full dataset.
    # The fixed seed keeps the selected 100 examples the same between runs.
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

    print(f"len of train {small_train_dataset} and test {small_eval_dataset}")

    ray_train_ds = ray.data.from_huggingface(small_train_dataset)
    ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)

    def compute_metrics(eval_pred):
        """Return accuracy for a (logits, labels) evaluation batch."""
        # NOTE(review): newer `datasets` releases deprecate `load_metric` in
        # favour of the `evaluate` package (already listed in the runtime_env);
        # consider migrating once the pinned versions allow it.
        metric = load_metric("accuracy")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    def trainer_init_per_worker(train_dataset, eval_dataset, **config):
        """Build the transformers.Trainer that each training worker runs."""
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=2
        )

        training_args = TrainingArguments(
            "/tmp/hf_imdb/test",
            eval_steps=1,
            disable_tqdm=True,
            num_train_epochs=1,
            skip_memory_metrics=True,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            weight_decay=0.01,
        )
        return transformers.Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

    scaling_config = ScalingConfig(num_workers=2, use_gpu=True)  # num workers is the number of gpus

    # We are using the Ray-native HuggingFaceTrainer, but you can swap it out for the
    # non-Ray HuggingFace Trainer. Both have the same method signature.
    # The Ray-native HFTrainer has built-in support for scaling to multiple GPUs.
    trainer = HuggingFaceTrainer(
        trainer_init_per_worker=trainer_init_per_worker,
        scaling_config=scaling_config,
        datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds},
    )
    # The fit result is intentionally not returned: the task reports via its logs.
    trainer.fit()

When we are ready to test our code, we can run the training function we defined above remotely on our Ray cluster:

In [9]:
# Submit the training function defined above as a remote Ray task,
# then block until that task has finished.
train_ref = train_fn.remote()
ray.get(train_ref)

Downloading builder script: 100%|██████████| 4.31k/4.31k [00:00<00:00, 4.22MB/s]
Downloading metadata: 100%|██████████| 2.17k/2.17k [00:00<00:00, 2.26MB/s]
Downloading readme: 100%|██████████| 7.59k/7.59k [00:00<00:00, 7.62MB/s]


[2m[36m(train_fn pid=293)[0m Downloading and preparing dataset imdb/plain_text to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]
Downloading data:   0%|          | 33.8k/84.1M [00:00<04:54, 286kB/s]
Downloading data:   0%|          | 99.3k/84.1M [00:00<03:10, 442kB/s]
Downloading data:   0%|          | 263k/84.1M [00:00<01:36, 870kB/s] 
Downloading data:   1%|          | 640k/84.1M [00:00<00:47, 1.78MB/s]
Downloading data:   2%|▏         | 1.41M/84.1M [00:00<00:23, 3.47MB/s]
Downloading data:   3%|▎         | 2.87M/84.1M [00:00<00:12, 6.46MB/s]
Downloading data:   7%|▋         | 5.60M/84.1M [00:00<00:06, 12.1MB/s]
Downloading data:  12%|█▏        | 9.80M/84.1M [00:00<00:03, 19.6MB/s]
Downloading data:  17%|█▋        | 14.2M/84.1M [00:01<00:02, 24.6MB/s]
Downloading data:  23%|██▎       | 19.6M/84.1M [00:01<00:02, 31.0MB/s]
Downloading data:  30%|██▉       | 25.1M/84.1M [00:01<00:01, 35.6MB/s]
Downloading data:  36%|███▌      | 29.9M/84.1M [00:01<00:01, 37.0MB/s]
Downloading data:  42%|████▏     | 35.0M/84.1M [00:01<00:01, 38.8MB/s]
Downloading data:  

[2m[36m(train_fn pid=293)[0m Dataset imdb downloaded and prepared to /home/ray/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 581.76it/s]                                                
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 3.61kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 77.0kB/s]
Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 17.1MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 37.7MB/s]
Map:   0%|          | 0/25000 [00:00<?, ? examples/s]
Map:   4%|▍         | 1000/25000 [00:00<00:16, 1472.29 examples/s]
Map:   8%|▊         | 2000/25000 [00:01<00:13, 1643.38 examples/s]
Map:  12%|█▏        | 3000/25000 [00:02<00:15, 1458.91 examples/s]
Map:  16%|█▌        | 4000/25000 [00:02<00:15, 1334.62 examples/s]
Map:  20%|██        | 5000/25000 [00:03<00:13, 1451.64 examples/s]
Map:  24%|██▍       | 6000/25000 [00:04<00:13, 1432.39 examples/s]
Map:  28%|██▊      

[2m[36m(train_fn pid=293)[0m len of train Dataset({
[2m[36m(train_fn pid=293)[0m     features: ['text', 'label', 'input_ids', 'attention_mask'],
[2m[36m(train_fn pid=293)[0m     num_rows: 100
[2m[36m(train_fn pid=293)[0m }) and test Dataset({
[2m[36m(train_fn pid=293)[0m     features: ['text', 'label', 'input_ids', 'attention_mask'],
[2m[36m(train_fn pid=293)[0m     num_rows: 100
[2m[36m(train_fn pid=293)[0m })


                                                                   


[2m[36m(train_fn pid=293)[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
[2m[36m(train_fn pid=293)[0m 	- Avoid using `tokenizers` before the fork if possible
[2m[36m(train_fn pid=293)[0m 	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 14:55:36 (running for 00:00:04.93)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 2.9/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=293)[0m +---------------------

[2m[36m(RayTrainWorker pid=115, ip=10.130.14.19)[0m 2023-04-18 14:55:39,275	INFO config.py:87 -- Setting up process group for: env:// [rank=0, world_size=2]


[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 14:55:41 (running for 00:00:09.93)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 2.9/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=293)[0m +--------------------------------+----------+------------------+
[2m[36m(train_fn pid=293)[0m | Trial name                     | status   | loc              |
[2m[36m(train_fn pid=293)[0m |--------------------------------+----------+------------------|
[2m[36m(train_fn pid=293)[0m | HuggingFaceTrainer_bd08a_00000 | RUNNING  | 10.131.14.19:144 |
[2m[36m(train_fn pid=293)[0m +------------

Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 76.1kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 76.9kB/s]
Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]
Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]
Downloading pytorch_model.bin:   8%|▊         | 21.0M/268M [00:00<00:01, 181MB/s]
Downloading pytorch_model.bin:   8%|▊         | 21.0M/268M [00:00<00:01, 176MB/s]
Downloading pytorch_model.bin:  16%|█▌        | 41.9M/268M [00:00<00:01, 160MB/s]
Downloading pytorch_model.bin:  20%|█▉        | 52.4M/268M [00:00<00:01, 208MB/s]
Downloading pytorch_model.bin:  27%|██▋       | 73.4M/268M [00:00<00:01, 191MB/s]
Downloading pytorch_model.bin:  31%|███▏      | 83.9M/268M [00:00<00:00, 220MB/s]
Downloading pytorch_model.bin:  43%|████▎     | 115M/268M [00:00<00:00, 218MB/s] 
Downloading pytorch_model.bin:  35%|███▌      | 94.4M/268M [00:00<00:01, 158MB/s]
Downloading pytorch_mode

[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 14:55:46 (running for 00:00:14.93)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 2.9/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=293)[0m +--------------------------------+----------+------------------+
[2m[36m(train_fn pid=293)[0m | Trial name                     | status   | loc              |
[2m[36m(train_fn pid=293)[0m |--------------------------------+----------+------------------|
[2m[36m(train_fn pid=293)[0m | HuggingFaceTrainer_bd08a_00000 | RUNNING  | 10.131.14.19:144 |
[2m[36m(train_fn pid=293)[0m +------------



[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 14:55:51 (running for 00:00:19.94)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 2.9/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=293)[0m +--------------------------------+----------+------------------+
[2m[36m(train_fn pid=293)[0m | Trial name                     | status   | loc              |
[2m[36m(train_fn pid=293)[0m |--------------------------------+----------+------------------|
[2m[36m(train_fn pid=293)[0m | HuggingFaceTrainer_bd08a_00000 | RUNNING  | 10.131.14.19:144 |
[2m[36m(train_fn pid=293)[0m +------------



[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 15:03:46 (running for 00:08:14.87)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 3.5/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 1.0/6 CPUs, 2.0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 RUNNING)
[2m[36m(train_fn pid=293)[0m +--------------------------------+----------+------------------+--------+------------------+--------+-----------------+----------+
[2m[36m(train_fn pid=293)[0m | Trial name                     | status   | loc              |   iter |   total time (s) |   loss |   learning_rate |    epoch |
[2m[36m(train_fn pid=293)[0m |--------------------------------+----------+------------------+--------+--



[2m[36m(train_fn pid=293)[0m Result for HuggingFaceTrainer_bd08a_00000:
[2m[36m(train_fn pid=293)[0m   _time_this_iter_s: 276.56069135665894
[2m[36m(train_fn pid=293)[0m   _timestamp: 1681855672
[2m[36m(train_fn pid=293)[0m   _training_iteration: 2
[2m[36m(train_fn pid=293)[0m   date: 2023-04-18_15-08-01
[2m[36m(train_fn pid=293)[0m   done: true
[2m[36m(train_fn pid=293)[0m   epoch: 1.0
[2m[36m(train_fn pid=293)[0m   experiment_id: cb2127867b714e92b008dd8d87659908
[2m[36m(train_fn pid=293)[0m   experiment_tag: '0'
[2m[36m(train_fn pid=293)[0m   hostname: nteractivetest-worker-small-group-interactivetest-7m9zh
[2m[36m(train_fn pid=293)[0m   iterations_since_restore: 2
[2m[36m(train_fn pid=293)[0m   learning_rate: 0.0
[2m[36m(train_fn pid=293)[0m   loss: 0.2252
[2m[36m(train_fn pid=293)[0m   node_ip: 10.131.14.19
[2m[36m(train_fn pid=293)[0m   pid: 144
[2m[36m(train_fn pid=293)[0m   should_checkpoint: true
[2m[36m(train_fn pid=293)[0m   



[2m[36m(train_fn pid=293)[0m == Status ==
[2m[36m(train_fn pid=293)[0m Current time: 2023-04-18 15:08:14 (running for 00:12:43.34)
[2m[36m(train_fn pid=293)[0m Memory usage on this node: 3.9/15.4 GiB 
[2m[36m(train_fn pid=293)[0m Using FIFO scheduling algorithm.
[2m[36m(train_fn pid=293)[0m Resources requested: 0/6 CPUs, 0/2 GPUs, 0.0/22.35 GiB heap, 0.0/6.55 GiB objects
[2m[36m(train_fn pid=293)[0m Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2023-04-18_14-55-31
[2m[36m(train_fn pid=293)[0m Number of trials: 1/1 (1 TERMINATED)
[2m[36m(train_fn pid=293)[0m +--------------------------------+------------+------------------+--------+------------------+--------+-----------------+---------+
[2m[36m(train_fn pid=293)[0m | Trial name                     | status     | loc              |   iter |   total time (s) |   loss |   learning_rate |   epoch |
[2m[36m(train_fn pid=293)[0m |--------------------------------+------------+------------------+--------

[2m[36m(train_fn pid=293)[0m 2023-04-18 15:08:14,963	INFO tune.py:777 -- Total run time: 763.54 seconds (763.34 seconds for the tuning loop).


Once complete, we can bring our Ray cluster down and clean up:

In [10]:
# Bring the Ray cluster down now that we are done with it
cluster.down()

In [None]:
# Log out using the same auth object we logged in with at the start
auth.logout()