In [None]:
%pip install codeflare-sdk -U

In [1]:
import os, sys, yaml
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
from codeflare_sdk.job import RayJobClient
from time import sleep

In [None]:
# Print the metadata pip has recorded for the installed codeflare-sdk package.
%pip show codeflare-sdk

In [None]:
#parameters
# Notebook inputs. Every "has to be specified" value is a placeholder that must
# be replaced with a real value before the cells below are run.
# NOTE(review): the "#parameters" marker suggests these values are injected by a
# notebook parameterization tool — confirm before restructuring this cell.
namespace = "default"  # namespace passed to ClusterConfiguration
openshift_api_url = "has to be specified"  # server passed to TokenAuthentication
kubernetes_user_bearer_token = "has to be specified"  # token passed to TokenAuthentication
num_gpus = "has to be specified"  # converted with int() when the cluster is configured
s3_bucket_name = "has to be specified"  # bucket used in the job's --storage-path
s3_access_key_id = "has to be specified"  # exported to the job as AWS_ACCESS_KEY_ID
s3_secret_access_key = "has to be specified"  # exported to the job as AWS_SECRET_ACCESS_KEY
s3_default_region = "has to be specified"  # exported to the job as AWS_DEFAULT_REGION

In [2]:
# Create the training and evaluation datasets.
# This can be run only once.
!{sys.executable} -m pip install datasets
create_dataset_path=os.path.join(os.getcwd(),'../notebooks')
if os.path.exists(create_dataset_path):
    sys.path.append(os.path.abspath(create_dataset_path))
import create_dataset
create_dataset.main()

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fsspec[http]<=2024.5.0,>=2023.1.0
  Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.8/193.8 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m109.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2
  Downloading huggingface_hub-0.24.2-py3-none-any.whl (417 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Log in to the cluster API server with the bearer token and URL supplied
# in the parameters cell.
# NOTE(review): skip_tls=True looks like it disables TLS verification —
# confirm this is acceptable for the target environment.
auth = TokenAuthentication(
    server=openshift_api_url,
    token=kubernetes_user_bearer_token,
    skip_tls=True,
)
auth.login()

In [28]:
# Describe the Ray cluster named 'ray': a head node without GPUs and one
# worker, with the GPU count taken from the `num_gpus` parameter (cast to int).
# write_to_file=True asks the SDK to write the generated manifest to disk.
cluster_config = ClusterConfiguration(
    name='ray',
    namespace=namespace,
    head_cpus=1,
    head_memory=4,
    head_gpus=0,
    num_workers=1,
    min_cpus=1,
    max_cpus=1,
    min_memory=1,
    max_memory=4,
    num_gpus=int(num_gpus),
    write_to_file=True,
    verify_tls=False,
)
cluster = Cluster(cluster_config)
print("raycluster config created", "*" * 8)

Written to: /opt/app-root/src/.codeflare/resources/ray.yaml


In [29]:
# Post-process the Ray cluster manifest stored under ~/.codeflare/resources/
# before the cluster is created. `os` and `yaml` come from the import cell.
directory_path = os.path.expanduser("~/.codeflare/resources/")
outfile = os.path.join(directory_path, "ray.yaml")

# Read the manifest and close the handle before the file is rewritten below.
# NOTE(review): FullLoader is kept from the original; for a plain Kubernetes
# manifest yaml.safe_load would presumably be sufficient — confirm.
with open(outfile, "r") as f:
    cluster_yaml = yaml.load(f, yaml.FullLoader)

# Add toleration for GPU nodes to the Ray cluster worker pod.
# This replaces any tolerations already present on the first worker group.
cluster_yaml["spec"]["workerGroupSpecs"][0]["template"]["spec"]["tolerations"] = [
    {"key": "nvidia.com/gpu", "value": "NONE", "effect": "NoSchedule"}
]

# Remove the Kueue queue-name label when present. A missing label is not an
# error: report it and carry on so the manifest is still written back.
try:
    del cluster_yaml["metadata"]["labels"]["kueue.x-k8s.io/queue-name"]
    print("Kueue label deleted")
except (KeyError, TypeError) as e:
    print(f"Kueue label not deleted: {e!r}")

# Write the patched manifest back to the same file.
with open(outfile, "w") as f:
    yaml.dump(cluster_yaml, f, default_flow_style=False)

Kueue label deleted


In [30]:
# Create the Ray cluster from the configuration prepared above
cluster.up()
# Wait until the cluster status is updated to ready before continuing
cluster.wait_ready()

In [32]:
# Display the cluster's current status and resource summary
cluster.details()

RayCluster(name='ray', status=<RayClusterStatus.READY: 'ready'>, head_cpus=16, head_mem='48G', head_gpu=0, workers=1, worker_mem_min='48G', worker_mem_max='48G', worker_cpu=8, worker_gpu=1, namespace='ray-finetune-llm-deepspeed', dashboard='https://ray-dashboard-ray-ray-finetune-llm-deepspeed.apps.o1i1y2s4h9x4b9p.pdxp.p1.openshiftapps.com')

In [34]:
# Initialize the Job Submission Client from the running cluster;
# it is used below to submit and stop the fine-tuning job.
client = cluster.job_client

In [35]:
# The S3 bucket where checkpoints are stored.
# It can be set manually through `s3_bucket_name`; when that value is empty
# or None, the bucket is read from the AWS_S3_BUCKET environment variable
# (retrieved from the configured data connection).
s3_bucket = s3_bucket_name
if not s3_bucket:
    # The original condition could never be true, so this fallback never ran.
    s3_bucket = os.environ.get('AWS_S3_BUCKET')
assert s3_bucket, "An S3 bucket must be provided to store checkpoints"

In [None]:
# Submit the fine-tuning job to the Ray cluster and print its submission id.
# The entrypoint is assembled from adjacent string literals, so every fragment
# must end with a space to keep the command-line arguments separated.
# The storage path uses `s3_bucket`, the bucket resolved and validated in the
# cell above, rather than the raw `s3_bucket_name` parameter.
# NOTE(review): the S3 credentials are handed to the job as runtime_env
# env_vars — confirm that exposing them this way is acceptable.
submission_id = client.submit_job(
    entrypoint="python ray_finetune_llm_deepspeed.py "
               "--model-name=meta-llama/Llama-2-7b-chat-hf "
               "--lora "
               "--num-devices=1 "
               "--num-epochs=3 "
               "--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json "
               f"--storage-path=s3://{s3_bucket}/ray-finetune-llm-deepspeed3/ "
               "--batch-size-per-device=32 "
               "--eval-batch-size-per-device=32 "
               "--as-test ",
    runtime_env={
        "env_vars": {
            "AWS_ACCESS_KEY_ID": s3_access_key_id,
            "AWS_SECRET_ACCESS_KEY": s3_secret_access_key,
            "AWS_DEFAULT_REGION": s3_default_region,
        },
        "pip": "requirements.txt",
        "working_dir": "./",
        "excludes": ["/docs/", "*.ipynb", "*.md"]
    },
)
print(submission_id)

2024-07-26 10:57:35,471	INFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_e1883352aefb99c4.zip already exists, skipping upload.


raysubmit_Hmn8GnNcSa1UJFaC


In [37]:
# Stop the job submitted above.
# NOTE(review): under "Run All" this executes right after submission —
# confirm that stopping the job immediately is intended.
client.stop_job(submission_id)

True

In [42]:
# Tear down the Ray cluster once it is no longer needed
cluster.down()