In [1]:
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication
import os
import sys

In [3]:
# Create the training and evaluation datasets.
# This only needs to be run once.
# Install the `datasets` package with the interpreter running this kernel
# (sys.executable), so it lands in the kernel's own environment.
!{sys.executable} -m pip install -q datasets
# NOTE(review): create_dataset is a project-local module, presumably located
# next to this notebook — confirm it is on the import path.
import create_dataset
create_dataset.gsm8k_qa_no_tokens_template()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Display the installed CodeFlare SDK version (useful when reporting issues).
from importlib.metadata import version as installed_version
installed_version('codeflare_sdk')

'0.19.1'

In [5]:
# Authenticate the CodeFlare SDK
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.
#
# The token is a credential and must never be stored in the notebook:
# it is read from the OPENSHIFT_TOKEN environment variable, and if that is
# unset (or empty) it is prompted for interactively without being echoed.
# The API server URL can be overridden with OPENSHIFT_API_URL.
from getpass import getpass

auth = TokenAuthentication(
    token=os.environ.get('OPENSHIFT_TOKEN') or getpass('OpenShift API token: '),
    server=os.environ.get(
        'OPENSHIFT_API_URL',
        'https://api.cluster-6jmjb.6jmjb.sandbox1479.opentlc.com:6443',
    ),
    skip_tls=False,
)
auth.login()

Authenticated with certificate located at /etc/pki/tls/custom-certs/ca-bundle.crt


'Logged into https://api.cluster-6jmjb.6jmjb.sandbox1479.opentlc.com:6443'

In [33]:
# Describe the Ray cluster first, then wrap the description in a Cluster handle.
ray_cluster_config = ClusterConfiguration(
    name='ray',
    namespace='ray-finetune-llm-deepspeed',
    # Head node resources
    head_cpus=2,
    head_memory=8,
    # Worker nodes resources
    num_workers=4,
    worker_cpu_requests=2,
    worker_cpu_limits=2,
    worker_memory_requests=8,
    worker_memory_limits=8,
    # Use the following parameters with NVIDIA GPUs
    # Ensure the Python version in the notebook image matches the version used in the Ray cluster to avoid compatibility issues
    image="quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26",
    head_extended_resource_requests={'nvidia.com/gpu': 1},
    worker_extended_resource_requests={'nvidia.com/gpu': 1},
    # Or replace them with these parameters for AMD GPUs
    # image="quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26",
    # head_extended_resource_requests={'amd.com/gpu':1},
    # worker_extended_resource_requests={'amd.com/gpu':1},
)
cluster = Cluster(ray_cluster_config)

Yaml resources loaded for ray


In [34]:
# Create the Ray cluster
# Applies the configuration built in the previous cell; the next cell waits
# until the cluster is actually ready.
cluster.up()

In [35]:
# Wait for the requested resources to be set up; progress messages are
# printed until the cluster is running and its dashboard is ready.
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster is up and running!
Dashboard is ready!


In [36]:
# Show a summary of the running cluster: status, head/worker CPU and memory,
# number of workers, extended (GPU) resources and the dashboard URL.
cluster.details()

RayCluster(name='ray', status=<RayClusterStatus.READY: 'ready'>, head_cpus=2, head_mem='8G', workers=4, worker_mem_min='8G', worker_mem_max='8G', worker_cpu=2, namespace='ray-finetune-llm-deepspeed', dashboard='https://ray-dashboard-ray-ray-finetune-llm-deepspeed.apps.cluster-6jmjb.6jmjb.sandbox1479.opentlc.com', worker_extended_resources={'nvidia.com/gpu': 1}, head_extended_resources={'nvidia.com/gpu': 1})

In [37]:
# Initialize the Job Submission Client
# It is used in the cells below to submit the fine-tuning job and to stop it.
client = cluster.job_client

In [38]:
# S3 bucket in which the training checkpoints are stored.
# Set it manually here; when left empty, it is retrieved from the configured
# data connection through the AWS_S3_BUCKET environment variable.
manual_bucket = 'ray-fintune-llm'
s3_bucket = manual_bucket if manual_bucket else os.environ.get('AWS_S3_BUCKET')
assert s3_bucket, "An S3 bucket must be provided to store checkpoints"

In [45]:
# Submit the fine-tuning job to the Ray cluster.
#
# The S3 credentials are secrets and must not be written into the notebook:
# they are read from the notebook environment (AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY), and a KeyError is raised right here if either one
# is missing, instead of the job failing later on the cluster.
# The endpoint and region are not secrets; they can be overridden through
# AWS_ENDPOINT_URL / AWS_DEFAULT_REGION and otherwise keep their defaults.
submission_id = client.submit_job(
    entrypoint="python ray_finetune_llm_deepspeed.py "
               "--model-name=meta-llama/Llama-3.2-1B "
               "--lora "
               "--num-devices=4 "
               "--num-epochs=3 "
               "--ds-config=./deepspeed_configs/zero_3_offload_optim_param.json "
               f"--storage-path=s3://{s3_bucket}/ray_finetune_llm_deepspeed/ "
               "--batch-size-per-device=4 "
               "--eval-batch-size-per-device=4 ",
    runtime_env={
        "env_vars": {
            'AWS_ENDPOINT_URL': os.environ.get('AWS_ENDPOINT_URL', 'http://minio.ic-shared-minio.svc:9000'),
            'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'],
            'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'],
            'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION', 'us-east-1'),
        },
        # Python dependencies installed on the cluster for this job.
        'pip': 'requirements.txt',
        # The current directory is uploaded as the job's working directory,
        # minus the documentation and notebook files excluded below.
        'working_dir': './',
        "excludes": ["/docs/", "*.ipynb", "*.md"]
    },
)
# Keep the submission ID visible: it identifies the job (e.g. to stop it).
print(submission_id)

2024-11-20 05:40:39,459	INFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_f27301d602772ea4.zip already exists, skipping upload.


raysubmit_CJhsDPMZuSmbw9ZV


In [None]:
# Stop the job submitted above, identified by its submission ID
# (e.g. to cancel the fine-tuning run before it completes).
client.stop_job(submission_id)

In [32]:
# Tear down the Ray cluster created with cluster.up() once it is no longer needed.
cluster.down()