In [1]:
import kfp
import os
import requests

from kfp.dsl import Input, Model, component, Dataset, Output, Artifact, Markdown
from kfp.dsl import InputPath, OutputPath, pipeline, component, PipelineTask
from kfp.components import load_component_from_file

In [2]:
# A separate PVC generator component (component_pvc_yaml_generator.yaml) is not
# loaded here; the PVC manifest is built inline inside deploy_ray_pvc_yaml.
# Load the RayCluster manifest generator from its component YAML file (path is
# relative to the notebook's working directory). The pipeline below consumes
# its 'rendered_yaml' output.
ray_cluster_yaml_generator = load_component_from_file("component_ray_cluster_yaml_generator.yaml")

In [3]:
@component(
    base_image="python:3.11",
    packages_to_install=["kubernetes==26.1.0", "tenacity==9.0.0"]
)
def deploy_ray_cluster_yaml(ray_cluster_yaml: Input[Artifact], namespace: str) -> str:
    """Create a RayCluster from a rendered manifest and wait until it is ready.

    The manifest is submitted as a ``ray.io/v1`` ``rayclusters`` custom object,
    then the object's ``status.state`` is polled until it reads ``ready``.

    Args:
        ray_cluster_yaml: Artifact whose file contains the RayCluster manifest
            (YAML). The cluster name is taken from ``metadata.name``.
        namespace: Kubernetes namespace in which the RayCluster is created.

    Returns:
        A Markdown document listing the cluster's ``status.endpoints`` entries
        and the head node's pod/service name and IP.

    Raises:
        RuntimeError: If the cluster has not reported the ``ready`` state once
            all polling attempts are used up.
        Exception: Errors raised by the Kubernetes client when creating the
            object propagate unchanged; errors while polling are retried and
            the last one is re-raised.
    """
    # Imports and helpers are kept inside the function body: a KFP lightweight
    # component is executed from this function's source alone.
    # NOTE(review): PyYAML is not listed in packages_to_install; presumably it
    # arrives as a dependency of the kubernetes package -- confirm.
    import yaml
    from kubernetes import client, config
    from tenacity import retry, wait_exponential, stop_after_attempt

    with open(ray_cluster_yaml.path, 'r') as f:
        dep = yaml.safe_load(f)

    name = dep['metadata']['name']

    # The component runs inside the cluster, so use the pod's service account.
    config.load_incluster_config()
    api = client.CustomObjectsApi()

    api.create_namespaced_custom_object(
        group='ray.io',
        version='v1',
        namespace=namespace,
        plural='rayclusters',
        body=dep
    )

    # Poll with exponential back-off (waits capped at 10 s) for at most 30
    # attempts; any exception raised inside counts as "not ready yet".
    @retry(
        wait=wait_exponential(multiplier=2, min=1, max=10),
        stop=stop_after_attempt(30),
        reraise=True,
    )
    def ray_cluster_wait(api, namespace, name):
        """Return the RayCluster object once its status.state is 'ready'."""
        raycluster = api.get_namespaced_custom_object(
            group="ray.io",
            version="v1",
            namespace=namespace,
            plural="rayclusters",
            name=name
        )
        status = raycluster.get("status") or {}
        # A freshly created object may have no status/state yet; treat that as
        # "not ready" instead of failing with AttributeError on None.lower().
        state = (status.get("state") or "").lower()
        if state != "ready":
            # Explicit raise rather than assert so the check survives `python -O`.
            raise RuntimeError(f"Failed to create Ray Cluster: {name} in {namespace}.")
        return raycluster

    # Reuse the object observed as ready instead of issuing a second GET.
    raycluster = ray_cluster_wait(api, namespace, name)

    # Extract useful info
    status = raycluster.get("status") or {}
    endpoints = status.get("endpoints") or {}
    head = status.get("head") or {}

    # Format Markdown output
    md_lines = [
        f"# RayCluster `{name}` Status Overview",
        "## Endpoints",
    ]
    for key, val in endpoints.items():
        md_lines.append(f"- **{key}**: `{val}`")

    md_lines += [
        "\n## Head Node",
        f"- **podName**: `{head.get('podName', 'N/A')}`",
        f"- **podIP**: `{head.get('podIP', 'N/A')}`",
        f"- **serviceName**: `{head.get('serviceName', 'N/A')}`",
        f"- **serviceIP**: `{head.get('serviceIP', 'N/A')}`"
    ]

    md_info = '\n'.join(md_lines)

    return md_info

In [4]:
@component(
    base_image="python:3.11",
    packages_to_install=["kubernetes==26.1.0"]
)
def deploy_ray_pvc_yaml(namespace: str, release_name: str, size: str) -> None:
    """Create the PersistentVolumeClaim ``pvc-<release_name>`` in ``namespace``.

    The claim requests ``size`` of storage with the ``ReadWriteMany`` access
    mode and is created through the in-cluster Kubernetes API.

    Args:
        namespace: Kubernetes namespace in which the claim is created.
        release_name: Release identifier; the claim is named ``pvc-<release_name>``.
        size: Requested storage quantity, e.g. ``"30Gi"``.

    Raises:
        Exception: Errors raised by the Kubernetes client (for example when the
            create call is rejected) propagate unchanged.
    """
    # Imports are kept inside the function body: a KFP lightweight component is
    # executed from this function's source alone.
    from kubernetes import client, config

    # Build the manifest as a dict rather than interpolating into YAML text and
    # re-parsing it: values stay strings (an all-digit namespace or size is not
    # coerced to a number) and characters such as ':' or '#' cannot corrupt the
    # document.
    pvc_manifest = {
        "apiVersion": "v1",
        "kind": "PersistentVolumeClaim",
        "metadata": {
            "name": f"pvc-{release_name}",
            "namespace": namespace,
        },
        "spec": {
            "accessModes": ["ReadWriteMany"],
            "resources": {
                "requests": {
                    "storage": size,
                },
            },
        },
    }

    # The component runs inside the cluster, so use the pod's service account.
    config.load_incluster_config()
    k8s_core_v1 = client.CoreV1Api()
    k8s_core_v1.create_namespaced_persistent_volume_claim(body=pvc_manifest, namespace=namespace)

In [5]:
@pipeline(name='create-ray-cluster-pipeline')
def create_ray_cluster_pipeline(namespace: str, release_name: str, num_nodes: int, gpus_per_node: int) -> str:
    """Provision a PVC, render a RayCluster manifest, and deploy the cluster.

    Args:
        namespace: Kubernetes namespace for the PVC and the RayCluster.
        release_name: Release identifier passed to the PVC and manifest steps.
        num_nodes: Node count forwarded to the manifest generator.
        gpus_per_node: GPUs per node forwarded to the manifest generator.

    Returns:
        The Markdown status summary produced by ``deploy_ray_cluster_yaml``.
    """
    # The claim size is fixed at 30Gi for every run of this pipeline.
    pvc_deploy_op = deploy_ray_pvc_yaml(namespace=namespace, release_name=release_name, size="30Gi")

    # Forward the pipeline's sizing parameters; they were previously replaced by
    # hard-coded 1/1, so values supplied at run time had no effect.
    # There is no data dependency on the PVC step, so order it explicitly.
    ray_yaml_op = ray_cluster_yaml_generator(
        namespace=namespace,
        release_name=release_name,
        num_nodes=num_nodes,
        gpus_per_node=gpus_per_node,
    ).after(pvc_deploy_op)

    ray_deploy_op = deploy_ray_cluster_yaml(ray_cluster_yaml=ray_yaml_op.outputs['rendered_yaml'], namespace=namespace)
    return ray_deploy_op.output

In [6]:
# Connect to the Kubeflow Pipelines API using the client's default settings
# (no host is passed explicitly).
client = kfp.Client()

# Make sure the output directory exists before compiling, so a fresh checkout
# without ./pipelines-yaml does not fail when the file is written.
PIPELINE_YAML_PATH = './pipelines-yaml/create_ray_cluster_pipeline.yaml'
os.makedirs(os.path.dirname(PIPELINE_YAML_PATH), exist_ok=True)

kfp.compiler.Compiler().compile(create_ray_cluster_pipeline, PIPELINE_YAML_PATH)



In [7]:
# Submit a run of the pipeline directly from the Python function, with step
# caching disabled for this run.
run = client.create_run_from_pipeline_func(
    create_ray_cluster_pipeline,
    arguments={
        "namespace": "demo-ns",
        "release_name": "raycluster",
        "num_nodes": 2,
        "gpus_per_node": 1,
    },
    enable_caching=False,
)