diff --git a/src/codeflare_sdk/__init__.py b/src/codeflare_sdk/__init__.py index 80bd36c6..0ed41d15 100644 --- a/src/codeflare_sdk/__init__.py +++ b/src/codeflare_sdk/__init__.py @@ -11,6 +11,7 @@ CodeFlareClusterStatus, RayCluster, AppWrapper, + get_cluster, ) from .job import JobDefinition, Job, DDPJobDefinition, DDPJob, RayJobClient diff --git a/src/codeflare_sdk/cluster/__init__.py b/src/codeflare_sdk/cluster/__init__.py index 73950a5c..419561d7 100644 --- a/src/codeflare_sdk/cluster/__init__.py +++ b/src/codeflare_sdk/cluster/__init__.py @@ -13,6 +13,6 @@ AppWrapper, ) -from .cluster import Cluster, ClusterConfiguration +from .cluster import Cluster, ClusterConfiguration, get_cluster from .awload import AWManager diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 4341227a..af8b6f6d 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -468,7 +468,7 @@ def cluster_dashboard_uri(self) -> str: elif "route.openshift.io/termination" in annotations: protocol = "https" return f"{protocol}://{ingress.spec.rules[0].host}" - return "Dashboard ingress not available yet, have you run cluster.up()?" + return "Dashboard not available yet, have you run cluster.up()?" def list_jobs(self) -> List: """ @@ -505,30 +505,50 @@ def torchx_config( def from_k8_cluster_object( rc, mcad=True, ingress_domain=None, ingress_options={}, write_to_file=False ): + config_check() + openshift_oauth = False + if ( + rc["metadata"]["annotations"]["sdk.codeflare.dev/local_interactive"] + == "True" + ): + local_interactive = True + else: + local_interactive = False + if "codeflare.dev/oauth" in rc["metadata"]["annotations"]: + openshift_oauth = ( + rc["metadata"]["annotations"]["codeflare.dev/oauth"] == "True" + ) + else: + for container in rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ]: + openshift_oauth = "oauth-proxy" in container["name"] machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] else [] ) - local_interactive = ( - "volumeMounts" - in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] - ) - if local_interactive: - ingress_domain = get_ingress_domain_from_client( - rc["metadata"]["name"], rc["metadata"]["namespace"] - ) + + if local_interactive and ingress_domain == None: + ingress_domain = rc["metadata"]["annotations"][ + "sdk.codeflare.dev/ingress_domain" + ] + cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], - min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["resources"]["requests"]["cpu"], - max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["resources"]["limits"]["cpu"], + min_cpus=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["cpu"] + ), + max_cpus=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["cpu"] + ), min_memory=int( rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ "resources" @@ -539,9 +559,11 @@ def from_k8_cluster_object( "resources" ]["limits"]["memory"][:-1] ), - num_gpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ - "containers" - ][0]["resources"]["limits"]["nvidia.com/gpu"], + num_gpus=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["nvidia.com/gpu"] + ), instascale=True if machine_types else False, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 @@ -551,6 +573,7 @@ def from_k8_cluster_object( ingress_domain=ingress_domain, ingress_options=ingress_options, write_to_file=write_to_file, + openshift_oauth=openshift_oauth, ) return Cluster(cluster_config) @@ -652,56 +675,57 @@ def get_cluster( for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: mcad = _check_aw_exists(cluster_name, namespace) - - try: - config_check() - api_instance = client.NetworkingV1Api(api_config_handler()) - ingresses = api_instance.list_namespaced_ingress(namespace) - ingress_host = None - ingress_options = {} - for ingress in ingresses.items: - # Search for ingress with AppWrapper name as the owner - if ( - "ingress-owner" in ingress.metadata.labels - and ingress.metadata.labels["ingress-owner"] == cluster_name - ): - ingress_host = ingress.spec.rules[0].host + ingress_host = None + ingress_options = {} + if not is_openshift_cluster(): + try: + config_check() + api_instance = client.NetworkingV1Api(api_config_handler()) + ingresses = api_instance.list_namespaced_ingress(namespace) + for ingress in ingresses.items: + # Search for ingress with AppWrapper name as the owner if ( - "ingress-options" in ingress.metadata.labels - and ingress.metadata.labels["ingress-options"] == "true" + "ingress-owner" in ingress.metadata.labels + and ingress.metadata.labels["ingress-owner"] == cluster_name ): - ingress_name = ingress.metadata.name - port = ( - ingress.spec.rules[0] - .http.paths[0] - .backend.service.port.number - ) - annotations = ingress.metadata.annotations - path = ingress.spec.rules[0].http.paths[0].path - ingress_class_name = ingress.spec.ingress_class_name - path_type = ingress.spec.rules[0].http.paths[0].path_type - - ingress_options = { - "ingresses": [ - { - "ingressName": ingress_name, - "port": port, - "annotations": annotations, - "ingressClassName": ingress_class_name, - "pathType": path_type, - "path": path, - "host": ingress_host, - } - ] - } - except Exception as e: - return _kube_api_error_handling(e) + ingress_host = ingress.spec.rules[0].host + if ( + "ingress-options" in ingress.metadata.labels + and ingress.metadata.labels["ingress-options"] == "true" + ): + ingress_name = ingress.metadata.name + port = ( + ingress.spec.rules[0] + .http.paths[0] + .backend.service.port.number + ) + annotations = ingress.metadata.annotations + path = ingress.spec.rules[0].http.paths[0].path + ingress_class_name = ingress.spec.ingress_class_name + path_type = ( + ingress.spec.rules[0].http.paths[0].path_type + ) + + ingress_options = { + "ingresses": [ + { + "ingressName": ingress_name, + "port": port, + "annotations": annotations, + "ingressClassName": ingress_class_name, + "pathType": path_type, + "path": path, + "host": ingress_host, + } + ] + } + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) # We gather the ingress domain from the host if ingress_host is not None and ingress_options == {}: ingress_domain = ingress_host.split(".", 1)[1] else: ingress_domain = None - return Cluster.from_k8_cluster_object( rc, mcad=mcad, @@ -1043,30 +1067,3 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: if ray.status == CodeFlareClusterStatus.READY: ray.status = RayClusterStatus.READY return ray - - -def get_ingress_domain_from_client(cluster_name: str, namespace: str = "default"): - if is_openshift_cluster(): - try: - config_check() - api_instance = client.CustomObjectsApi(api_config_handler()) - route = api_instance.get_namespaced_custom_object( - group="route.openshift.io", - version="v1", - namespace=namespace, - plural="routes", - name=f"rayclient-{cluster_name}", - ) - return route["spec"]["host"].split(".", 1)[1] - except Exception as e: # pragma no cover - return _kube_api_error_handling(e) - else: - try: - config_check() - api_instance = client.NetworkingV1Api(api_config_handler()) - ingress = api_instance.read_namespaced_ingress( - f"rayclient-{cluster_name}", namespace - ) - return ingress.spec.rules[0].host.split(".", 1)[1] - except Exception as e: # pragma no cover - return _kube_api_error_handling(e) diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index d2c6074f..fb6ef427 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -40,6 +40,8 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/local_interactive: "False" labels: workload.codeflare.dev/appwrapper: "aw-kuberay" controller-tools.k8s.io: "1.0" diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index fe012c6f..da65defd 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -461,12 +461,25 @@ def enable_local_interactive(resources, cluster_name, namespace, ingress_domain) namespace, ingress_domain, ) + item["generictemplate"]["metadata"]["annotations"][ + "sdk.codeflare.dev/local_interactive" + ] = "True" + item["generictemplate"]["metadata"]["annotations"][ + "sdk.codeflare.dev/ingress_domain" + ] = ingress_domain item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers" ][0].get("command")[2] = command +def apply_ingress_domain_annotation(resources, ingress_domain): + item = resources["resources"].get("GenericItems")[0] + item["generictemplate"]["metadata"]["annotations"][ + "sdk.codeflare.dev/ingress_domain" + ] = ingress_domain + + def del_from_list_by_name(l: list, target: typing.List[str]) -> list: return [x for x in l if x["name"] not in target] @@ -734,6 +747,9 @@ def generate_appwrapper( ingress_options, ingress_domain, ) + if ingress_domain is not None: + apply_ingress_domain_annotation(resources, ingress_domain) + if local_interactive: enable_local_interactive(resources, cluster_name, namespace, ingress_domain) else: diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index 6e969e01..aeccf519 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -32,6 +32,8 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/local_interactive: 'False' labels: workload.codeflare.dev/appwrapper: unit-test-cluster controller-tools.k8s.io: '1.0' diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index 299ff9a8..b15833fe 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -2,6 +2,9 @@ apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/ingress_domain: apps.cluster.awsroute.org + sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: unit-test-cluster-ray diff --git a/tests/test-case-prio.yaml b/tests/test-case-prio.yaml index 1e72c442..c81d4396 100644 --- a/tests/test-case-prio.yaml +++ b/tests/test-case-prio.yaml @@ -32,6 +32,9 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/ingress_domain: apps.cluster.awsroute.org + sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: prio-test-cluster diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 76285209..d7c31a11 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -31,6 +31,9 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/ingress_domain: apps.cluster.awsroute.org + sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: unit-test-cluster diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index 1ed93670..c9da340c 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -29,6 +29,9 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: + annotations: + sdk.codeflare.dev/ingress_domain: apps.cluster.awsroute.org + sdk.codeflare.dev/local_interactive: 'False' labels: controller-tools.k8s.io: '1.0' workload.codeflare.dev/appwrapper: unit-test-default-cluster diff --git a/tests/unit_test.py b/tests/unit_test.py index 9c31d305..3edadc63 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -40,7 +40,6 @@ _app_wrapper_status, _ray_cluster_status, _get_ingress_domain, - get_ingress_domain_from_client, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, @@ -618,7 +617,8 @@ def test_cluster_uris(mocker): mocker.patch( "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", return_value=ingress_retrieval( - port=8265, annotations={"route.openshift.io/termination": "passthrough"} + cluster_name="unit-test-cluster", + annotations={"route.openshift.io/termination": "passthrough"}, ), ) assert ( @@ -627,7 +627,7 @@ def test_cluster_uris(mocker): ) mocker.patch( "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", - return_value=ingress_retrieval(port=8265), + return_value=ingress_retrieval(), ) assert cluster.cluster_uri() == "ray://unit-test-cluster-head-svc.ns.svc:10001" assert ( @@ -640,7 +640,7 @@ def test_cluster_uris(mocker): ) assert ( cluster.cluster_dashboard_uri() - == "Dashboard ingress not available yet, have you run cluster.up()?" + == "Dashboard not available yet, have you run cluster.up()?" ) @@ -675,15 +675,15 @@ def ray_addr(self, *args): return self._address -def ingress_retrieval(port, annotations=None, cluster_name="unit-test-cluster"): +def mocked_ingress(port, cluster_name="unit-test-cluster", annotations: dict = None): labels = {"ingress-owner": cluster_name, "ingress-options": "false"} if port == 10001: - serviceName = "client" + name = f"rayclient-{cluster_name}" else: - serviceName = "dashboard" + name = f"ray-dashboard-{cluster_name}" mock_ingress = client.V1Ingress( metadata=client.V1ObjectMeta( - name=f"ray-{serviceName}-{cluster_name}", + name=name, annotations=annotations, labels=labels, owner_references=[ @@ -695,7 +695,7 @@ def ingress_retrieval(port, annotations=None, cluster_name="unit-test-cluster"): spec=client.V1IngressSpec( rules=[ client.V1IngressRule( - host=f"ray-{serviceName}-{cluster_name}-ns.apps.cluster.awsroute.org", + host=f"{name}-ns.apps.cluster.awsroute.org", http=client.V1HTTPIngressRuleValue( paths=[ client.V1HTTPIngressPath( @@ -714,7 +714,23 @@ def ingress_retrieval(port, annotations=None, cluster_name="unit-test-cluster"): ], ), ) - mock_ingress_list = client.V1IngressList(items=[mock_ingress]) + return mock_ingress + + +def ingress_retrieval( + cluster_name="unit-test-cluster", client_ing: bool = False, annotations: dict = None +): + dashboard_ingress = mocked_ingress(8265, cluster_name, annotations) + if client_ing: + client_ingress = mocked_ingress( + 10001, cluster_name=cluster_name, annotations=annotations + ) + mock_ingress_list = client.V1IngressList( + items=[client_ingress, dashboard_ingress] + ) + else: + mock_ingress_list = client.V1IngressList(items=[dashboard_ingress]) + return mock_ingress_list @@ -736,7 +752,7 @@ def test_ray_job_wrapping(mocker): ) mocker.patch( "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", - return_value=ingress_retrieval(8265), + return_value=ingress_retrieval(), ) assert cluster.list_jobs() == cluster.cluster_dashboard_uri() @@ -959,12 +975,17 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "apiVersion": "ray.io/v1", "kind": "RayCluster", "metadata": { - "creationTimestamp": "2023-02-22T16:26:07Z", + "creationTimestamp": "2024-03-05T09:55:37Z", "generation": 1, + "annotations": { + "sdk.codeflare.dev/local_interactive": "True", + "sdk.codeflare.dev/ingress_domain": "apps.cluster.awsroute.org", + }, "labels": { - "workload.codeflare.dev/appwrapper": "quicktest", + "appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest", + "workload.codeflare.dev/appwrapper": "quicktest", "orderedinstance": "m4.xlarge_g4dn.xlarge", }, "managedFields": [ @@ -975,13 +996,14 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "f:metadata": { "f:labels": { ".": {}, - "f:workload.codeflare.dev/appwrapper": {}, + "f:appwrapper.mcad.ibm.com": {}, "f:controller-tools.k8s.io": {}, "f:resourceName": {}, + "f:workload.codeflare.dev/appwrapper": {}, }, "f:ownerReferences": { ".": {}, - 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, + 'k:{"uid":"a29b1a7a-0992-4860-a8d5-a689a751a3e8"}': {}, }, }, "f:spec": { @@ -1017,41 +1039,53 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "f:serviceType": {}, "f:template": { ".": {}, - "f:spec": {".": {}, "f:containers": {}}, + "f:spec": { + ".": {}, + "f:affinity": { + ".": {}, + "f:nodeAffinity": { + ".": {}, + "f:requiredDuringSchedulingIgnoredDuringExecution": {}, + }, + }, + "f:imagePullSecrets": {}, + "f:volumes": {}, + }, }, }, "f:rayVersion": {}, "f:workerGroupSpecs": {}, }, }, - "manager": "mcad-controller", + "manager": "codeflare-operator", "operation": "Update", - "time": "2023-02-22T16:26:07Z", + "time": "2024-03-05T09:55:37Z", }, { - "apiVersion": "ray.io/v1", + "apiVersion": "ray.io/v1alpha1", "fieldsType": "FieldsV1", "fieldsV1": { "f:status": { ".": {}, - "f:availableWorkerReplicas": {}, "f:desiredWorkerReplicas": {}, "f:endpoints": { ".": {}, "f:client": {}, "f:dashboard": {}, "f:gcs": {}, + "f:metrics": {}, }, + "f:head": {".": {}, "f:serviceIP": {}}, "f:lastUpdateTime": {}, "f:maxWorkerReplicas": {}, "f:minWorkerReplicas": {}, - "f:state": {}, + "f:observedGeneration": {}, } }, "manager": "manager", "operation": "Update", "subresource": "status", - "time": "2023-02-22T16:26:16Z", + "time": "2024-03-05T09:55:37Z", }, ], "name": "quicktest", @@ -1063,11 +1097,11 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "controller": True, "kind": "AppWrapper", "name": "quicktest", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + "uid": "a29b1a7a-0992-4860-a8d5-a689a751a3e8", } ], - "resourceVersion": "9482407", - "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", + "resourceVersion": "5305674", + "uid": "820d065d-bf0c-4675-b951-d32ea496020e", }, "spec": { "autoscalerOptions": { @@ -1088,9 +1122,50 @@ def get_ray_obj(group, version, namespace, plural, cls=None): }, "serviceType": "ClusterIP", "template": { + "metadata": {}, "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "quicktest", + "operator": "In", + "values": ["quicktest"], + } + ] + } + ] + } + } + }, "containers": [ { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + }, + {"name": "RAY_USE_TLS", "value": "0"}, + { + "name": "RAY_TLS_SERVER_CERT", + "value": "/home/ray/workspace/tls/server.crt", + }, + { + "name": "RAY_TLS_SERVER_KEY", + "value": "/home/ray/workspace/tls/server.key", + }, + { + "name": "RAY_TLS_CA_CERT", + "value": "/home/ray/workspace/tls/ca.crt", + }, + ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", "imagePullPolicy": "Always", "lifecycle": { @@ -1134,12 +1209,62 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "nvidia.com/gpu": 0, }, }, + "volumeMounts": [ + { + "mountPath": "/etc/pki/tls/certs/odh-trusted-ca-bundle.crt", + "name": "odh-trusted-ca-cert", + "subPath": "odh-trusted-ca-bundle.crt", + }, + { + "mountPath": "/etc/ssl/certs/odh-trusted-ca-bundle.crt", + "name": "odh-trusted-ca-cert", + "subPath": "odh-trusted-ca-bundle.crt", + }, + { + "mountPath": "/etc/pki/tls/certs/odh-ca-bundle.crt", + "name": "odh-ca-cert", + "subPath": "odh-ca-bundle.crt", + }, + { + "mountPath": "/etc/ssl/certs/odh-ca-bundle.crt", + "name": "odh-ca-cert", + "subPath": "odh-ca-bundle.crt", + }, + ], } - ] - } + ], + "volumes": [ + { + "configMap": { + "items": [ + { + "key": "ca-bundle.crt", + "path": "odh-trusted-ca-bundle.crt", + } + ], + "name": "odh-trusted-ca-bundle", + "optional": True, + }, + "name": "odh-trusted-ca-cert", + }, + { + "configMap": { + "items": [ + { + "key": "odh-ca-bundle.crt", + "path": "odh-ca-bundle.crt", + } + ], + "name": "odh-trusted-ca-bundle", + "optional": True, + }, + "name": "odh-ca-cert", + }, + ], + }, }, }, - "rayVersion": "1.12.0", + "rayVersion": "2.7.0", "workerGroupSpecs": [ { "groupName": "small-group-quicktest", @@ -1147,12 +1272,30 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "minReplicas": 1, "rayStartParams": {"block": "true", "num-gpus": "0"}, "replicas": 1, + "scaleStrategy": {}, "template": { "metadata": { "annotations": {"key": "value"}, "labels": {"key": "value"}, }, "spec": { + "affinity": { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "quicktest", + "operator": "In", + "values": ["quicktest"], + } + ] + } + ] + } + } + }, "containers": [ { "env": [ @@ -1163,7 +1306,20 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "fieldPath": "status.podIP" } }, - } + }, + {"name": "RAY_USE_TLS", "value": "0"}, + { + "name": "RAY_TLS_SERVER_CERT", + "value": "/home/ray/workspace/tls/server.crt", + }, + { + "name": "RAY_TLS_SERVER_KEY", + "value": "/home/ray/workspace/tls/server.key", + }, + { + "name": "RAY_TLS_CA_CERT", + "value": "/home/ray/workspace/tls/ca.crt", + }, ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", "lifecycle": { @@ -1190,24 +1346,76 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "nvidia.com/gpu": 0, }, }, + "volumeMounts": [ + { + "mountPath": "/etc/pki/tls/certs/odh-trusted-ca-bundle.crt", + "name": "odh-trusted-ca-cert", + "subPath": "odh-trusted-ca-bundle.crt", + }, + { + "mountPath": "/etc/ssl/certs/odh-trusted-ca-bundle.crt", + "name": "odh-trusted-ca-cert", + "subPath": "odh-trusted-ca-bundle.crt", + }, + { + "mountPath": "/etc/pki/tls/certs/odh-ca-bundle.crt", + "name": "odh-ca-cert", + "subPath": "odh-ca-bundle.crt", + }, + { + "mountPath": "/etc/ssl/certs/odh-ca-bundle.crt", + "name": "odh-ca-cert", + "subPath": "odh-ca-bundle.crt", + }, + ], } ], + "volumes": [ + { + "configMap": { + "items": [ + { + "key": "ca-bundle.crt", + "path": "odh-trusted-ca-bundle.crt", + } + ], + "name": "odh-trusted-ca-bundle", + "optional": True, + }, + "name": "odh-trusted-ca-cert", + }, + { + "configMap": { + "items": [ + { + "key": "odh-ca-bundle.crt", + "path": "odh-ca-bundle.crt", + } + ], + "name": "odh-trusted-ca-bundle", + "optional": True, + }, + "name": "odh-ca-cert", + }, + ], }, }, } ], }, "status": { - "availableWorkerReplicas": 2, "desiredWorkerReplicas": 1, "endpoints": { "client": "10001", "dashboard": "8265", "gcs": "6379", + "metrics": "8080", }, - "lastUpdateTime": "2023-02-22T16:26:16Z", + "head": {"serviceIP": "172.30.179.88"}, + "lastUpdateTime": "2024-03-05T09:55:37Z", "maxWorkerReplicas": 1, "minWorkerReplicas": 1, + "observedGeneration": 1, "state": "ready", }, } @@ -1326,6 +1534,9 @@ def get_aw_obj(group, version, namespace, plural): "apiVersion": "ray.io/v1", "kind": "RayCluster", "metadata": { + "annotations": { + "sdk.codeflare.dev/local_interactive": "False" + }, "labels": { "workload.codeflare.dev/appwrapper": "quicktest1", "controller-tools.k8s.io": "1.0", @@ -1654,6 +1865,9 @@ def get_aw_obj(group, version, namespace, plural): "apiVersion": "ray.io/v1", "kind": "RayCluster", "metadata": { + "annotations": { + "sdk.codeflare.dev/local_interactive": "False" + }, "labels": { "workload.codeflare.dev/appwrapper": "quicktest2", "controller-tools.k8s.io": "1.0", @@ -1869,21 +2083,90 @@ def get_aw_obj(group, version, namespace, plural): return api_obj1 -def test_get_cluster(mocker): - mocker.patch("kubernetes.client.ApisApi.get_api_versions") +def route_list_retrieval(group, version, namespace, plural): + assert group == "route.openshift.io" + assert version == "v1" + assert namespace == "ns" + assert plural == "routes" + return { + "kind": "RouteList", + "apiVersion": "route.openshift.io/v1", + "metadata": {"resourceVersion": "6072398"}, + "items": [ + { + "metadata": { + "name": "ray-dashboard-quicktest", + "namespace": "ns", + }, + "spec": { + "host": "ray-dashboard-quicktest-opendatahub.apps.cluster.awsroute.org", + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + "weight": 100, + }, + "port": {"targetPort": "dashboard"}, + "tls": {"termination": "edge"}, + }, + }, + { + "metadata": { + "name": "rayclient-quicktest", + "namespace": "ns", + }, + "spec": { + "host": "rayclient-quicktest-opendatahub.apps.cluster.awsroute.org", + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + "weight": 100, + }, + "port": {"targetPort": "client"}, + "tls": {"termination": "passthrough"}, + }, + }, + ], + } + + +def test_get_cluster_openshift(mocker): mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + # Mock the client.ApisApi function to return a mock object + mock_api = MagicMock() + mock_api.get_api_versions.return_value.groups = [ + MagicMock(versions=[MagicMock(group_version="route.openshift.io/v1")]) + ] + mocker.patch("kubernetes.client.ApisApi", return_value=mock_api) + + assert is_openshift_cluster() + + def custom_side_effect(group, version, namespace, plural, **kwargs): + if plural == "routes": + return route_list_retrieval("route.openshift.io", "v1", "ns", "routes") + elif plural == "rayclusters": + return get_ray_obj("ray.io", "v1", "ns", "rayclusters") + elif plural == "appwrappers": + return get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers") + + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", get_aw_obj + ) + mocker.patch( "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", - side_effect=get_ray_obj, + side_effect=custom_side_effect, ) mocker.patch( "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object", - side_effect=get_named_aw, + return_value=get_named_aw, ) mocker.patch( - "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", - return_value=ingress_retrieval(port=8265, cluster_name="quicktest"), + "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object", + side_effect=route_list_retrieval("route.openshift.io", "v1", "ns", "routes")[ + "items" + ], ) + cluster = get_cluster("quicktest") cluster_config = cluster.config assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" @@ -1894,7 +2177,7 @@ def test_get_cluster(mocker): assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 assert cluster_config.num_gpus == 0 - assert cluster_config.instascale + assert cluster_config.local_interactive == True assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -1902,28 +2185,38 @@ def test_get_cluster(mocker): assert cluster_config.num_workers == 1 -def test_get_ingress_domain_from_client(mocker): - mocker.patch("kubernetes.config.load_kube_config") +def test_get_cluster(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( - "kubernetes.client.NetworkingV1Api.read_namespaced_ingress", - return_value=ingress_retrieval( - port=8265, cluster_name="unit-test-cluster" - ).items[0], + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, ) - - ingress_domain = get_ingress_domain_from_client("unit-test-cluster", "ns") - assert ingress_domain == "apps.cluster.awsroute.org" - mocker.patch( - "codeflare_sdk.utils.generate_yaml.is_openshift_cluster", return_value=True + "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object", + side_effect=get_named_aw, ) mocker.patch( - "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object", - side_effect=route_retrieval, + "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", + return_value=ingress_retrieval(cluster_name="quicktest", client_ing=True), ) - ingress_domain = get_ingress_domain_from_client("unit-test-cluster", "ns") - assert ingress_domain == "apps.cluster.awsroute.org" + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert ( + "m4.xlarge" in cluster_config.machine_types + and "g4dn.xlarge" in cluster_config.machine_types + ) + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.num_gpus == 0 + assert cluster_config.instascale + assert cluster_config.local_interactive + assert ( + cluster_config.image + == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + ) + assert cluster_config.num_workers == 1 def route_retrieval(group, version, namespace, plural, name): @@ -2166,7 +2459,7 @@ def test_wait_ready(mocker, capsys): mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch( "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", - return_value=ingress_retrieval(8265), + return_value=ingress_retrieval(), ) mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) @@ -3141,6 +3434,7 @@ def test_cleanup(): os.remove(f"{aw_dir}prio-test-cluster.yaml") os.remove(f"{aw_dir}test.yaml") os.remove(f"{aw_dir}raytest2.yaml") + os.remove(f"{aw_dir}unit-test-cluster-ray.yaml") os.remove("tls-cluster-namespace/ca.crt") os.remove("tls-cluster-namespace/tls.crt") os.remove("tls-cluster-namespace/tls.key")