In [None]:
# Reference (Ray Serve docs): https://docs.ray.io/en/latest/serve/develop-and-deploy.html

In [1]:
from typing import Dict, Union

import ray
import requests
from fastapi import FastAPI
from ray import serve
from starlette.requests import Request
from starlette.responses import JSONResponse
from transformers import pipeline

from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.utils import generate_cert

In [2]:
# Describe the Ray cluster we want, then build the cluster object from that
# description (constructing it also writes the appwrapper YAML, raytest.yaml).
cluster_config = ClusterConfiguration(
    name="raytest",
    namespace="default",
    num_workers=2,
    min_cpus=2,
    max_cpus=2,
    min_memory=4,
    max_memory=4,
    image="quay.io/project-codeflare/ray:2.5.0-py38-cu116",
    num_gpus=0,
    instascale=False,
)
cluster = Cluster(cluster_config)

Written to: raytest.yaml


In [3]:
# NOTE: before running cluster.up() you need to manually add the container port 8000
# field to raytest.yaml (the file written when the cluster object was created).
# Port 8000 is the port the in-cluster request later in this notebook connects to.
cluster.up()

In [4]:
# Wait for the requested cluster resources to be set up before continuing.
cluster.wait_ready()

Waiting for requested resources to be set up...
Requested cluster up and running!


In [6]:
# Address of the Ray cluster; passed to ray.init() when connecting below.
ray_cluster_uri = cluster.cluster_uri()

In [8]:
# Install the additional libraries that will be required for model serving.
runtime_env = {
    "pip": [
        "transformers",
        "datasets",
        "evaluate",
        "pyarrow<7.0.0",
        "accelerate",
    ]
}

# Disconnect from any Ray session this kernel already holds, then connect to
# the remote cluster with the runtime environment defined above.
ray.shutdown()
ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

Ray cluster is up and running:  True


In [9]:
# 1: Wrap the pretrained text-generation model in a Serve deployment.
@serve.deployment(num_replicas=2)
class SentimentAnalysisDeployment:
    """Ray Serve deployment that answers a text prompt with google/flan-t5-small.

    NOTE: despite the class name, the pipeline task is "text2text-generation",
    not sentiment analysis. The name is kept because the deployment is looked
    up by name ("default_SentimentAnalysisDeployment") later in this notebook.
    """

    def __init__(self):
        # Load the Hugging Face pipeline that serves the requests.
        self._model = pipeline("text2text-generation", model="google/flan-t5-small")

    def __call__(self, request: Request) -> Union[Dict, JSONResponse]:
        """Generate text for the prompt given in the ``text`` query parameter.

        Args:
            request: Incoming HTTP request; only its query parameters are read.

        Returns:
            The first result produced by the pipeline (e.g.
            ``{"generated_text": "..."}``), or a 400 JSON error response when
            the ``text`` query parameter is missing.
        """
        text = request.query_params.get("text")
        if text is None:
            # Previously a missing parameter raised KeyError and surfaced as an
            # opaque server error; tell the caller what is wrong instead.
            return JSONResponse(
                {"error": "missing required query parameter 'text'"},
                status_code=400,
            )
        return self._model(text)[0]
    
    

In [10]:
# 2: Deploy the deployment. serve.run() is the cell's last expression, so the
# handle it returns is displayed as the cell output.
# NOTE(review): host="0.0.0.0" presumably makes the Serve HTTP proxy listen on
# all interfaces so it is reachable from other pods — confirm against the Ray Serve docs.
serve.run(SentimentAnalysisDeployment.bind(), host="0.0.0.0")

[2m[36m(HTTPProxyActor pid=372, ip=10.128.15.147)[0m INFO:     Started server process [372]
[2m[36m(HTTPProxyActor pid=239, ip=10.128.12.42)[0m INFO:     Started server process [239]
[2m[36m(ServeController pid=324, ip=10.128.15.147)[0m INFO 2023-08-16 07:33:08,503 controller 324 deployment_state.py:1298 - Deploying new version of deployment default_SentimentAnalysisDeployment.
[2m[36m(HTTPProxyActor pid=601)[0m INFO:     Started server process [601]
[2m[36m(ServeController pid=324, ip=10.128.15.147)[0m INFO 2023-08-16 07:33:08,536 controller 324 deployment_state.py:1537 - Adding 2 replicas to deployment default_SentimentAnalysisDeployment.
Downloading (…)lve/main/config.json: 100%|██████████| 1.40k/1.40k [00:00<00:00, 434kB/s]
Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]
Downloading pytorch_model.bin:  14%|█▎        | 41.9M/308M [00:00<00:00, 330MB/s]
Downloading pytorch_model.bin:  27%|██▋       | 83.9M/308M [00:00<00:00, 368MB/s]
Downloadi

RayServeSyncHandle(deployment='default_SentimentAnalysisDeployment')

In [11]:
# Look up the deployment by the name shown in the Serve controller logs
# ("default_" + class name) to confirm it is registered.
serve.get_deployment("default_SentimentAnalysisDeployment")

Deployment(name=default_SentimentAnalysisDeployment,version=None,route_prefix=/)

In [12]:
# 3: Query the deployment from inside the cluster (via the head service on
# port 8000) and show the raw response body.
# A timeout is set because requests otherwise waits indefinitely if the
# service never answers, which would hang the notebook.
requests.get(
    "http://raytest-head-svc.default.svc.cluster.local:8000/",
    params={"text": "What is the purpose of AI?"},
    timeout=30,
).content

b'{"generated_text": "to provide information to the user"}'

[2m[36m(ServeReplica:default_SentimentAnalysisDeployment pid=283, ip=10.128.12.42)[0m INFO 2023-08-16 07:34:41,378 default_SentimentAnalysisDeployment default_SentimentAnalysisDeployment#YaQhKq XbSLirzWRj / default replica.py:654 - __CALL__ OK 182.9ms


In [13]:
# 3 (alternative): Query the deployment through an exposed route and show the
# raw response body.
# An OpenShift Route called ray-service must be created for this to work;
# replace <CLUSTER_ADDRESS> with the address of your cluster.
# A timeout is set because requests otherwise waits indefinitely if the
# route never answers, which would hang the notebook.
requests.post(
    "http://ray-service-default.<CLUSTER_ADDRESS>",
    params={"text": "What is the purpose of AI?"},
    timeout=30,
).content

b'{"generated_text": "to provide information to the user"}'

[2m[36m(ServeReplica:default_SentimentAnalysisDeployment pid=703)[0m INFO 2023-08-16 07:34:51,660 default_SentimentAnalysisDeployment default_SentimentAnalysisDeployment#ilapgU eJChAUEvGS / default replica.py:654 - __CALL__ OK 182.0ms


In [14]:
# Shut Serve down; the controller logs below show the deployment and its
# replicas being removed.
serve.shutdown()

[2m[36m(ServeController pid=324, ip=10.128.15.147)[0m INFO 2023-08-16 07:35:07,128 controller 324 deployment_state.py:1264 - Deleting deployment default_SentimentAnalysisDeployment.
[2m[36m(ServeController pid=324, ip=10.128.15.147)[0m INFO 2023-08-16 07:35:07,215 controller 324 deployment_state.py:1563 - Removing 2 replicas from deployment 'default_SentimentAnalysisDeployment'.


In [16]:
# Counterpart to cluster.up(): presumably tears the Ray cluster down and frees
# its resources — confirm against the codeflare_sdk docs.
cluster.down()