In [None]:
# https://docs.ray.io/en/latest/serve/develop-and-deploy.html

In [None]:
import getpass
import os
from typing import Dict

import ray
import requests
from fastapi import FastAPI
from ray import serve
from starlette.requests import Request
from transformers import pipeline

from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.utils import generate_cert

### Serve a Flan T5 model using codeflare and instascale

In [None]:
# Describe the Ray cluster we want (this also produces its appwrapper) and
# build the handle used by the cells below to bring it up and tear it down.
cluster = Cluster(
    ClusterConfiguration(
        name="rayservice",
        namespace="default",
        num_workers=2,
        min_cpus=2,
        max_cpus=2,
        min_memory=8,
        max_memory=8,
        image="quay.io/project-codeflare/ray:2.5.0-py38-cu116",
        num_gpus=1,
        instascale=True,
        machine_types=["m5.xlarge", "g4dn.xlarge"],
    )
)

In [None]:
# NOTE: before running cluster.up() you need to manually add container port 8000 (the port the
# Serve endpoint is queried on later in this notebook) to the generated raytest.yaml:
#    ports:
#    - containerPort: 8000
#      name: serve
# NOTE(review): the cluster above is named 'rayservice', so the generated file is presumably
# rayservice.yaml rather than raytest.yaml -- confirm before editing.

cluster.up()

In [None]:
# WARNING: using instascale=True in your ClusterConfiguration above assumes
# that you have InstaScale properly installed and enabled on your cluster.
# It can take around 15 minutes for your pods to scale up.
# If this hangs for too long, please stop it with `cluster.down()`.
cluster.wait_ready()

In [None]:
# Address of the Ray cluster; it is handed to ray.init() in the next cell.
ray_cluster_uri = cluster.cluster_uri()

In [None]:
# Install additional libraries that will be required for model serving.
runtime_env = {"pip": ["transformers", "datasets", "evaluate", "pyarrow<7.0.0", "accelerate"]}

# Drop any Ray connection this kernel may still hold so the cell can be re-run safely.
ray.shutdown()

# Connect to the cluster created above, shipping the pip requirements as the runtime environment.
ray.init(address=ray_cluster_uri, runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

In [None]:
# 1: Wrap the pretrained flan-t5 instruction model in a Serve deployment.
@serve.deployment(num_replicas=2, ray_actor_options={"num_gpus": 1})
class SentimentAnalysisDeployment:
    """Serve deployment answering text prompts with google/flan-t5-large.

    Two replicas are requested and each one asks Ray for one GPU.

    NOTE: despite the class name, the pipeline task is text2text-generation,
    not sentiment analysis. The name is kept because the deployment is looked
    up later as "default_SentimentAnalysisDeployment".
    """

    def __init__(self):
        # Build the Hugging Face pipeline once per replica.
        self._model = pipeline(
            "text2text-generation",
            model="google/flan-t5-large",
            device_map="auto",
        )

    def __call__(self, request: Request) -> Dict:
        """Run the model on the `text` query parameter and return the first result."""
        prompt = request.query_params["text"]
        generations = self._model(prompt)
        return generations[0]
    
    

In [None]:
# 2: Deploy the deployment.
# NOTE(review): host="0.0.0.0" presumably makes the Serve HTTP endpoint listen on all
# interfaces so it can be reached through the head service -- confirm.
serve.run(SentimentAnalysisDeployment.bind(), host="0.0.0.0")

In [None]:
# Look up the running deployment by name to confirm it exists.
# NOTE(review): the "default_" prefix is presumably the default Serve application name
# prepended to the class name -- confirm for the Ray version in use.
serve.get_deployment("default_SentimentAnalysisDeployment")

In [None]:
# Display the Ray dashboard URI for this cluster.
cluster.cluster_dashboard_uri()

In [None]:
# 3: Query the deployment and print the result from inside the cluster.
# The (connect, read) timeout keeps this cell from hanging forever when the
# head service is unreachable or the model never answers.
requests.get(
    "http://rayservice-head-svc.default.svc.cluster.local:8000/",
    params={"text": "What is the purpose of AI?"},
    timeout=(10, 300),
).content

In [None]:
# 4: Query the deployment and print the result from an exposed route.
# An OpenShift Route called ray-service must be created for this to work;
# replace <CLUSTER_ADDRESS> with the address of your cluster.
# The (connect, read) timeout keeps this cell from hanging forever when the
# route is missing or the model never answers.
requests.post(
    "http://ray-service-default.<CLUSTER_ADDRESS>",
    params={"text": "What is the purpose of AI?"},
    timeout=(10, 300),
).content

In [None]:
# Shut down Ray Serve before tearing the cluster down.
serve.shutdown()

In [None]:
# Tear down the Ray cluster that was brought up with cluster.up().
cluster.down()

### Serve a LLAMA 2 model without instascale

In [None]:
# Log in to the cluster with `oc login` first; `oc whoami` then confirms the active user.
! oc whoami

In [None]:
# Apply the llama2-7b-ray.yaml manifest to the cluster.
# NOTE(review): the cells below assume this manifest creates a Ray cluster whose head service
# is test-llama2-head-svc in the default namespace -- confirm against the YAML.
! oc apply -f llama2-7b-ray.yaml

In [None]:
# Install additional libraries that will be required for model serving.
runtime_env = {"pip": ["transformers", "datasets", "evaluate", "pyarrow<7.0.0", "accelerate"]}

# Drop any Ray connection this kernel may still hold (e.g. to the previous cluster).
ray.shutdown()

# Connect to the Ray head service through the Ray client endpoint on port 10001.
ray.init(address="ray://test-llama2-head-svc.default.svc:10001", runtime_env=runtime_env)

print("Ray cluster is up and running: ", ray.is_initialized())

In [None]:
# Hugging Face access token used to download the gated meta-llama/Llama-2-7b-hf weights.
# It is read from the HF_TOKEN environment variable when set and prompted for otherwise,
# so the secret never has to be pasted into (and saved with) the notebook.
mytoken = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

In [None]:
# 1: Wrap the pretrained Llama 2 model in a Serve deployment.
@serve.deployment(num_replicas=1, ray_actor_options={"num_gpus": 1})
class RayServeDeployment:
    """Serve deployment generating text with meta-llama/Llama-2-7b-hf.

    A single replica is requested and it asks Ray for one GPU.
    """

    def __init__(self):
        # Llama 2 is a decoder-only (causal) language model, so it must be loaded with the
        # "text-generation" task; "text2text-generation" only accepts encoder-decoder models
        # and cannot load this checkpoint.
        self._model = pipeline(
            "text-generation",
            model="meta-llama/Llama-2-7b-hf",
            device_map="auto",
            token=mytoken,
        )

    def __call__(self, request: Request) -> Dict:
        """Run the model on the `text` query parameter and return the first result."""
        prompt = request.query_params["text"]
        return self._model(prompt)[0]
    
    
# 2: Deploy the deployment.
# NOTE(review): host="0.0.0.0" presumably makes the Serve HTTP endpoint listen on all
# interfaces so it can be reached through the head service -- confirm.
serve.run(RayServeDeployment.bind(), host="0.0.0.0")

In [None]:
# Look up the running deployment by name to confirm it exists.
# NOTE(review): the "default_" prefix is presumably the default Serve application name
# prepended to the class name -- confirm for the Ray version in use.
serve.get_deployment("default_RayServeDeployment")

In [None]:
# 3: Query the deployment and print the result from inside the cluster.
# The (connect, read) timeout keeps this cell from hanging forever when the
# head service is unreachable or the model never answers.
requests.get(
    "http://test-llama2-head-svc.default.svc.cluster.local:8000/",
    params={"text": "What is the purpose of AI?"},
    timeout=(10, 300),
).content