In [1]:
# Identifier of the pretrained causal LM; passed to `from_pretrained` by the
# deployment defined below.
model_id = "EleutherAI/gpt-j-6B"
# Model revision to load: use float16 weights to fit in 16GB GPUs.
revision = "float16"

In [2]:
import ray

In [3]:
# Attach this notebook to the remote Ray cluster through the Ray Client
# endpoint ("ray://" scheme) exposed by the KubeRay head service.
ray.init(address="ray://example-cluster-kuberay-head-svc:10001")

0,1
Python version:,3.10.9
Ray version:,2.3.0
Dashboard:,http://10.8.3.13:8265


In [4]:
import pandas as pd

from ray import serve
from starlette.requests import Request


# Each replica of this deployment reserves 8 GPUs from the Ray cluster.
@serve.deployment(ray_actor_options={"num_gpus": 8})
class PredictDeployment:
    """Ray Serve deployment that continues text prompts with a causal LM.

    The model and tokenizer are loaded once per replica in ``__init__``.
    HTTP requests are handled by ``__call__``, which collects the prompts from
    the JSON body and delegates to ``generate``.
    """

    def __init__(self, model_id: str, revision: str = None):
        """Load the model weights and the tokenizer.

        Args:
            model_id: Model identifier handed to ``from_pretrained``.
            revision: Optional model revision for the weights. It is not
                applied to the tokenizer, which is loaded from the default
                revision of ``model_id``.
        """
        # Imported here rather than at module level so that only the process
        # that instantiates the deployment needs these packages.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",  # automatically makes use of all GPUs available to the Actor
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Batches of prompts with different token lengths must be padded to a
        # common length before they can be stacked into one tensor. Tokenizers
        # that ship without a pad token fall back to the end-of-sequence token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models continue from the last position of each row, so
        # the padding has to go on the left, not between prompt and output.
        self.tokenizer.padding_side = "left"

    def generate(self, text: "str | list[str]") -> pd.DataFrame:
        """Sample one continuation per prompt.

        Args:
            text: A single prompt or a list of prompts.

        Returns:
            A DataFrame with a single ``responses`` column holding, for each
            prompt in order, the decoded prompt-plus-continuation text. Special
            tokens (padding / end-of-sequence markers) are stripped from the
            decoded text. An empty list of prompts yields an empty DataFrame.
        """
        prompts = [text] if isinstance(text, str) else list(text)
        if not prompts:
            # Nothing to tokenize; return the same schema with zero rows
            # instead of failing inside the tokenizer.
            return pd.DataFrame(columns=["responses"])

        encoded = self.tokenizer(prompts, return_tensors="pt", padding=True)
        input_ids = encoded.input_ids.to(self.model.device)
        # The mask tells the model which positions are padding.
        attention_mask = encoded.attention_mask.to(self.model.device)

        gen_tokens = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.9,
            max_length=100,  # total length cap, prompt tokens included
            pad_token_id=self.tokenizer.pad_token_id,
        )
        return pd.DataFrame(
            # Skip special tokens so the padding added above does not leak
            # into the response strings.
            self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True),
            columns=["responses"],
        )

    async def __call__(self, http_request: Request) -> pd.DataFrame:
        """Handle an HTTP request carrying prompts as JSON.

        The body is either one object or a list of objects, each with a
        ``"text"`` entry that is a single prompt or a list of prompts. All
        prompts are flattened, in order, into one batch.

        Returns:
            The DataFrame produced by ``generate``.

        Raises:
            KeyError: If an object in the body has no ``"text"`` entry.
        """
        json_request = await http_request.json()
        # Accept a bare object as shorthand for a one-element list; iterating
        # a dict directly would walk its keys and fail on the lookup below.
        if isinstance(json_request, dict):
            json_request = [json_request]

        prompts = []
        for prompt in json_request:
            text = prompt["text"]
            if isinstance(text, list):
                prompts.extend(text)
            else:
                prompts.append(text)
        return self.generate(prompts)

In [5]:
# Bind the constructor arguments; the deployment class itself is only
# instantiated once the application is run.
deployment = PredictDeployment.bind(
    model_id=model_id,
    revision=revision,
)
# Serve on all network interfaces so the endpoint is reachable from outside
# the head node. Kept as the last expression so the returned handle is shown.
serve.run(
    deployment,
    host="0.0.0.0",
)

RayServeSyncHandle(deployment='PredictDeployment')

In [7]:
import requests

# Alternative, longer prompt kept for experimentation:
#    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
#    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
#    "researchers was the fact that the unicorns spoke perfect English."
# The prompt text below is kept verbatim so it matches the recorded output.
prompt = "To find Flanders, you just hanve to think like Flanders."

# The deployment expects a list of objects, each carrying its prompt under "text".
sample_input = {"text": prompt}

response = requests.post(
    "http://example-cluster-kuberay-head-svc:8000/",
    json=[sample_input],
    timeout=300,  # generous bound so a stuck request cannot hang the kernel forever
)
# Surface HTTP errors directly instead of failing later while decoding a
# non-JSON error body.
response.raise_for_status()
output = response.json()
print(output)

[{'responses': "To find Flanders, you just hanve to think like Flanders. That's it. It really is that simple. Flanders is in the north, which is the end of the land...\n\nI really like this book. It's a little history book about the Flanders region of Belgium. In Flanders, the people are always telling a story or having a discussion to help you to get to the end of the story. The book follows that pattern...\n\nI love"}]
