In [None]:
! pip install matplotlib

In [None]:
# ! pip install Pillow # Requirement already satisfied: Pillow in /home/ray/anaconda3/lib/python3.9/site-packages (9.5.0)

In [None]:
import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.train.xgboost import XGBoostPredictor
from ray.train.batch_predictor import BatchPredictor
from ray import tune
from ray.tune import Tuner, TuneConfig
from ray import serve
from ray.serve import PredictorDeployment
from ray.serve.http_adapters import pandas_read_json

import requests, json
from starlette.requests import Request
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
from io import BytesIO

# Ray Serve

## Intro

### Outline

-   Deployments
    -   Resources (CPU/GPU/custom)
    -   Runtime environments support, usage (functionality)
    -   Bound deployments, ServeHandles
-   Composition Patterns
    -   Imperative
    -   Declarative / Graph Deployment API
-   Architecture / Under-the-hood
    -   Ray cluster perspective - processes / workers / actors
    -   Request routing, queuing, load balancing in Serve
-   Scaling and Performance
    -   Replicas
        -   num_replicas, autoscaling_config, max_concurrent_queries
    -   Request batching

### Example scenario: computer vision services

For our example use case, we’ll see how to leverage Ray Serve to host a CV segmentation
model and how to enhance it using additional services such as image preprocessing.

### Context: Ray AIR

Ray AIR is the Ray AI Runtime, a set of high-level easy-to-use APIs for
ingesting data, training models – including reinforcement learning
models – tuning those models and then serving them.

<img src="https://technical-training-assets.s3.us-west-2.amazonaws.com/Introduction_to_Ray_AIR/e2e_air.png" width=600 loading="lazy"/>

Key principles behind Ray and Ray AIR are
* Performance
* Developer experience and simplicity

__Read, preprocess with Ray Data__

In [None]:
# Read the NYC taxi parquet file from S3 into a Ray Dataset and repartition it (16 requested).
dataset = ray.data.read_parquet("s3://anonymous@anyscale-training-data/intro-to-ray-air/nyc_taxi_2021.parquet").repartition(16)

# Split with test_size=0.3: the second dataset is the held-out validation portion.
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

__Fit model with Ray Train__

In [None]:
# Configure an XGBoost binary-logistic trainer that predicts the `is_big_tip` column,
# using 4 workers without GPUs and the train/valid datasets created above.
trainer = XGBoostTrainer(
    label_column="is_big_tip",
    scaling_config=ScalingConfig(num_workers=4, use_gpu=False),
    params={ "objective": "binary:logistic", },
    datasets={"train": train_dataset, "valid": valid_dataset},
)

# Run training; `result.checkpoint` is what the online-serving cell below deploys.
result = trainer.fit()

__Optimize hyperparams with Ray Tune__

In [None]:
# Wrap the trainer in a Tuner: sample `max_depth` from tune.randint(2, 12) three times
# and rank trials by the smallest 'train-logloss'.
tuner = Tuner(trainer, 
            param_space={'params' : {'max_depth': tune.randint(2, 12)}},
            tune_config=TuneConfig(num_samples=3, metric='train-logloss', mode='min'))

# Run the search and keep the best trial's checkpoint for batch prediction.
checkpoint = tuner.fit().get_best_result().checkpoint

__Batch prediction__

In [None]:
# Build a batch predictor from the tuned checkpoint.
batch_predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)

# Score the validation rows with the label column removed.
predicted_probabilities = batch_predictor.predict(valid_dataset.drop_columns(['is_big_tip']))

__Online prediction with Ray Serve__

In [None]:
# Bind a predictor deployment that decodes requests with the pandas JSON adapter.
# Note: this serves `result.checkpoint` (from trainer.fit()), not the tuned `checkpoint`.
deployment = PredictorDeployment.bind(XGBoostPredictor, result.checkpoint, http_adapter=pandas_read_json)

# Start the Serve application.
serve.run(deployment)

__HTTP or Python services__

In [None]:
# Take one validation row as a plain dict and drop the label and the index column
# before sending it, so only feature columns are posted.
sample_input = dict(valid_dataset.take(1)[0])
del(sample_input['is_big_tip'])
del(sample_input['__index_level_0__'])
# POST the single-row list as JSON and display the decoded JSON reply.
requests.post("http://localhost:8000/", json=[sample_input]).json()

In [None]:
# Shut Serve down before starting the next example.
serve.shutdown()

# Ray Serve

Serve is a microservices framework for serving ML – the model serving
component of Ray AIR.

<img src='https://technical-training-assets.s3.us-west-2.amazonaws.com/Ray_Serve/serve_architecture.png' width=700/>

# Deployments

`Deployment` is the fundamental user-facing element of serve.

<img src='https://technical-training-assets.s3.us-west-2.amazonaws.com/Ray_Serve/deployment.png' width=600/>

## Our First Service

Let’s jump right in and get something simple up and running on Ray
Serve.

In [None]:
@serve.deployment
class TextConverter:
    """Minimal deployment: upper-cases text and wraps it in asterisks."""

    def convert(self, text):
        """Return `text` upper-cased with '***' on both sides."""
        shouted = str.upper(text)
        return f"***{shouted}***"

In [None]:
# Bind the deployment (no constructor arguments) and start it; keep the returned handle for Python calls.
app_handle = serve.run(TextConverter.bind())

## Key APIs and concepts

`Deployment` represents a service and is created with the `@serve.deployment` decorator
* As end users, we don't instantiate `Deployment`s directly
* Ray will create them as actors, per our scaling requirements

A __bound deployment__ is created with the `.bind` class method on the deployment class
* e.g., `TextConverter.bind()` above creates a bound deployment
* `.bind` allows us to provide constructor params for the deployment class (e.g., `Threshold.bind(threshold=128)` later in this notebook)
* bound deployments *can* be passed to other deployments via `.bind` -- this is one way to compose services
* We can pass a bound deployment to `serve.run(...)`
    * to start a service
    * to obtain a `ServeHandle`

A `ServeHandle` can be used to invoke services through the Python API
* At runtime, services can call other services via serve handles
    * Bound deployments provided to deployment constructors via `.bind` become serve handles at runtime

In [None]:
# Inspect the object returned by serve.run: first its type, then its printed form.
print(type(app_handle))
print(app_handle)

Look at Actors in the dashboard. Why are deployment replicas actors?

In [None]:
# Invoke the deployment method through the handle; the displayed value is what `.remote` returns.
app_handle.convert.remote("cat")

In [None]:
# Pass the value returned by `.remote` to ray.get to retrieve the actual result.
ray.get(app_handle.convert.remote("cat"))

Ok, we have a minimal deployment built and running!

In [None]:
# Shut Serve down before building the next example.
serve.shutdown()

What do we want to do next?
* Support some image processing
* Support HTTP

Then...
* Image segmentation with SegFormer
* Service composition -- e.g., grayscale/resize/sharpen/etc. and then segment

And finally...
* Manage GPUs (and resources generally)
* Multiple replicas, autoscaling

In [None]:
@serve.deployment
class Threshold:
    """Deployment that binarizes an image array against a fixed cutoff."""

    def __init__(self, threshold: int):
        # cutoff captured once, when the deployment is constructed
        self._threshold = threshold

    def get_response(self, image):
        """Return an array like `image` with 255 written where `image` exceeds the cutoff.

        Starts from zeros, writes 255 at every position above the cutoff, then
        zeroes every entry that is still below 255.
        """
        binarized = np.zeros_like(image)
        above_cutoff = image > self._threshold
        binarized[above_cutoff] = 255
        binarized[binarized < 255] = 0
        return binarized

# Start the thresholding service with a cutoff of 128 as the named application 'hello_image_world'.
app_handle = serve.run(Threshold.bind(threshold=128), name='hello_image_world')

In [None]:
# Load the sample image from the working directory and display it.
im = Image.open("cat.jpg")

im

In [None]:
# Resize to 512x384 (the size tuple passed to resize) and display again.
im = im.resize((512,384))

im

In [None]:
# Shape of the flattened pixel data before reshaping.
np.array(im.getdata()).shape

In [None]:
# Reshape the flat pixel data into rows of 512 pixels with 3 values each.
arr = np.array(im.getdata())
arr = arr.reshape(-1, 512, 3)

plt.imshow(arr)

In [None]:
# Average over the last axis and render the result with a gray colormap.
plt.imshow(arr.mean(axis=2), cmap='gray')

In [None]:
# Send the array to the thresholding deployment through the handle.
output_ref = app_handle.get_response.remote(arr)

In [None]:
# Fetch the thresholded array and display it.
plt.imshow(ray.get(output_ref))

In [None]:
# Remove this application so it can be redeployed with HTTP support below.
serve.delete('hello_image_world')

Add HTTP ... this is a bit messier just because of conversion between bytes, arrays, and HTTP tools

In [None]:
@serve.deployment
class Threshold:
    """Image-thresholding deployment that can be called from Python or over HTTP."""

    def __init__(self, threshold: int):
        self._threshold = threshold # initial state

    def get_response(self, image):
        """Return an array like `image` with 255 written where `image` exceeds the threshold.

        Starts from zeros, writes 255 above the threshold, then zeroes every
        entry that is still below 255.
        """
        new_image = np.zeros_like(image)
        new_image[image > self._threshold] = 255
        new_image[new_image < 255] = 0
        return new_image

    # a lot of boilerplate as HTTP adapter for images + ndarrays (a text/JSON example would be about 3 lines)
    async def __call__(self, request: Request) -> Dict:
        """Handle an HTTP POST whose body is a pickled image; reply with a JPEG.

        Args:
            request: incoming Starlette request; its raw body is unpickled.

        Returns:
            A `fastapi.Response` carrying the thresholded image as 'image/jpeg'
            (the `Dict` annotation is kept only for interface stability).
        """
        from fastapi import Response

        # async collect POST body
        body = await request.body()

        # SECURITY: pickle.loads executes arbitrary code embedded in the payload.
        # This is acceptable only for a local demo with a trusted client; a real
        # service should accept encoded image bytes (e.g. PNG/JPEG) instead.
        image = pickle.loads(body)

        # get NDArray for our image processing
        data = np.array(image)

        # invoke existing business logic
        transformed_data = self.get_response(data)

        # convert to image
        transformed_image = Image.fromarray(transformed_data.astype(np.uint8))

        # encode as JPEG with Pillow directly (no extra imageio dependency needed)
        with BytesIO() as buf:
            transformed_image.save(buf, format="JPEG")
            im_bytes = buf.getvalue()

        # prepare and return HTTP Response
        headers = {'Content-Disposition': 'inline'} # ; filename="test.jpeg"'}
        return Response(im_bytes, headers=headers, media_type='image/jpeg')

# Redeploy the HTTP-capable Threshold under the same application name.
app_handle = serve.run(Threshold.bind(threshold=128), name='hello_image_world')

Threshold our cat via HTTP

(if we are working with arrays on the client side and want to make an image from an array, we'd call `Image.fromarray(my_array)`)

In [None]:
# POST the pickled PIL image; the service's __call__ unpickles the request body.
response = requests.post("http://localhost:8000/", data = pickle.dumps(im)) # uncompressed

response

In [None]:
# Decode the bytes in the HTTP response back into an image for display.
Image.open(BytesIO(response.content))

## Build a semantic segmentation service on SegFormer

At this point, we've done all the hard work -- we know the structure of our service code.

In this use case, we're going to build and test
* SegFormer-based segmentation service
* Image prep service (as a demo, we'll just convert the image to grayscale, but feel free to experiment with other transformations)
* an Ingress service, to separate the HTTP handling code from our other components

### Segmentation service

In [None]:
from utils import get_labels

In [None]:
# Label mappings from the local `utils` helper; both are passed to the model loader in Segmenter.__init__.
id2label, label2id = get_labels()

### Load the feature extractor

In [None]:
@serve.deployment
class Segmenter:
    """Semantic-segmentation deployment built on a pretrained SegFormer checkpoint."""

    def __init__(self, model_name, id2label, label2id):
        """Load the model and its feature extractor from `model_name`.

        Args:
            model_name: identifier passed to both `from_pretrained` calls.
            id2label: label mapping forwarded to the model loader.
            label2id: label mapping forwarded to the model loader.
        """
        self.model = SegformerForSemanticSegmentation.from_pretrained(
            model_name, id2label=id2label, label2id=label2id
        )
        self.feature_extractor = SegformerFeatureExtractor.from_pretrained(
            model_name, do_reduce_labels=True
        )

    def segment(self, batch: list) -> list[np.ndarray]:
        """Run inference on a batch of images and return one NumPy map per image.

        Each element of `batch` must expose a sliceable `.size` attribute
        (presumably PIL images -- verify against callers).
        """
        # Use CUDA when available, otherwise CPU; put the model there in eval mode.
        if torch.cuda.is_available():
            target_device = torch.device("cuda")
        else:
            target_device = torch.device("cpu")
        self.model.to(target_device)
        self.model.eval()

        # Raw images -> model-ready tensors.
        encoded = self.feature_extractor(images=batch, return_tensors="pt")

        # Forward pass without gradient tracking.
        with torch.no_grad():
            pixel_values = encoded.pixel_values.to(target_device)
            model_outputs = self.model(pixel_values=pixel_values)

        # Each image's `size` tuple, reversed, is the target size for post-processing.
        target_sizes = []
        for image in batch:
            target_sizes.append(image.size[::-1])

        processed_maps = self.feature_extractor.post_process_semantic_segmentation(
            outputs=model_outputs, target_sizes=target_sizes
        )

        # Detach, move to CPU and convert each map to a NumPy array.
        result = []
        for seg_map in processed_maps:
            result.append(seg_map.detach().cpu().numpy())
        return result

In [None]:
# Segmenter.__init__ requires the model name plus both label mappings; bind all three.
segmenter = Segmenter.bind("nvidia/segformer-b0-finetuned-ade-512-512", id2label, label2id)

In [None]:
# Remove the thresholding application started earlier (it was named 'hello_image_world')
# so its route is free for the segmentation application.
serve.delete('hello_image_world')

In [None]:
# Start the segmentation application and keep its handle.
handle = serve.run(segmenter, name='seg')

In [None]:
# Segment the current image; the method takes a list, so wrap the single image.
out = handle.segment.remote([im])

In [None]:
# Fetch the results and show the map for the first (only) image.
plt.imshow(ray.get(out)[0])

In [None]:
# Display the image converted with mode "L" (not assigned, so `im` is unchanged).
im.convert("L")

In [None]:
# Current image size.
im.size

In [None]:
# Segment a 360x240 resized copy of the image.
out2 = handle.segment.remote([im.resize((360,240))])

In [None]:
# Show the map for the resized input.
plt.imshow(ray.get(out2)[0])

In [None]:
# (Image is already imported in the notebook's import cell; repeated here.)
from PIL import Image

# Rebinds `im` to a second sample image and displays it.
im = Image.open("./people-room.jpg")

im

In [None]:
# (pyplot is already imported in the notebook's import cell; repeated here.)
import matplotlib.pyplot as plt

In [None]:
# Look up the label name stored for class id 5.
id2label[5]

In [None]:
# NOTE(review): `handle` was last assigned from serve.run(segmenter, name='seg'), and the
# Segmenter class in this notebook defines `segment` but no `get_response` -- confirm which
# deployment this cell is meant to call.
response = handle.get_response.remote('hello')
response

In order to support maximal performance, values from remote calls, such as our response string here, are returned as object references (a bit like futures or promises in some frameworks). If we want to block, wait for the result to be ready, and retrieve it, we can use `ray.get(...)`

In [None]:
# Block until the remote call finishes and retrieve its value.
ray.get(response)

Since we'll be creating a new application example, we can delete the old one -- that allows Ray to remove the replicas of the previous application's deployments.

In [None]:
# The application running at this point is the segmentation app started with name='seg';
# delete it so the next application can be served.
serve.delete('seg')

## Specifying service resources

Resources can be specified on a per-deployment basis and, if we want, in fractional units, via the `ray_actor_options` parameter on the `@serve.deployment` decorator.

As a realistic example, we can upgrade the "hello world" chatbot to use a Huggingface LLM employing GPU resources.

In [None]:
@serve.deployment(ray_actor_options={'num_gpus': 0.5})
class Chat:
    """Chatbot deployment backed by a Hugging Face seq2seq model.

    Requests 0.5 GPU through `ray_actor_options`. Needs `AutoTokenizer` and
    `AutoModelForSeq2SeqLM` from `transformers` (imported in the notebook's import cell).
    """

    def __init__(self, model: str):
        """Load tokenizer and model once, as the stateful part of the service.

        Args:
            model: model identifier passed to both `from_pretrained` calls.
        """
        self._tokenizer = AutoTokenizer.from_pretrained(model)
        # `.to(0)` moves the model to device index 0
        self._model = AutoModelForSeq2SeqLM.from_pretrained(model).to(0)

    async def __call__(self, request: Request) -> Dict:
        """HTTP entry point: decode the payload and delegate to `get_response`.

        The body must provide 'user_input' and 'history'.
        """
        data = await request.json()
        # The notebook's client posts a JSON *string* via requests' json=, so the decoded
        # body is a str that needs a second parse; a plain JSON object is accepted as well.
        if isinstance(data, str):
            data = json.loads(data)
        # after decoding the payload, we delegate to get_response for logic
        return {'response': self.get_response(data['user_input'], data['history']) }

    def get_response(self, user_input: str, history: list[str]) -> str:
        """Generate a reply; called directly from Python or from `__call__` for HTTP.

        Args:
            user_input: the new user message.
            history: earlier turns as raw strings (client-side state); not modified.

        Returns:
            The first decoded reply with special tokens removed.
        """
        # build the conversation without mutating the caller's list
        turns = [*history, user_input]
        # for the default config of the model and tokenizer, history should be joined with '</s><s>'
        inputs = self._tokenizer('</s><s>'.join(turns), return_tensors='pt').to(0)
        reply_ids = self._model.generate(**inputs, max_new_tokens=500)
        response = self._tokenizer.batch_decode(reply_ids.cpu(), skip_special_tokens=True)[0]
        return response

Resources can include
* `num_cpus`
* `num_gpus`
* `resources` dictionary containing custom resources
    * custom resources are tracked and accounted as symbols (or tags) in order to match actors to workers
    
Example
```python
@serve.deployment(ray_actor_options={'num_cpus' : 2, 'num_gpus' : 2, 'resources' : {"my_super_accelerator": 1}})
class Demo:
    ...
```

The purpose of the declarative resource mechanism is to allow Ray to place code on suitable nodes in a heterogeneous cluster without our having to know which nodes have which resources or where our code should run.

> Best practice: if some nodes have a distinguishing feature, mark and request it as a resource, rather than trying to determine which nodes are present and where your code will run.

For more details, see https://docs.ray.io/en/latest/serve/scaling-and-resource-allocation.html#resource-management-cpus-gpus

In [None]:
# Bind the chat deployment to a specific model name.
chat = Chat.bind(model='facebook/blenderbot-400M-distill')

# Start it as the named application 'basic_chat' and keep the handle.
handle = serve.run(chat, name='basic_chat')

### Runtime environments

We have many options for managing dependencies -- e.g., Python libraries and versions, resource files, etc.

Dependencies can be provided at the level of Node/VM/container, Ray jobs, actors, tasks, and more.

With Ray Serve, we can optionally specify environment requirements at the `Deployment` level, and Ray will ensure that the specified environment is available to that deployment.

In the following example, we'll create 
* some services that use libraries available in our general Ray environment
* a service that requires a specific Python library (a language detector library) to illustrate the environment feature

Since we are discussing dependencies, it's important to remember that it's a good practice to keep as many dependencies as possible in our general Ray worker environments, and to import them as usual.

> Just because we *can* create lots of custom environments in our code doesn't mean we *should*

In this first service, we import `pipeline` from Huggingface transformers. Later, the specific pipeline we need will require `sentencepiece`. We'll demo installing `sentencepiece` via the Runtime Environment. 

Beyond just specifying the library, we have to be careful about the order of imports and other calls, to ensure we don't need something from the library before it's available. We ensure that by delaying imports or use of anything with a relevant import until an actual method is called on our service. We can capture variables as usual in the constructor.

In [None]:
# Runtime environment that lists `sentencepiece` as a pip requirement; used by Translate below.
runtime_env = {"pip": ["sentencepiece"]}

In [None]:
@serve.deployment(ray_actor_options={"runtime_env" : runtime_env})
class Translate:
    """Translation deployment that builds its Hugging Face pipeline lazily.

    The constructor only records configuration; the pipeline is created on the
    first request, once the deployment's runtime environment is in place.
    """

    def __init__(self, task: str, model: str):
        """Record the pipeline task and model name without loading anything yet."""
        self._task = task
        self._model = model
        self._pipeline = None

    def get_response(self, user_input: str) -> str:
        """Translate `user_input` and return the first result's 'translation_text'."""
        if self._pipeline is None:
            # Imported here because `pipeline` is not imported at notebook level;
            # the import is deferred together with the pipeline construction.
            from transformers import pipeline

            self._pipeline = pipeline(task=self._task, model=self._model)
        outputs = self._pipeline(user_input)
        response = outputs[0]['translation_text']
        return response
        
# Two bound deployments from the same class, differing only in task and model.
translate_en_fr = Translate.bind(task='translation_en_to_fr', model='t5-small')
translate_fr_en = Translate.bind(task='translation_fr_to_en', model='Helsinki-NLP/opus-mt-fr-en')

Notice how we have two different services but they are built on the same reusable code by calling `.bind()` with different initialization parameters.

*We don’t need to define new deployments for every service we use.*

This time we haven't published an application (via `serve.run()`) because these components will be invoked only by our main service deployment.

## Composition patterns

Let's bring the whole system together. We'll implement a service which represents our external endpoint for HTTP or Python invocations.
* This service will have references to the deployments we've built so far, and will implement some conditional logic to ensure the correct language is used
* Note that even if the user is interacting in French, we need to return the English response as well so that client can use that to build the chat history

### Imperative pattern

In [None]:
@serve.deployment
class Endpoint:
    """Ingress deployment that orchestrates language detection, translation and chat."""

    def __init__(self, chat, lang_detect, translate_en_fr, translate_fr_en):
        """Store the four dependent services passed in at bind time."""
        # assign dependent service handles to instance variables
        self._chat = chat
        self._lang_detect = lang_detect
        self._translate_en_fr = translate_en_fr
        self._translate_fr_en = translate_fr_en

    async def __call__(self, request: Request) -> Dict:
        """HTTP entry point: decode the body and delegate to `get_response`."""
        data = await request.json()
        # the decoded body is parsed a second time, so the client must post a JSON-encoded string
        data = json.loads(data)
        return {'response': await self.get_response(data['user_input'], data['history']) }
    
    async def get_response(self, user_input: str, history: list[str]):
        """Return 'response|user_input|response_en' as a single '|'-joined string.

        NOTE(review): this relies on `await handle.method.remote(...)` yielding an
        awaitable reference that is awaited again for its value -- confirm against
        the installed Ray Serve version.
        """
        lang_obj_ref = await self._lang_detect.get_response.remote(user_input)
        
        # if we didn't need the literal value of the language yet, we could pass that (future) object reference to other services
        # here, though, we need the value in order to decide whether to call the translation services
        # we get the Python value by awaiting the object reference
        lang = await lang_obj_ref

        # French input: translate to English first; `user_input` now holds a reference, not a str
        if (lang == 'fr'):
            user_input = await self._translate_fr_en.get_response.remote(user_input)

        # both names start out bound to the same chat result
        response = response_en = await self._chat.get_response.remote(user_input, history)
        
        if (lang == 'fr'):
            # translate the English reply back to French; resolve the translated user input
            response = await self._translate_en_fr.get_response.remote(response_en)
            user_input = await user_input
            
        # resolve the remaining references to values before building the output string
        response = await response
        response_en = await response_en
        
        return response  + '|' + user_input + '|' + response_en

# Bind the chat model and wire all four dependencies into the endpoint.
# NOTE(review): `lang_detect` is not defined in any cell visible in this notebook --
# confirm where it is bound before running this cell.
chat = Chat.bind(model='facebook/blenderbot-400M-distill')
endpoint = Endpoint.bind(chat, lang_detect, translate_en_fr, translate_fr_en)

# Start the composed application and keep a handle to the endpoint.
endpoint_handle = serve.run(endpoint, name = 'multilingual_chat')

We've implemented control flow through our services and used the async/await pattern in several places so that we don't unnecessarily block.

Then we construct the service endpoint and start a new application serving that endpoint.

In [None]:
# First turn: empty history. The reply is a '|'-joined string; field 0 is the response to show.
message = 'My friends are cool but they eat too many carbs.'
history = []
response = ray.get(endpoint_handle.get_response.remote(message, history))
response.split('|')[0]

In [None]:
# Append the remaining fields of the reply to the client-side history.
# Re-running this cell appends them again.
history += response.split('|')[1:]
history

In [None]:
# Second turn, in French, sent with the accumulated history.
message = 'Je ne suis pas sûr.'
response = ray.get(endpoint_handle.get_response.remote(message, history))
response.split('|')[0]

In [None]:
# Extend the history with this turn's fields as before.
history += response.split('|')[1:]
history

At this point we have a service which can support the many functional and operational properties we expect to need in production, including scalability, separation of concerns, and composability.

In [None]:
# Remove the composed application before the deployment-graph example.
serve.delete('multilingual_chat')

### Declarative pattern: Deployment Graph API

What is the Deployment Graph API?

* The Deployment Graph API lets us separate the flow of calls from the logic inside our services.

Why might we want to use the Deployment Graph (DAG) API to separate flow from logic?

* It may be valuable to add a layer of indirection – or abstraction – so that we can more easily create and compose reusable services
* The DAG API lets us use similar patterns across the Ray platform (e.g., Ray Workflow)
    * We can learn one general pattern for graphs and use that intuition in multiple places in our Ray applications
* Although we compose one DAG, we retain the key Ray Serve features of granular autoscaling and resource allocation

Let’s reproduce our chat service flow using the Deployment Graph API

#### Getting started with deployment graphs

As a first step, to keep things simple, let’s assume for a moment that we are always interacting with the service in French. 

<img src='https://technical-training-assets.s3.us-west-2.amazonaws.com/Ray_Serve/deployment_graph_simple.png' width=900/>

In [None]:
from ray.serve.dag import InputNode
from ray.serve.drivers import DAGDriver

`InputNode` is a special type of graph node, defined by Ray Serve, which represents values supplied to our service endpoint. 

We can only have one `InputNode` but we can get access to multiple parameters from that node using a Python context manager.

In [None]:
# Name the two positional inputs of the graph: index 0 is the user text, index 1 the history.
with InputNode() as inp:
    user_input = inp[0]
    history = inp[1]

Here is a minimal, linear pipeline that allows us to begin a chat in French.

We build up the graph step by step, `bind`ing each deployment to its dependencies.

In [None]:
# Linear graph: French->English translation, then chat, then English->French translation.
user_input_en = translate_fr_en.get_response.bind(user_input)    # French->English translator depends on the user input text
chat_response = chat.get_response.bind(user_input_en, history)   # the chat deployment requires the English user input and the history
output = translate_en_fr.get_response.bind(chat_response)        # English->French translator depends on the English chat output
serve_dag = DAGDriver.bind(output)                               # the graph returns the output from the English->French translator

# Start the graph as the named application 'basic_linear'.
handle = serve.run(serve_dag, name='basic_linear')

We start the application by calling `serve.run()` on the DAGDriver, a Ray Serve component which routes HTTP requests through your call graph.

In [None]:
# Call the graph from Python: the two arguments map to inp[0] (text) and inp[1] (history).
ray.get(handle.predict.remote('Mes amis sont cool mais ils mangent trop de glucides.', []))

In [None]:
# Remove the graph application.
serve.delete('basic_linear')

## Architecture / under-the-hood

### Ray cluster perspective: actors

In Ray, user code is executed by worker processes. These workers can run tasks (stateless functions) or actors (stateful class instances).

Ray Serve is built on actors, allowing deployments to collect expensive state once (such as loading a ML model) and to reuse it across many service requests.

Although you may never need to code any Ray tasks or actors yourself, your Ray Serve application has full access to those cluster capabilities and you may wish to use them to implement other functionality (e.g., service or operations that don't need to accept HTTP traffic). More information is at https://docs.ray.io/en/latest/ray-core/walkthrough.html

### Serve design

Under the hood, a few other actors are used to make up a serve instance.

* Controller: A global actor unique to each Serve instance is responsible for managing other actors. Serve API calls like creating or getting a deployment make remote calls to the Controller.

* HTTP Proxy: By default there is one HTTP proxy actor on the head node that accepts incoming requests, forwards them to replicas, and responds once they are completed. For scalability and high availability, you can also run a proxy on each node in the cluster via the location field of http_options.

* Deployment Replicas: Actors that execute the code in response to a request. Each replica processes requests from the HTTP proxy.
<img src='https://docs.ray.io/en/latest/_images/architecture-2.0.svg' width=700 />

Incoming requests, once resolved to a particular deployment, are queued. The requests from the queue are assigned round-robin to available replicas as long as capacity is available. This design provides load balancing and elasticity. 

Capacity can be managed with the `max_concurrent_queries` parameter to the deployment decorator. This value defaults to 100 and represents the maximum number of queries that will be sent to a replica of this deployment without receiving a response. Each replica has its own queue to collect and smooth incoming request traffic.

## Scaling and performance

### Replicas and autoscaling

Each deployment can have its own resource management and autoscaling configuration, with several options for scaling.

By default -- if nothing is specified, as in our examples above -- each deployment runs a single replica. We can specify a larger, constant number of replicas in the decorator:
```python
@serve.deployment(num_replicas=3)
```

For autoscaling, instead of `num_replicas`, we provide an `autoscaling_config` dictionary. With autoscaling, we can specify a minimum and maximum range for the number of replicas, the initial replica count, a load target, and more.

Here is an example of an extended configuration -- see https://docs.ray.io/en/latest/serve/scaling-and-resource-allocation.html#scaling-and-resource-allocation for more details:

```python
@serve.deployment(
    autoscaling_config={
        'min_replicas': 1,
        'initial_replicas': 2,
        'max_replicas': 5,
        'target_num_ongoing_requests_per_replica': 10,
    }
)
```

`min_replicas` can also be set to zero to create a "serverless" style design: in exchange for potentially slower startup, no actors (or their CPU/GPU resources) need to be permanently reserved.

### Autoscaling LLM chat

The LLM-based chat service is a good example for seeing autoscaling in action, because LLM inference is relatively expensive so we can easily build up a queue of requests to the service. The autoscaler responds to the dynamics of queue sizes and will launch additional replicas.

In [None]:
# Shut Serve down before the autoscaling example.
serve.shutdown()

In [None]:
@serve.deployment(ray_actor_options={'num_gpus': 0.5}, autoscaling_config={ 'min_replicas': 1, 'max_replicas': 4 })
class Chat:
    """Autoscaling chatbot deployment backed by a Hugging Face seq2seq model.

    Requests 0.5 GPU through `ray_actor_options`, with autoscaling between 1 and
    4 replicas. Needs `AutoTokenizer` and `AutoModelForSeq2SeqLM` from
    `transformers` (imported in the notebook's import cell).
    """

    def __init__(self, model: str):
        """Load tokenizer and model from `model`; the model is moved to device index 0."""
        self._tokenizer = AutoTokenizer.from_pretrained(model)
        self._model = AutoModelForSeq2SeqLM.from_pretrained(model).to(0)

    async def __call__(self, request: Request) -> Dict:
        """HTTP entry point: decode the payload and delegate to `get_response`."""
        data = await request.json()
        # The notebook's client posts a JSON *string* via requests' json=, so the decoded
        # body is a str that needs a second parse; a plain JSON object is accepted as well.
        if isinstance(data, str):
            data = json.loads(data)
        return {'response': self.get_response(data['user_input'], data['history']) }

    def get_response(self, user_input: str, history: list[str]) -> str:
        """Generate a reply to `user_input` given earlier turns in `history` (not modified)."""
        # build the conversation without mutating the caller's list
        turns = [*history, user_input]
        inputs = self._tokenizer('</s><s>'.join(turns), return_tensors='pt').to(0)
        reply_ids = self._model.generate(**inputs, max_new_tokens=500)
        response = self._tokenizer.batch_decode(reply_ids.cpu(), skip_special_tokens=True)[0]
        return response
    
# Bind the autoscaling chat deployment to a model name.
chat = Chat.bind(model='facebook/blenderbot-400M-distill')

# Start it as the named application 'autoscale_chat'.
handle = serve.run(chat, name='autoscale_chat')

We can generate a little load and look at the Ray Dashboard

What do we expect to see?

* Autoscaling of the Chat service up to 4 replicas
* Efficient use of fractional GPU resources
    * If our cluster has just 2 GPUs, we can run 4 replicas there

In [None]:
def make_request(s):
    """POST `s` as the JSON body to the local Serve endpoint and return the decoded JSON reply."""
    reply = requests.post("http://localhost:8000/", json=s)
    return reply.json()

# The payload is already a JSON string; make_request passes it via json=, so it is encoded once more on the wire.
sample = '{ "user_input" : "Hello there, chatbot!", "history":[] }'
make_request(sample)

In [None]:
# Thread pool with 16 workers used to issue requests concurrently (reused by the batching demo below).
executor = ThreadPoolExecutor(max_workers=16)

# Submit 40 identical requests through the pool.
results = executor.map(make_request, ['{ "user_input" : "Hello there, chatbot!", "history":[] }'] * 40)

# Consuming the iterator waits for all replies.
list(results)

In [None]:
# Remove the autoscaling application.
serve.delete('autoscale_chat')

### Request batching

In [None]:
@serve.deployment()
class Chat:
    """Toy deployment that reports how many requests were batched together."""

    def __init__(self):
        # fixed prefix for every reply
        self._message = "Chatbot counts the batch size at "

    @serve.batch(max_batch_size=5)
    async def handle_batch(self, request_batch):
        """Return one reply per batched request, each stating the batch size."""
        batch_size = len(request_batch)
        reply = {'response': self._message + str(batch_size)}
        # the same reply object is repeated once per request in the batch
        return [reply for _ in range(batch_size)]

    async def __call__(self, request: Request) -> Dict:
        """HTTP entry point: decode the body twice and pass it to the batched handler."""
        decoded_body = await request.json()
        payload = json.loads(decoded_body)
        return await self.handle_batch(payload)
    
# Bind the batching demo deployment (no constructor arguments).
chat = Chat.bind()

# Start it as the named application 'batch_chat'.
handle = serve.run(chat, name='batch_chat')

In [None]:
# Send 100 identical requests through the existing thread pool to exercise batching.
results = executor.map(make_request, ['{ "user_input" : "Hello there, chatbot!", "history":[] }'] * 100)

In [None]:
# Collect the replies; each one reports the batch size counted by handle_batch.
list(results)

In [None]:
# Remove the batching application.
serve.delete('batch_chat')

In [None]:
# Disconnect from Ray at the end of the notebook.
ray.shutdown()