Start the Serve application in a separate terminal before running the cells below:

`serve run translator:translator_app`

In [1]:
import ray
import time
from time import perf_counter
import requests

In [2]:
@ray.remote
def query_backend(text="Hello world!", timeout=60.0):
    """Send ``text`` to the local HTTP backend and return the response body.

    Runs as a Ray task: invoke it as ``query_backend.remote(text)`` and
    resolve the returned object ref with ``ray.get``.

    Args:
        text: Payload sent as the JSON body of the POST request.
        timeout: Seconds ``requests`` waits on the server before raising
            ``requests.exceptions.Timeout``. Without a timeout ``requests``
            waits indefinitely, so an unresponsive backend would hang the
            task (and anything waiting on it) forever.

    Returns:
        The response body as text. The HTTP status code is not checked, so
        an error response is returned as-is rather than raised.
    """
    response = requests.post(
        "http://127.0.0.1:8000/", json=text, timeout=timeout
    )
    return response.text


In [3]:
def as_available(things):
    """Submit one ``query_backend`` task per item and gather results as they finish.

    Args:
        things: Iterable of payloads; each one is passed to
            ``query_backend.remote``.

    Returns:
        A list with one result per item, in completion order (which may
        differ from the order of ``things``).

    Raises:
        Any error ``ray.get`` raises for a failed task. Previously the
        results were never fetched, so failed requests went unnoticed and
        the surrounding timing could describe requests that never succeeded.
    """
    # Submit everything up front so Ray can schedule the tasks concurrently.
    pending = [query_backend.remote(thing) for thing in things]
    results = []

    while pending:
        # ray.wait blocks until at least one task is ready and hands back
        # the refs that still need waiting on.
        ready, pending = ray.wait(pending)
        # Resolving the ready refs is what surfaces task failures.
        results.extend(ray.get(ready))

    return results


In [4]:
# Sample sentences used as request payloads for the timing runs.
things = [
    "Hello world!",
    "How are you?",
    "The sun is shinning!",
]

# Wall-clock time for pushing every sentence through as_available().
t1_start = perf_counter()
as_available(things)
t1_stop = perf_counter()
print(
    f"Elapsed time during the {len(things)} requests in seconds:",
    t1_stop - t1_start,
)

2023-10-21 19:50:39,902	INFO worker.py:1642 -- Started a local Ray instance.


Elapsed time during the 3 requests in seconds: 2.4955870830162894


In [5]:
from transformers import pipeline

class Translator:
    """English-to-German translator backed by a ``transformers`` pipeline."""

    def __init__(self):
        # Build the "translation_en_to_de" pipeline on the t5-small checkpoint.
        self.model = pipeline("translation_en_to_de", model="t5-small")

    def translate(self, text: str) -> str:
        """Run ``text`` through the pipeline and return only the translated string."""
        # The pipeline output is indexed as a sequence of dicts; only the
        # first entry's "translation_text" field is returned.
        first_result = self.model(text)[0]
        return first_result["translation_text"]


In [None]:
# In-process baseline: translate every sentence directly with Translator.
# NOTE(review): a fresh Translator is constructed for each item, so the
# pipeline is rebuilt inside the timed region every time -- confirm that
# this is the intended comparison.
t1_start = perf_counter()
[Translator().translate(thing) for thing in things]
t1_stop = perf_counter()
print(
    f"Elapsed time during the {len(things)} requests in seconds:",
    t1_stop - t1_start,
)