[Serve][Doc] Add handle instruction to send multiplex request (#39274)

Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com>
ray-project · Sep 7, 2023 · 3e7c8af · 3e7c8af
1 parent fecca87
commit 3e7c8af
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 3 deletions.
diff --git a/doc/source/serve/deploy-many-models/model-multiplexing.md b/doc/source/serve/deploy-many-models/model-multiplexing.md
@@ -53,9 +53,10 @@ To send a request to a specific model, include the field `serve_multiplexed_mode
 :start-after: __serve_request_send_example_begin__
 :end-before: __serve_request_send_example_end__
 ```
-
 :::{note}
-`serve_multiplexed_model_id` is required in the request header, and the value should be the model id you want to send the request to.
+`serve_multiplexed_model_id` is required in the request header, and the value should be the model ID you want to send the request to.
+
+If `serve_multiplexed_model_id` is not found in the request header, Serve treats the request as a normal request and routes it to a random replica.
 :::
 
 After you run the above code, you should see the following lines in the deployment logs:
@@ -72,3 +73,17 @@ INFO 2023-05-24 01:19:15,988 default_Model default_Model#rimNjA WzjTbJvbPN / def
 INFO 2023-05-24 01:19:15,988 default_Model default_Model#rimNjA WzjTbJvbPN / default multiplex.py:131 - Loading model '4'.
 INFO 2023-05-24 01:19:16,993 default_Model default_Model#rimNjA WzjTbJvbPN / default replica.py:542 - __CALL__ OK 1005.7ms
 ```
+
+You can also send a request to a specific model by using the handle's {mod}`options <ray.serve.handle.RayServeHandle>` API.
+```{literalinclude} ../doc_code/multiplexed.py
+:language: python
+:start-after: __serve_handle_send_example_begin__
+:end-before: __serve_handle_send_example_end__
+```
+
+When using model composition, you can send requests from an upstream deployment to a multiplexed deployment using a Serve `DeploymentHandle`. Set the `multiplexed_model_id` in the handle's options. For example:
+```{literalinclude} ../doc_code/multiplexed.py
+:language: python
+:start-after: __serve_model_composition_example_begin__
+:end-before: __serve_model_composition_example_end__
+```
diff --git a/doc/source/serve/doc_code/multiplexed.py b/doc/source/serve/doc_code/multiplexed.py
@@ -29,7 +29,7 @@ async def __call__(self, request: starlette.requests.Request):
 
 # __serve_deployment_example_end__
 
-serve.run(entry)
+handle = serve.run(entry)
 
 # __serve_request_send_example_begin__
 import requests  # noqa: E402
@@ -38,3 +38,31 @@ async def __call__(self, request: starlette.requests.Request):
     "http://localhost:8000", headers={"serve_multiplexed_model_id": str("1")}
 )
 # __serve_request_send_example_end__
+
+# __serve_handle_send_example_begin__
+obj_ref = handle.options(multiplexed_model_id="1").remote("<your param>")
+# __serve_handle_send_example_end__
+
+
+from ray.serve.handle import DeploymentHandle  # noqa: E402
+
+
+# __serve_model_composition_example_begin__
+@serve.deployment
+class Downstream:
+    def __call__(self):
+        return serve.get_multiplexed_model_id()
+
+
+@serve.deployment
+class Upstream:
+    def __init__(self, downstream: DeploymentHandle):
+        self._h: DeploymentHandle = downstream.options(use_new_handle_api=True)
+
+    async def __call__(self, request: starlette.requests.Request):
+        return await self._h.options(multiplexed_model_id="bar").remote()
+
+
+serve.run(Upstream.bind(Downstream.bind()))
+resp = requests.get("http://localhost:8000")
+# __serve_model_composition_example_end__