7 changes: 6 additions & 1 deletion chart/web-app/app.py
@@ -184,4 +184,9 @@ def inference_wrapper(*args):
     app.launch()
 # For running on cluster
 else:
-    app.launch(server_name="0.0.0.0")
+    app.queue(
+        # Allow 10 concurrent requests to backend
+        # vLLM backend should be clever enough to
+        # batch these requests appropriately.
+        default_concurrency_limit=10,
+    ).launch(server_name="0.0.0.0")
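For context, this change adopts Gradio's queue-based concurrency. A minimal standalone sketch of the same pattern (assuming Gradio 4.x, where Blocks.queue accepts default_concurrency_limit; the echo handler is a hypothetical stand-in for the app's real inference_wrapper):

import gradio as gr

# Hypothetical stand-in for the app's inference_wrapper.
def echo(message, history):
    return message

demo = gr.ChatInterface(echo)

# Up to 10 requests are dispatched to the handler concurrently; the rest
# wait in the queue. A vLLM backend can then batch the in-flight requests
# with its continuous-batching scheduler.
demo.queue(default_concurrency_limit=10).launch(server_name="0.0.0.0")

With Gradio's default concurrency limit of 1, requests are processed one at a time, so the backend never sees enough concurrent work to batch.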
180 changes: 180 additions & 0 deletions scripts/perf-test/perf-test.ipynb

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions scripts/perf-test/requirements.txt
@@ -0,0 +1,26 @@
anyio==4.3.0
certifi==2024.2.2
charset-normalizer==3.3.2
filelock==3.13.3
fsspec==2024.3.1
gradio_client==0.15.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.22.2
idna==3.6
joblib==1.3.2
numpy==1.26.4
packaging==24.0
pandas==2.2.1
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
six==1.16.0
sniffio==1.3.1
tqdm==4.66.2
typing_extensions==4.10.0
tzdata==2024.1
urllib3==2.2.1
websockets==11.0.3
39 changes: 39 additions & 0 deletions scripts/perf-test/stress.py
@@ -0,0 +1,39 @@
# NOTE: In its current state this script is not a useful benchmark for the LLM system.
# Gradio appears to do some kind of session-based queuing, which results in serial
# execution of requests even when multiple Gradio client instances, running inside
# separate Python jobs created via Joblib, are used. This script should be updated
# once the Gradio session behaviour is better understood; for now the perf-test
# Jupyter notebook should be used to benchmark an LLM running on the same Kubernetes
# cluster by directly targeting the internal service corresponding to the backend API.

import random
import time

import pandas as pd
from gradio_client import Client
from joblib import Parallel, delayed

url = "http://localhost:7860"

prompts = [
    "Hi, how are you?",
    "What's the weather like with you?",
    "Who's the best footballer of all time?",
]

client_count = 3
request_count = 5  # Requests per client


def make_requests(client_id: int):
    # Each client opens its own Gradio session and sends its requests
    # serially, timing each one.
    client = Client(url)
    timings = []
    for n in range(request_count):
        print(f"Starting request {n+1}/{request_count} for client {client_id}")
        start_time = time.time()
        client.predict(random.choice(prompts), api_name="/chat")
        timings.append(time.time() - start_time)
    return timings


# Run the clients in parallel and summarise latencies across all of them.
results = list(Parallel(n_jobs=client_count)(delayed(make_requests)(i) for i in range(1, client_count + 1)))
all_timings = []
for client_timings in results:
    all_timings += client_timings
print(pd.Series(all_timings).describe())
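As the NOTE above suggests, a more faithful load test bypasses Gradio and targets the backend service directly. A minimal sketch of that approach, assuming the backend is vLLM's OpenAI-compatible server (which exposes /v1/completions) and using a hypothetical in-cluster service URL and model name; the perf-test notebook remains the authoritative version:

import time

import httpx
import pandas as pd
from joblib import Parallel, delayed

# Hypothetical in-cluster service URL and model name; adjust to the deployment.
BACKEND_URL = "http://llm-backend.default.svc.cluster.local:8000"
MODEL = "model"


def time_request(prompt: str) -> float:
    # Time a single completion request against the backend API.
    start = time.time()
    response = httpx.post(
        f"{BACKEND_URL}/v1/completions",
        json={"model": MODEL, "prompt": prompt, "max_tokens": 128},
        timeout=120,
    )
    response.raise_for_status()
    return time.time() - start


# Threads (not processes) so the requests are genuinely concurrent and
# vLLM's scheduler has the chance to batch them.
timings = Parallel(n_jobs=8, backend="threading")(
    delayed(time_request)("Hi, how are you?") for _ in range(40)
)
print(pd.Series(timings).describe())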