7 changes: 6 additions & 1 deletion chart/web-app/app.py
@@ -184,4 +184,9 @@ def inference_wrapper(*args):
     app.launch()
 # For running on cluster
 else:
-    app.launch(server_name="0.0.0.0")
+    app.queue(
+        # Allow 10 concurrent requests to backend
+        # vLLM backend should be clever enough to
+        # batch these requests appropriately.
+        default_concurrency_limit=10,
+    ).launch(server_name="0.0.0.0")
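For context, this change adopts Gradio's queue-based concurrency. A minimal standalone sketch of the same pattern (assuming Gradio 4.x, where Blocks.queue accepts default_concurrency_limit; the echo handler is a hypothetical stand-in for the app's real inference_wrapper):

import gradio as gr

# Hypothetical stand-in for the app's inference_wrapper.
def echo(message, history):
    return message

demo = gr.ChatInterface(echo)

# Up to 10 requests are dispatched to the handler concurrently; the rest
# wait in the queue. A vLLM backend can then batch the in-flight requests
# with its continuous-batching scheduler.
demo.queue(default_concurrency_limit=10).launch(server_name="0.0.0.0")

With Gradio's default concurrency limit of 1, requests are processed one at a time, so the backend never sees enough concurrent work to batch.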
180 changes: 180 additions & 0 deletions scripts/perf-test/perf-test.ipynb

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions scripts/perf-test/requirements.txt
@@ -0,0 +1,26 @@
anyio==4.3.0
certifi==2024.2.2
charset-normalizer==3.3.2
filelock==3.13.3
fsspec==2024.3.1
gradio_client==0.15.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.22.2
idna==3.6
joblib==1.3.2
numpy==1.26.4
packaging==24.0
pandas==2.2.1
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
six==1.16.0
sniffio==1.3.1
tqdm==4.66.2
typing_extensions==4.10.0
tzdata==2024.1
urllib3==2.2.1
websockets==11.0.3
39 changes: 39 additions & 0 deletions scripts/perf-test/stress.py
@@ -0,0 +1,39 @@
# NOTE: In its current state this script is not a useful benchmark for the LLM system.
# Gradio appears to do some kind of session-based queuing, which results in serial
# execution of requests even when multiple Gradio client instances, running inside
# separate Python jobs created via Joblib, are used. This script should be updated
# once the Gradio session behaviour is better understood; for now the perf-test
# Jupyter notebook should be used to benchmark an LLM running on the same Kubernetes
# cluster by directly targeting the internal service corresponding to the backend API.

import random
import time

import pandas as pd
from gradio_client import Client
from joblib import Parallel, delayed

url = "http://localhost:7860"

prompts = [
    "Hi, how are you?",
    "What's the weather like with you?",
    "Who's the best footballer of all time?",
]

client_count = 3
request_count = 5  # Requests per client


def make_requests(client_id: int):
    # Each client opens its own Gradio session and sends its requests
    # serially, timing each one.
    client = Client(url)
    timings = []
    for n in range(request_count):
        print(f"Starting request {n+1}/{request_count} for client {client_id}")
        start_time = time.time()
        client.predict(random.choice(prompts), api_name="/chat")
        timings.append(time.time() - start_time)
    return timings


# Run the clients in parallel and summarise latencies across all of them.
results = list(Parallel(n_jobs=client_count)(delayed(make_requests)(i) for i in range(1, client_count + 1)))
all_timings = []
for client_timings in results:
    all_timings += client_timings
print(pd.Series(all_timings).describe())
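As the NOTE above suggests, a more faithful load test bypasses Gradio and targets the backend service directly. A minimal sketch of that approach, assuming the backend is vLLM's OpenAI-compatible server (which exposes /v1/completions) and using a hypothetical in-cluster service URL and model name; the perf-test notebook remains the authoritative version:

import time

import httpx
import pandas as pd
from joblib import Parallel, delayed

# Hypothetical in-cluster service URL and model name; adjust to the deployment.
BACKEND_URL = "http://llm-backend.default.svc.cluster.local:8000"
MODEL = "model"


def time_request(prompt: str) -> float:
    # Time a single completion request against the backend API.
    start = time.time()
    response = httpx.post(
        f"{BACKEND_URL}/v1/completions",
        json={"model": MODEL, "prompt": prompt, "max_tokens": 128},
        timeout=120,
    )
    response.raise_for_status()
    return time.time() - start


# Threads (not processes) so the requests are genuinely concurrent and
# vLLM's scheduler has the chance to batch them.
timings = Parallel(n_jobs=8, backend="threading")(
    delayed(time_request)("Hi, how are you?") for _ in range(40)
)
print(pd.Series(timings).describe())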