69 changes: 66 additions & 3 deletions .github/scripts/generate_vllm_benchmark_matrix.py
@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -29,8 +29,6 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
-        # TODO (huydhn): Enable this when Intel's runners are ready
-        # "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -62,6 +60,65 @@
     ]
 )
 
+# Model and runner skip logic: for example, DeepSeek only needs to run on B200,
+# not H100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# figure out how to set this in the JSON benchmark configuration instead.
+PLATFORM_SKIPS = {
+    # Already covered on both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ],
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): fails on ROCm
+    ],
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): hangs on ROCm
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ],
+    # DeepSeek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+}
+# Lowercase all the model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}
+
+
 class ValidateDir(Action):
     def __call__(
@@ -198,6 +255,12 @@ def generate_benchmark_matrix(
             if not found_runner and not use_all_runners:
                 continue
 
+            # Check the skip logic
+            if model in PLATFORM_SKIPS and any(
+                [r in runner for r in PLATFORM_SKIPS[model]]
+            ):
+                continue
+
             benchmark_matrix["include"].append(
                 {
                     "runner": runner,
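A note on the new skip rule: PLATFORM_SKIPS keys are lowercased model names, and each listed string is matched by substring containment, so a bare "linux.rocm.gpu.gfx942" entry covers every gfx942 runner variant (.1, .4, and so on). A minimal self-contained sketch of the behavior (the is_skipped helper is illustrative; the script inlines this check inside generate_benchmark_matrix):

PLATFORM_SKIPS = {
    "google/gemma-3-27b-it": [
        "linux.aws.a100",
        "linux.aws.h100",
        "linux.rocm.gpu.gfx942",  # substring match covers the .1 and .4 variants
    ],
}
# Lowercase the keys once, mirroring the script, so lookups are case-insensitive.
PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}


def is_skipped(model: str, runner: str) -> bool:
    """True when this (model, runner) pair should be dropped from the matrix."""
    model = model.lower()
    return model in PLATFORM_SKIPS and any(
        r in runner for r in PLATFORM_SKIPS[model]
    )


assert is_skipped("google/gemma-3-27b-it", "linux.rocm.gpu.gfx942.4")
assert not is_skipped("google/gemma-3-27b-it", "linux.dgx.b200")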
6 changes: 3 additions & 3 deletions .github/workflows/vllm-benchmark.yml
@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux
 
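The schedule change reduces the cadence from six runs a day to four: '0 */6 * * *' fires at minute 0 of hours 0, 6, 12, and 18 UTC. The empty RUNNERS default now falls through to the matrix script's use-all-runners path instead of pinning H100. A quick way to sanity-check the cron expression, as a sketch using the third-party croniter package (an assumption for illustration; the workflow itself does not depend on it):

from datetime import datetime, timezone

from croniter import croniter  # pip install croniter

it = croniter("0 */6 * * *", datetime(2024, 1, 1, tzinfo=timezone.utc))
for _ in range(4):
    # Prints 06:00, 12:00, 18:00, then 00:00 the next day (all UTC).
    print(it.get_next(datetime))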
22 changes: 22 additions & 0 deletions vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -72,5 +72,27 @@
       "num_iters": 15,
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "latency_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "latency_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15,
+      "max_model_len": 8192
+    }
   }
 ]
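Each "parameters" object maps one-to-one onto CLI flags of vLLM's benchmark_latency.py: keys become kebab-case flags, and "load_format": "dummy" makes vLLM initialize random weights instead of downloading the full DeepSeek checkpoints. A sketch of that key-to-flag convention (illustrative; the actual harness script may differ in details):

def params_to_cli_args(params: dict) -> list[str]:
    """{"tensor_parallel_size": 8} -> ["--tensor-parallel-size", "8"].
    An empty-string value marks a bare switch, as in the serving tests'
    "disable_log_stats". """
    args: list[str] = []
    for key, value in params.items():
        args.append("--" + key.replace("_", "-"))
        if value != "":
            args.append(str(value))
    return args


latency_params = {
    "model": "deepseek-ai/DeepSeek-V3.1",
    "tensor_parallel_size": 8,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
    "max_model_len": 8192,
}
print(" ".join(params_to_cli_args(latency_params)))
# --model deepseek-ai/DeepSeek-V3.1 --tensor-parallel-size 8 --load-format dummy ...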
40 changes: 40 additions & 0 deletions vllm-benchmarks/benchmarks/cuda/serving-tests.json
@@ -451,5 +451,45 @@
       "random_input_len": 5250,
       "random_output_len": 8250
     }
+  },
+  {
+    "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
+  },
+  {
+    "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "num_prompts": 200,
+      "random_input_len": 5250,
+      "random_output_len": 8250
+    }
   }
 ]
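A serving test is two cooperating processes rather than one: "server_parameters" configure a vLLM OpenAI-compatible server, and "client_parameters" drive the benchmark client once per QPS level in "qps_list" ("inf" meaning all requests sent at once). A sketch of that fan-out under stated assumptions (the server entrypoint module and client script names are the conventional vLLM ones, not pinned by this PR; to_args repeats the key-to-flag convention from the latency sketch):

def to_args(params: dict) -> list[str]:
    # Same convention as params_to_cli_args above; "" marks a bare switch.
    args: list[str] = []
    for key, value in params.items():
        args.append("--" + key.replace("_", "-"))
        if value != "":
            args.append(str(value))
    return args


test = {
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
        "model": "deepseek-ai/DeepSeek-R1",
        "tensor_parallel_size": 8,
        "load_format": "dummy",
    },
    "client_parameters": {
        "model": "deepseek-ai/DeepSeek-R1",
        "backend": "vllm",
        "dataset_name": "random",
        "num_prompts": 200,
        "random_input_len": 5250,
        "random_output_len": 8250,
    },
}

# One long-lived server process...
server_cmd = ["python3", "-m", "vllm.entrypoints.openai.api_server",
              *to_args(test["server_parameters"])]
# ...then one client run per QPS level against it.
for qps in test["qps_list"]:
    client_cmd = ["python3", "benchmark_serving.py",
                  *to_args(test["client_parameters"]),
                  "--request-rate", str(qps)]
    print(" ".join(client_cmd))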
24 changes: 24 additions & 0 deletions vllm-benchmarks/benchmarks/cuda/throughput-tests.json
@@ -79,5 +79,29 @@
       "backend": "vllm",
       "max_model_len": 8192
     }
+  },
+  {
+    "test_name": "throughput_deepseek_v3_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-V3.1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
+  },
+  {
+    "test_name": "throughput_deepseek_r1_tp8",
+    "parameters": {
+      "model": "deepseek-ai/DeepSeek-R1",
+      "tensor_parallel_size": 8,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm",
+      "max_model_len": 8192
+    }
   }
 ]
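Unlike the serving tests above, which generate random prompts, the throughput tests replay conversations from a local ShareGPT dump, so ./ShareGPT_V3_unfiltered_cleaned_split.json must already exist on the runner. A one-time download sketch (the Hugging Face URL below is the commonly used mirror of this dataset, an assumption rather than something this PR pins):

import urllib.request
from pathlib import Path

# Commonly used mirror of the ShareGPT_V3 dump; several hundred MB.
URL = ("https://huggingface.co/datasets/anon8231489123/"
       "ShareGPT_Vicuna_unfiltered/resolve/main/"
       "ShareGPT_V3_unfiltered_cleaned_split.json")

dest = Path("ShareGPT_V3_unfiltered_cleaned_split.json")
if not dest.exists():
    urllib.request.urlretrieve(URL, dest)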