From 5109b4fe3f096077f4c78893b6fa2f89a8e263de Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 18:45:50 -0700
Subject: [PATCH 1/8] Add DeepSeek-V3 and DeepSeek-R1 on B200

Signed-off-by: Huy Do
---
 .../scripts/generate_vllm_benchmark_matrix.py | 60 ++++++++++++++++++-
 .github/workflows/vllm-benchmark.yml          |  2 +-
 .../benchmarks/cuda/latency-tests.json        | 22 +++++++
 .../benchmarks/cuda/serving-tests.json        | 40 +++++++++++++
 .../benchmarks/cuda/throughput-tests.json     | 24 ++++++++
 5 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 2893fdb0..26037757 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.2",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -62,6 +62,60 @@
     ]
 )
+# Model and runner skip logic, for example, just need to run deepseek-ai/DeepSeek-V3.1
+# on b200 and not h100. This also serves as another knob to tune CI behavior
+PLATFORM_SKIPS = {
+    # Already been covered in both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ]
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ]
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ]
+    # Deepseek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ,]
+}
+


 class ValidateDir(Action):
     def __call__(
@@ -198,6 +252,10 @@ def generate_benchmark_matrix(
             if not found_runner and not use_all_runners:
                 continue

+            # Check the skip logic
+            if model in PLATFORM_SKIPS and any(lambda r: r in runner, PLATFORM_SKIPS[model]):
+                continue
+
             benchmark_matrix["include"].append(
                 {
                     "runner": runner,
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index 366647a1..dc7281c7 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux

diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
index c397a5bc..ebcc2e59 100644
--- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -72,5 +72,27 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "latency_deepseek_v3_tp8",
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-V3.1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 8192
+        }
+    },
+    {
"test_name": "latency_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 59e1a659..93da8409 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -451,5 +451,45 @@ "random_input_len": 5250, "random_output_len": 8250 } + }, + { + "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } + }, + { + "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index f159426e..adb3b4de 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -79,5 +79,29 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_deepseek_v3_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ] From 37f5e1aede9a14220b637c08a4f5715c9c53d4c8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Sep 2025 18:49:06 -0700 Subject: [PATCH 2/8] Comment Signed-off-by: Huy Do --- .github/scripts/generate_vllm_benchmark_matrix.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 26037757..2bbb294c 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -16,7 +16,7 @@ 1: [ "linux.aws.a100", "linux.aws.h100", - "linux.rocm.gpu.gfx942.2", + "linux.rocm.gpu.gfx942.1", "linux.24xl.spr-metal", "linux.dgx.b200", ], @@ -29,8 +29,6 @@ 4: [ "linux.aws.h100.4", "linux.rocm.gpu.gfx942.4", - # TODO (huydhn): Enable this when Intel's runners are ready - # "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", From a9699ebc46a3e1fd3e870b94d0a46ac6779ae6bb Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Sep 2025 18:50:22 -0700 Subject: [PATCH 3/8] Format Signed-off-by: Huy Do 
 .github/scripts/generate_vllm_benchmark_matrix.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 2bbb294c..61745608 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -75,7 +75,7 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
-    ]
+    ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
     ],
@@ -87,7 +87,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
-    ]
+    ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
@@ -102,7 +102,7 @@
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
-    ]
+    ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
@@ -111,7 +111,7 @@
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.aws.h100",
-    ,]
+    ],
 }


@@ -251,7 +251,9 @@ def generate_benchmark_matrix(
                 continue

             # Check the skip logic
-            if model in PLATFORM_SKIPS and any(lambda r: r in runner, PLATFORM_SKIPS[model]):
+            if model in PLATFORM_SKIPS and any(
+                lambda r: r in runner, PLATFORM_SKIPS[model]
+            ):
                 continue

             benchmark_matrix["include"].append(

From 9ae68ae3b1929d14b546c21e2f39c0f4a5621651 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 18:57:13 -0700
Subject: [PATCH 4/8] Minor bug

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 61745608..919012c1 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -252,7 +252,7 @@ def generate_benchmark_matrix(

             # Check the skip logic
             if model in PLATFORM_SKIPS and any(
-                lambda r: r in runner, PLATFORM_SKIPS[model]
+                [r in runner for r in PLATFORM_SKIPS[model]]
             ):
                 continue


From 68714318a1093949b0dd4c83aacef33c400a3c1f Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 19:09:16 -0700
Subject: [PATCH 5/8] Fix another bug

Signed-off-by: Huy Do
---
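Note: lowercasing the PLATFORM_SKIPS keys only has an effect if the lookup
side compares lowercased model names as well. A minimal sketch of the
intended case-insensitive lookup, assuming the matrix generator lowercases
the model name before the check (that part is not shown in this diff):

    PLATFORM_SKIPS = {"deepseek-ai/DeepSeek-R1": ["linux.aws.a100"]}
    # Lower case all the model names for consistency
    PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}

    def is_skipped(model: str, runner: str) -> bool:
        # The model side is matched case-insensitively; the runner side
        # stays a plain substring test
        return any(r in runner for r in PLATFORM_SKIPS.get(model.lower(), []))

    print(is_skipped("deepseek-ai/DeepSeek-R1", "linux.aws.a100"))  # True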
 .github/scripts/generate_vllm_benchmark_matrix.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 919012c1..0825a3d3 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -60,8 +60,9 @@
     ]
 )
-# Model and runner skip logic, for example, just need to run deepseek-ai/DeepSeek-V3.1
-# on b200 and not h100. This also serves as another knob to tune CI behavior
+# Model and runner skip logic, for example, just need to run DeepSeek on b200
+# and not h100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# Figure out how to set this in the JSON benchmark configuration instead
 PLATFORM_SKIPS = {
     # Already been covered in both A100 and H100
     "meta-llama/Meta-Llama-3.1-8B-Instruct": [
         "linux.dgx.b200",
@@ -113,6 +114,8 @@
         "linux.aws.h100",
     ],
 }
+# Lower case all the model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}


 class ValidateDir(Action):

From 25d9bf1acfa132a5bc16592e043cfa7ac2ba5349 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 22:46:19 -0700
Subject: [PATCH 6/8] Skip gemma3 on ROCm

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 0825a3d3..b5b33877 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -88,6 +88,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Failed on ROCm
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",

From 0ef5a206420f87b9e60ea9e466dd4de1d8e1dc1b Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 23:50:59 -0700
Subject: [PATCH 7/8] A bit more tweak

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index b5b33877..7512c48d 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -88,7 +88,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Failed on ROCm
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
@@ -97,6 +97,7 @@
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [

From 4baa53c7f98c4725f618009d8db48d3d70de2556 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 11 Sep 2025 02:10:36 -0700
Subject: [PATCH 8/8] [no ci] Benchmark DeepSeek is a bit slow

Signed-off-by: Huy Do
---
 .github/workflows/vllm-benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index dc7281c7..32098f64 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -2,8 +2,8 @@ name: vLLM Benchmark

 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
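Note on the skip check fixed in PATCH 4/8: any() takes a single iterable,
so the PATCH 1/8 form any(lambda r: r in runner, PLATFORM_SKIPS[model])
raises "TypeError: any() takes exactly one argument (2 given)" as soon as a
model with skips is reached. The list comprehension from PATCH 4/8 works; a
generator expression would also work and avoids materializing the list. A
self-contained sketch with hypothetical values:

    PLATFORM_SKIPS = {
        "deepseek-ai/deepseek-v3.1": ["linux.aws.a100", "linux.aws.h100"],
    }

    model, runner = "deepseek-ai/deepseek-v3.1", "linux.aws.h100.8"

    # any(lambda r: r in runner, PLATFORM_SKIPS[model])  # TypeError
    skipped = model in PLATFORM_SKIPS and any(
        r in runner for r in PLATFORM_SKIPS[model]
    )
    print(skipped)  # True: the h100 runner is skipped for this model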