diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 2893fdb0..7512c48d 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -29,8 +29,6 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.gfx942.4",
-        # TODO (huydhn): Enable this when Intel's runners are ready
-        # "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -62,6 +60,65 @@
     ]
 )
 
+# Model and runner skip logic, e.g. DeepSeek only needs to run on B200, not
+# H100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# Figure out how to set this in the JSON benchmark configuration instead
+PLATFORM_SKIPS = {
+    # Already covered on both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ],
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fails on ROCm
+    ],
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hangs on ROCm
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ],
+    # DeepSeek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+}
+# Lowercase all model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}
+
 
 class ValidateDir(Action):
     def __call__(
@@ -198,6 +255,12 @@ def generate_benchmark_matrix(
             if not found_runner and not use_all_runners:
                 continue
 
+            # Skip model and runner combinations listed in PLATFORM_SKIPS
+            if model in PLATFORM_SKIPS and any(
+                r in runner for r in PLATFORM_SKIPS[model]
+            ):
+                continue
+
             benchmark_matrix["include"].append(
                 {
                     "runner": runner,
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index 366647a1..32098f64 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -2,8 +2,8 @@ name: vLLM Benchmark
 
 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux
 
diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
index c397a5bc..ebcc2e59 100644
--- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -72,5 +72,27 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "latency_deepseek_v3_tp8",
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-V3.1",
"tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 59e1a659..93da8409 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -451,5 +451,45 @@ "random_input_len": 5250, "random_output_len": 8250 } + }, + { + "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } + }, + { + "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index f159426e..adb3b4de 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -79,5 +79,29 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_deepseek_v3_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ]