From 5109b4fe3f096077f4c78893b6fa2f89a8e263de Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 18:45:50 -0700
Subject: [PATCH 1/8] Add DeepSeek-V3 and DeepSeek-R1 on B200

Signed-off-by: Huy Do
---
 .../scripts/generate_vllm_benchmark_matrix.py | 60 ++++++++++++++++++-
 .github/workflows/vllm-benchmark.yml          |  2 +-
 .../benchmarks/cuda/latency-tests.json        | 22 +++++++
 .../benchmarks/cuda/serving-tests.json        | 40 +++++++++++++
 .../benchmarks/cuda/throughput-tests.json     | 24 ++++++++
 5 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 2893fdb0..26037757 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -16,7 +16,7 @@
     1: [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942.2",  # No single ROCm GPU?
+        "linux.rocm.gpu.gfx942.2",
         "linux.24xl.spr-metal",
         "linux.dgx.b200",
     ],
@@ -62,6 +62,60 @@
     ]
 )
+# Model and runner skip logic, for example, just need to run deepseek-ai/DeepSeek-V3.1
+# on b200 and not h100. This also serves as another knob to tune CI behavior
+PLATFORM_SKIPS = {
+    # Already been covered in both A100 and H100
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": [
+        "linux.dgx.b200",
+    ],
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": [
+        "linux.dgx.b200",
+    ],
+    "Qwen/Qwen3-8B": [
+        "linux.dgx.b200",
+    ]
+    "google/gemma-3-4b-it": [
+        "linux.dgx.b200",
+    ],
+    # Run some bigger models on B200 to share the load
+    "Qwen/Qwen3-30B-A3B": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "google/gemma-3-27b-it": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ]
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    # Run gpt-oss on both H100 and B200
+    "openai/gpt-oss-20b": [
+        "linux.aws.a100",
+    ],
+    "openai/gpt-oss-120b": [
+        "linux.aws.a100",
+    ]
+    # Deepseek can only run on B200
+    "deepseek-ai/DeepSeek-V3.1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ],
+    "deepseek-ai/DeepSeek-R1": [
+        "linux.aws.a100",
+        "linux.aws.h100",
+    ,]
+}
+


 class ValidateDir(Action):
     def __call__(
@@ -198,6 +252,10 @@ def generate_benchmark_matrix(
             if not found_runner and not use_all_runners:
                 continue

+            # Check the skip logic
+            if model in PLATFORM_SKIPS and any(lambda r: r in runner, PLATFORM_SKIPS[model]):
+                continue
+
             benchmark_matrix["include"].append(
                 {
                     "runner": runner,
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index 366647a1..dc7281c7 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -53,7 +53,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
+          RUNNERS: ${{ inputs.runners || '' }}
         run: |
           set -eux

diff --git a/vllm-benchmarks/benchmarks/cuda/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
index c397a5bc..ebcc2e59 100644
--- a/vllm-benchmarks/benchmarks/cuda/latency-tests.json
+++ b/vllm-benchmarks/benchmarks/cuda/latency-tests.json
@@ -72,5 +72,27 @@
             "num_iters": 15,
             "max_model_len": 8192
         }
+    },
+    {
+        "test_name": "latency_deepseek_v3_tp8",
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-V3.1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15,
+            "max_model_len": 8192
+        }
+    },
+    {
"test_name": "latency_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 59e1a659..93da8409 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -451,5 +451,45 @@ "random_input_len": 5250, "random_output_len": 8250 } + }, + { + "test_name": "serving_deepseek_v3_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } + }, + { + "test_name": "serving_deepseek_r1_tp8_random_in5k_out8k", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5250, + "random_output_len": 8250 + } } ] diff --git a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json index f159426e..adb3b4de 100644 --- a/vllm-benchmarks/benchmarks/cuda/throughput-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json @@ -79,5 +79,29 @@ "backend": "vllm", "max_model_len": 8192 } + }, + { + "test_name": "throughput_deepseek_v3_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-V3.1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_deepseek_r1_tp8", + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } } ] From 37f5e1aede9a14220b637c08a4f5715c9c53d4c8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Sep 2025 18:49:06 -0700 Subject: [PATCH 2/8] Comment Signed-off-by: Huy Do --- .github/scripts/generate_vllm_benchmark_matrix.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 26037757..2bbb294c 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -16,7 +16,7 @@ 1: [ "linux.aws.a100", "linux.aws.h100", - "linux.rocm.gpu.gfx942.2", + "linux.rocm.gpu.gfx942.1", "linux.24xl.spr-metal", "linux.dgx.b200", ], @@ -29,8 +29,6 @@ 4: [ "linux.aws.h100.4", "linux.rocm.gpu.gfx942.4", - # TODO (huydhn): Enable this when Intel's runners are ready - # "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", From a9699ebc46a3e1fd3e870b94d0a46ac6779ae6bb Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 10 Sep 2025 18:50:22 -0700 Subject: [PATCH 3/8] Format Signed-off-by: Huy Do 
 .github/scripts/generate_vllm_benchmark_matrix.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 2bbb294c..61745608 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -75,7 +75,7 @@
     ],
     "Qwen/Qwen3-8B": [
         "linux.dgx.b200",
-    ]
+    ],
     "google/gemma-3-4b-it": [
         "linux.dgx.b200",
     ],
@@ -87,7 +87,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
-    ]
+    ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
         "linux.aws.h100",
@@ -102,7 +102,7 @@
     ],
     "openai/gpt-oss-120b": [
         "linux.aws.a100",
-    ]
+    ],
     # Deepseek can only run on B200
     "deepseek-ai/DeepSeek-V3.1": [
         "linux.aws.a100",
@@ -111,7 +111,7 @@
     "deepseek-ai/DeepSeek-R1": [
         "linux.aws.a100",
         "linux.aws.h100",
-    ,]
+    ],
 }


@@ -251,7 +251,9 @@ def generate_benchmark_matrix(
                 continue

             # Check the skip logic
-            if model in PLATFORM_SKIPS and any(lambda r: r in runner, PLATFORM_SKIPS[model]):
+            if model in PLATFORM_SKIPS and any(
+                lambda r: r in runner, PLATFORM_SKIPS[model]
+            ):
                 continue

             benchmark_matrix["include"].append(

From 9ae68ae3b1929d14b546c21e2f39c0f4a5621651 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 18:57:13 -0700
Subject: [PATCH 4/8] Minor bug

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 61745608..919012c1 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -252,7 +252,7 @@ def generate_benchmark_matrix(

             # Check the skip logic
             if model in PLATFORM_SKIPS and any(
-                lambda r: r in runner, PLATFORM_SKIPS[model]
+                [r in runner for r in PLATFORM_SKIPS[model]]
             ):
                 continue


From 68714318a1093949b0dd4c83aacef33c400a3c1f Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 19:09:16 -0700
Subject: [PATCH 5/8] Fix another bug

Signed-off-by: Huy Do
---
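Note: lowercasing the PLATFORM_SKIPS keys only has an effect if the lookup
side compares lowercased model names as well. A minimal sketch of the
intended case-insensitive lookup, assuming the matrix generator lowercases
the model name before the check (that part is not shown in this diff):

    PLATFORM_SKIPS = {"deepseek-ai/DeepSeek-R1": ["linux.aws.a100"]}
    # Lower case all the model names for consistency
    PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}

    def is_skipped(model: str, runner: str) -> bool:
        # The model side is matched case-insensitively; the runner side
        # stays a plain substring test
        return any(r in runner for r in PLATFORM_SKIPS.get(model.lower(), []))

    print(is_skipped("deepseek-ai/DeepSeek-R1", "linux.aws.a100"))  # True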
 .github/scripts/generate_vllm_benchmark_matrix.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 919012c1..0825a3d3 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -60,8 +60,9 @@
     ]
 )
-# Model and runner skip logic, for example, just need to run deepseek-ai/DeepSeek-V3.1
-# on b200 and not h100. This also serves as another knob to tune CI behavior
+# Model and runner skip logic, for example, just need to run DeepSeek on b200
+# and not h100. This also serves as a knob to tune CI behavior. TODO (huydhn):
+# Figure out how to set this in the JSON benchmark configuration instead
 PLATFORM_SKIPS = {
     # Already been covered in both A100 and H100
     "meta-llama/Meta-Llama-3.1-8B-Instruct": [
         "linux.dgx.b200",
@@ -113,6 +114,8 @@
         "linux.aws.h100",
     ],
 }
+# Lower case all the model names for consistency
+PLATFORM_SKIPS = {k.lower(): v for k, v in PLATFORM_SKIPS.items()}


 class ValidateDir(Action):

From 25d9bf1acfa132a5bc16592e043cfa7ac2ba5349 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 22:46:19 -0700
Subject: [PATCH 6/8] Skip gemma3 on ROCm

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index 0825a3d3..b5b33877 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -88,6 +88,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Failed on ROCm
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",

From 0ef5a206420f87b9e60ea9e466dd4de1d8e1dc1b Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 10 Sep 2025 23:50:59 -0700
Subject: [PATCH 7/8] A bit more tweak

Signed-off-by: Huy Do
---
 .github/scripts/generate_vllm_benchmark_matrix.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index b5b33877..7512c48d 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -88,7 +88,7 @@
     "google/gemma-3-27b-it": [
         "linux.aws.a100",
         "linux.aws.h100",
-        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Failed on ROCm
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Fail on ROCm
     ],
     "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
         "linux.aws.a100",
@@ -97,6 +97,7 @@
     "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [
         "linux.aws.a100",
         "linux.aws.h100",
+        "linux.rocm.gpu.gfx942",  # TODO (huydhn): Hang on ROCm
     ],
     # Run gpt-oss on both H100 and B200
     "openai/gpt-oss-20b": [

From 4baa53c7f98c4725f618009d8db48d3d70de2556 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 11 Sep 2025 02:10:36 -0700
Subject: [PATCH 8/8] [no ci] Benchmark DeepSeek is a bit slow

Signed-off-by: Huy Do
---
 .github/workflows/vllm-benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index dc7281c7..32098f64 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -2,8 +2,8 @@ name: vLLM Benchmark

 on:
   schedule:
-    # Run every 4 hours
-    - cron: '0 */4 * * *'
+    # Run every 6 hours
+    - cron: '0 */6 * * *'
   workflow_dispatch:
     inputs:
       vllm_branch:
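Note on the skip check fixed in PATCH 4/8: any() takes a single iterable,
so the PATCH 1/8 form any(lambda r: r in runner, PLATFORM_SKIPS[model])
raises "TypeError: any() takes exactly one argument (2 given)" as soon as a
model with skips is reached. The list comprehension from PATCH 4/8 works; a
generator expression would also work and avoids materializing the list. A
self-contained sketch with hypothetical values:

    PLATFORM_SKIPS = {
        "deepseek-ai/deepseek-v3.1": ["linux.aws.a100", "linux.aws.h100"],
    }

    model, runner = "deepseek-ai/deepseek-v3.1", "linux.aws.h100.8"

    # any(lambda r: r in runner, PLATFORM_SKIPS[model])  # TypeError
    skipped = model in PLATFORM_SKIPS and any(
        r in runner for r in PLATFORM_SKIPS[model]
    )
    print(skipped)  # True: the h100 runner is skipped for this model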