From c03bbb80a4f32493f0847bec9174b9a822ec03d3 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Tue, 14 Oct 2025 12:32:54 +0300 Subject: [PATCH 1/2] Enable Intel Gaudi 3 benchmarks, runner placeholder --- .../scripts/generate_vllm_benchmark_matrix.py | 16 ++++ .github/workflows/vllm-benchmark.yml | 12 ++- .../benchmarks/hpu/latency-tests-hpu.json | 55 +++++++++++++ .../benchmarks/hpu/serving-tests-hpu.json | 82 +++++++++++++++++++ .../benchmarks/hpu/throughput-tests-hpu.json | 61 ++++++++++++++ 5 files changed, 225 insertions(+), 1 deletion(-) create mode 100644 vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json create mode 100644 vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json create mode 100644 vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 5a0d0780..b73d4fa5 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -20,6 +20,7 @@ "linux.24xl.spr-metal", "linux.24xl.gnr", "linux.dgx.b200", + "linux.hpu.gaudi3.8", ], # NB: There is no 2xH100 runner at the momement, so let's use the next one # in the list here which is 4xH100 @@ -27,15 +28,18 @@ "linux.aws.h100.4", "linux.rocm.gpu.gfx942.2", "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], 4: [ "linux.aws.h100.4", "linux.rocm.gpu.gfx942.4", + "linux.hpu.gaudi3.8", ], 8: [ "linux.aws.h100.8", "linux.rocm.gpu.gfx942.8", "linux.dgx.b200.8", + "linux.hpu.gaudi3.8", ], } @@ -53,6 +57,7 @@ "linux.rocm.gpu.gfx942.8": "rocm", "linux.24xl.spr-metal": "cpu", "linux.24xl.gnr": "cpu", + "linux.hpu.gaudi3.8": "hpu", } # All the different names vLLM uses to refer to their benchmark configs @@ -82,10 +87,12 @@ ], "Qwen/Qwen3-8B": [ "linux.dgx.b200", + "linux.hpu.gaudi3.8", ], "google/gemma-3-4b-it": [ "linux.dgx.b200", "linux.rocm.gpu.gfx942", # TODO: Fail on ROCm + "linux.hpu.gaudi3.8", ], # Run some bigger models on B200 to share the load "Qwen/Qwen3-30B-A3B": [ @@ -93,49 +100,58 @@ "linux.aws.h100", "linux.rocm.gpu.gfx942", # TODO: Fail on ROCm "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "google/gemma-3-27b-it": [ "linux.aws.a100", "linux.aws.h100", "linux.rocm.gpu.gfx942", # TODO (huydhn): Fail on ROCm "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "meta-llama/Llama-4-Scout-17B-16E-Instruct": [ "linux.aws.a100", "linux.aws.h100", "linux.rocm.gpu.gfx942", # TODO: Fail on ROCm "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": [ "linux.aws.a100", "linux.aws.h100", "linux.rocm.gpu.gfx942", # TODO (huydhn): Hang on ROCm "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], # Run gpt-oss on both H100 and B200 "openai/gpt-oss-20b": [ "linux.aws.a100", "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "openai/gpt-oss-120b": [ "linux.aws.a100", "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], # Deepseek can only run on B200 "deepseek-ai/DeepSeek-V3.1": [ "linux.aws.a100", "linux.aws.h100", "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "deepseek-ai/DeepSeek-V3.2-Exp": [ "linux.aws.a100", "linux.aws.h100", "linux.24xl.gnr", + "linux.hpu.gaudi3.8", ], "deepseek-ai/DeepSeek-R1": [ "linux.aws.a100", "linux.24xl.gnr", "linux.aws.h100", + "linux.hpu.gaudi3.8", ], } # Lower case all the model names for consistency diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index b097f1d2..d8b9e38b 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -25,7 +25,7 @@ on: A 
comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) required: true type: string - default: h100,rocm,spr,gnr,b200 + default: h100,rocm,spr,gnr,b200,gaudi3 pull_request: paths: - .github/workflows/vllm-benchmark.yml @@ -104,6 +104,9 @@ jobs: elif command -v rocm-smi; then DEVICE_NAME=rocm rocm-smi + elif command -v hl-smi; then + DEVICE_NAME=hpu + hl-smi else DEVICE_NAME=cpu lscpu @@ -120,6 +123,8 @@ jobs: DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') elif [[ "${DEVICE_NAME}" == "rocm" ]]; then DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "hpu" ]]; then + DEVICE_TYPE=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') elif [[ "${DEVICE_NAME}" == "cpu" ]]; then DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") fi @@ -133,6 +138,9 @@ jobs: if [[ "${DEVICE_NAME}" == "rocm" ]]; then pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/rocm6.3 + elif [[ "${DEVICE_NAME}" == "hpu" ]]; then + grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt + pip install -r /tmp/requirements_no_torch.txt else pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/cu128 @@ -155,6 +163,8 @@ jobs: DOCKER_IMAGE_SUFFIX="" if [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "hpu" ]]; then + DOCKER_IMAGE_SUFFIX=-hpu elif [[ "${DEVICE_NAME}" == "cpu" ]]; then DOCKER_IMAGE_SUFFIX=-cpu fi diff --git a/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json new file mode 100644 index 00000000..296380f7 --- /dev/null +++ b/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json @@ -0,0 +1,55 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + } +] diff --git a/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json new file mode 100644 index 00000000..8c6b34bd --- /dev/null +++ b/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json @@ -0,0 +1,82 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + 
"qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json new file mode 100644 index 00000000..3127bf2f --- /dev/null +++ b/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json @@ -0,0 +1,61 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": 
"mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + } +] From cef3b9eed8adfe7ee85ea674c49ebf947561d6d9 Mon Sep 17 00:00:00 2001 From: jakub-sochacki Date: Wed, 15 Oct 2025 17:47:24 +0300 Subject: [PATCH 2/2] Add Intel Gaudi3 HPU benchmark support with version compatibility --- .github/workflows/vllm-benchmark.yml | 93 +++++++++++++++++++++------- 1 file changed, 71 insertions(+), 22 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index d8b9e38b..135c28de 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -198,34 +198,83 @@ jobs: if [[ -z "${HEAD_SHA}" ]]; then pushd vllm - # Looking back the latest 100 commits is enough - for i in {0..99} - do - # Check if the image is there, if it doesn't then check an older one - # because the commit is too recent - HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" - - # No Docker image available yet because the commit is too recent - if ! docker manifest inspect "${DOCKER_IMAGE}"; then - continue + + # Special handling for HPU: use vllm-gaudi compatible commit + # + # Problem: VLLM_STABLE_COMMIT might change between when CI builds the image + # and when this benchmark runs (every 12 hours), causing image tag mismatches. + # + # Solution: Query git history of VLLM_STABLE_COMMIT file to find the most recent + # compatible vLLM commit that has an actual Docker image built by CI. + if [[ "${DEVICE_NAME}" == "hpu" ]]; then + echo "HPU device detected - finding compatible vLLM commit from vllm-gaudi history" + + # Clone only the last-good-commit-for-vllm-gaudi branch (lightweight, single file) + git clone --depth 50 --single-branch --branch vllm/last-good-commit-for-vllm-gaudi \ + https://github.com/vllm-project/vllm-gaudi.git /tmp/vllm-gaudi + pushd /tmp/vllm-gaudi + + # Get the last 30 commits - each commit represents a VLLM_STABLE_COMMIT update + # This gives us a history of compatible vLLM versions + CANDIDATE_COMMITS=$(git log -30 --pretty=format:"%H") + popd + + # Try each candidate commit (newest to oldest) until we find an existing image + FOUND_IMAGE=0 + for VLLM_GAUDI_COMMIT in ${CANDIDATE_COMMITS}; do + # Get the vLLM commit from this version of the branch + CANDIDATE_VLLM_COMMIT=$(curl -s "https://raw.githubusercontent.com/vllm-project/vllm-gaudi/${VLLM_GAUDI_COMMIT}/VLLM_STABLE_COMMIT" | tr -d '\n') + + if [[ -z "${CANDIDATE_VLLM_COMMIT}" ]]; then + continue + fi + + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${CANDIDATE_VLLM_COMMIT}${DOCKER_IMAGE_SUFFIX}" + echo "Checking if image exists: ${DOCKER_IMAGE}" + + if docker manifest inspect "${DOCKER_IMAGE}" > /dev/null 2>&1; then + echo "Found existing HPU image for vLLM commit: ${CANDIDATE_VLLM_COMMIT}" + HEAD_SHA="${CANDIDATE_VLLM_COMMIT}" + FOUND_IMAGE=1 + break + fi + done + + if [[ ${FOUND_IMAGE} == 0 ]]; then + echo "ERROR: No HPU Docker image found in the last 20 versions of VLLM_STABLE_COMMIT" + echo "This likely means ci-infra hasn't successfully built any HPU images yet" + exit 1 fi - - NOT_EXIST=0 - S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" - aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 - - if 
[[ ${NOT_EXIST} == "1" ]]; then - echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" - break - fi - done + else + # For non-HPU devices: Looking back the latest 100 commits + for i in {0..99} + do + # Check if the image is there, if it doesn't then check an older one + # because the commit is too recent + HEAD_SHA=$(git rev-parse --verify HEAD~${i}) + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" + + # No Docker image available yet because the commit is too recent + if ! docker manifest inspect "${DOCKER_IMAGE}"; then + continue + fi + + NOT_EXIST=0 + S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" + aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 + + if [[ ${NOT_EXIST} == "1" ]]; then + echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" + break + fi + done + fi popd fi echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV - # Print the benchmark commit for rereference + # Print the benchmark commit for reference echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" - name: Setup CUDA GPU_FLAG for docker run