From a2ed59c25377ad02faae4bb18ac478f2c2ad8f49 Mon Sep 17 00:00:00 2001
From: Ioana Ghiban
Date: Thu, 9 Oct 2025 15:44:53 +0200
Subject: [PATCH] Add Linux aarch64 Graviton3 (G3) runners to vLLM benchmarks

Add the linux.arm64.m7g.4xlarge (AWS Graviton3) runner to the vLLM benchmark
matrix and teach the workflow to detect arm64 CPUs: DEVICE_NAME is set to
arm64-cpu, the -arm64-cpu Docker image suffix is selected, and ON_ARM64_CPU is
passed into the benchmark container. Also add latency, serving, and throughput
benchmark configs for Arm CPUs under vllm-benchmarks/benchmarks/cpu.

---
 .../scripts/generate_vllm_benchmark_matrix.py |   6 +-
 .../test_generate_vllm_benchmark_matrix.py    |  32 ++++-
 .github/workflows/vllm-benchmark.yml          |  32 +++--
 LICENSE                                       |   3 +
 .../cpu/latency-tests-arm64-cpu.json          |  30 +++++
 .../cpu/serving-tests-arm64-cpu.json          | 121 ++++++++++++++++++
 .../cpu/throughput-tests-arm64-cpu.json       |  32 +++++
 7 files changed, 244 insertions(+), 12 deletions(-)
 create mode 100644 vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json
 create mode 100644 vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json
 create mode 100644 vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json

diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
index a4b35f8c..bf702958 100755
--- a/.github/scripts/generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -18,6 +18,7 @@
         "linux.aws.h100",
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
+        "linux.arm64.m7g.4xlarge",
         "linux.dgx.b200",
     ],
     # NB: There is no 2xH100 runner at the momement, so let's use the next one
@@ -50,6 +51,7 @@
     "linux.rocm.gpu.gfx942.4": "rocm",
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
+    "linux.arm64.m7g.4xlarge": "cpu",
 }
 
 # All the different names vLLM uses to refer to their benchmark configs
@@ -198,8 +200,8 @@ def generate_benchmark_matrix(
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
-    """
+    model name and tensor parallel size (aka the number of GPUs, CPU NUMA nodes
+    on Intel, or CPUs on Arm)"""
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py
index b3459567..659467d5 100644
--- a/.github/scripts/test_generate_vllm_benchmark_matrix.py
+++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():
     # Select multiple runners
     models = []
-    runners = ["h100", "spr"]
+    runners = ["h100", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -234,6 +254,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr"]
+    runners = ["rocm", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -365,6 +389,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
index d4202ed3..efd5971b 100644
--- a/.github/workflows/vllm-benchmark.yml
+++ b/.github/workflows/vllm-benchmark.yml
@@ -25,7 +25,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100,rocm,spr,b200
+        default: h100,rocm,spr,b200,m7g
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -104,8 +104,17 @@ jobs:
           elif command -v rocm-smi; then
             DEVICE_NAME=rocm
             rocm-smi
-          else
-            DEVICE_NAME=cpu
+          else
+            arch=$(uname -m)
+
+            case "$arch" in
+              aarch64|arm64)
+                DEVICE_NAME=arm64-cpu
+                ;;
+              *)
+                DEVICE_NAME=cpu
+                ;;
+            esac
             lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -122,6 +131,8 @@ jobs:
             DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi
           echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
 
@@ -157,6 +168,8 @@ jobs:
             DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-arm64-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
           echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -266,11 +279,13 @@ jobs:
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
+          ON_ARM64_CPU=0
+          ON_CPU=0
+
+          case "$DEVICE_NAME" in
+            cpu) ON_CPU=1 ;;
+            arm64-cpu) ON_ARM64_CPU=1 ;;
+          esac
 
           container_name=$(docker run \
             ${GPU_FLAG:-} \
@@ -283,6 +298,7 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
+            -e ON_ARM64_CPU="${ON_ARM64_CPU}" \
             --ipc=host \
             --tty \
             --detach \
diff --git a/LICENSE b/LICENSE
index b96dcb04..00171360 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2,6 +2,9 @@ MIT License
 
 Copyright (c) Facebook, Inc. and its affiliates.
 
+All contributions by Arm:
+Copyright (c) 2025 Arm Limited and/or its affiliates
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
diff --git a/vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json
new file mode 100644
index 00000000..da93fdd1
--- /dev/null
+++ b/vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json
@@ -0,0 +1,30 @@
+[
+  {
+    "test_name": "latency_llama8B_tp1",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15
+    }
+  },
+  {
+    "test_name": "latency_llama8B_tp4",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15
+    }
+  }
+]
diff --git a/vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json
new file mode 100644
index 00000000..c5f1c27e
--- /dev/null
+++ b/vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json
@@ -0,0 +1,121 @@
+[
+  {
+    "test_name": "serving_llama8B_tp1_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp2_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 2,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp4_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "device": "cpu",
+      "dtype": "bfloat16",
"bfloat16", + "distributed_executor_backend": "mp", + "block_size": 16, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 16, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 100 + } + } +] diff --git a/vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json new file mode 100644 index 00000000..f159c306 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +]