6 changes: 4 additions & 2 deletions .github/scripts/generate_vllm_benchmark_matrix.py
@@ -18,6 +18,7 @@
"linux.aws.h100",
"linux.rocm.gpu.gfx942.1",
"linux.24xl.spr-metal",
"linux.arm64.m7g.4xlarge",
"linux.dgx.b200",
],
# NB: There is no 2xH100 runner at the moment, so let's use the next one
@@ -50,6 +51,7 @@
"linux.rocm.gpu.gfx942.4": "rocm",
"linux.rocm.gpu.gfx942.8": "rocm",
"linux.24xl.spr-metal": "cpu",
"linux.arm64.m7g.4xlarge": "cpu",
}

# All the different names vLLM uses to refer to their benchmark configs
@@ -198,8 +200,8 @@ def generate_benchmark_matrix(
) -> Dict[str, Any]:
"""
Parse all the JSON files in vLLM benchmark configs directory to get the
model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
"""
model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes on Intel,
or CPUs on Arm)"""
benchmark_matrix: Dict[str, Any] = {
"include": [],
}
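For reference, the matrix this function emits pairs each runner with a comma-separated string of models, as the tests below exercise; a minimal sketch of the expected shape, with values copied from those tests:

benchmark_matrix = {
    "include": [
        {
            "runner": "linux.arm64.m7g.4xlarge",
            "models": "meta-llama/meta-llama-3.1-8b-instruct",
        },
        {
            "runner": "linux.24xl.spr-metal",
            "models": "meta-llama/meta-llama-3.1-8b-instruct",
        },
    ]
}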
32 changes: 30 additions & 2 deletions .github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():

# Select multiple runners
models = []
runners = ["h100", "spr"]
runners = ["h100", "spr", "m7g"]
output = json.dumps(
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
)
@@ -234,6 +254,10 @@
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
"meta-llama/meta-llama-3.1-8b-instruct",
"mistralai/mixtral-8x7b-instruct-v0.1",
]
runners = ["rocm", "spr"]
runners = ["rocm", "spr", "m7g"]
output = json.dumps(
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
)
@@ -365,6 +389,10 @@
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
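The tests select runners with short keywords ("h100", "spr", and now "m7g") rather than full names; a plausible sketch of that substring matching, where RUNNERS and select() are illustrative stand-ins for the real logic in generate_vllm_benchmark_matrix.py:

RUNNERS = [
    "linux.aws.h100",
    "linux.24xl.spr-metal",
    "linux.arm64.m7g.4xlarge",
]

def select(keywords):
    # Keep any runner whose full name contains one of the requested keywords.
    return [r for r in RUNNERS if any(k in r for k in keywords)]

assert select(["m7g"]) == ["linux.arm64.m7g.4xlarge"]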
32 changes: 24 additions & 8 deletions .github/workflows/vllm-benchmark.yml
@@ -25,7 +25,7 @@ on:
A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, defaults to running everything)
required: true
type: string
default: h100,rocm,spr,b200
default: h100,rocm,spr,b200,m7g
pull_request:
paths:
- .github/workflows/vllm-benchmark.yml
@@ -104,8 +104,17 @@ jobs:
elif command -v rocm-smi; then
DEVICE_NAME=rocm
rocm-smi
else
DEVICE_NAME=cpu
else
arch=$(uname -m)

case "$arch" in
aarch64|arm64)
DEVICE_NAME=arm64-cpu
;;
*)
DEVICE_NAME=cpu
;;
esac
lscpu
fi
echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -122,6 +131,8 @@ jobs:
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
fi
echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

@@ -157,6 +168,8 @@ jobs:
DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DOCKER_IMAGE_SUFFIX=-cpu
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
DOCKER_IMAGE_SUFFIX=-arm64-cpu
fi
echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -266,11 +279,13 @@ jobs:
run: |
set -eux

if [[ "${DEVICE_NAME}" == "cpu" ]]; then
ON_CPU=1
else
ON_CPU=0
fi
ON_ARM64_CPU=0
ON_CPU=0

case "$DEVICE_NAME" in
cpu) ON_CPU=1 ;;
arm64-cpu) ON_ARM64_CPU=1 ;;
esac

container_name=$(docker run \
${GPU_FLAG:-} \
@@ -283,6 +298,7 @@
-e ENGINE_VERSION \
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-e ON_CPU="${ON_CPU}" \
-e ON_ARM64_CPU="${ON_ARM64_CPU}" \
--ipc=host \
--tty \
--detach \
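Inside the container, the benchmark harness can branch on the two flags the workflow now passes through docker run -e; a hypothetical consumer (the environment variable names match the workflow above, the branching itself is an assumption):

import os

on_cpu = os.environ.get("ON_CPU") == "1"
on_arm64_cpu = os.environ.get("ON_ARM64_CPU") == "1"

if on_arm64_cpu:
    # e.g. pick the *-arm64-cpu.json configs added below (illustrative only)
    config_suffix = "arm64-cpu"
elif on_cpu:
    config_suffix = "cpu"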
3 changes: 3 additions & 0 deletions LICENSE
@@ -2,6 +2,9 @@ MIT License

Copyright (c) Facebook, Inc. and its affiliates.

All contributions by Arm:
Copyright (c) 2025 Arm Limited and/or its affiliates

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
30 changes: 30 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json
@@ -0,0 +1,30 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]
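Each key under "parameters" plausibly becomes a CLI flag for the benchmark script, with underscores turned into hyphens and an empty string marking a bare flag (as in the serving configs below); a hedged sketch of that conversion, with to_cli_args() as an illustrative helper rather than the repo's actual code:

def to_cli_args(params: dict) -> list:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)               # bare flag, e.g. --trust-remote-code
        else:
            args.extend([flag, str(value)])
    return args

# to_cli_args({"tensor_parallel_size": 4, "load_format": "dummy"})
# -> ["--tensor-parallel-size", "4", "--load-format", "dummy"]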
121 changes: 121 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json
@@ -0,0 +1,121 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
}
}
]
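The "qps_list" entries mix integers with the string "inf" for an unthrottled run; Python's float() accepts "inf" directly, which is plausibly how a client normalizes the request rate; a short sketch under that assumption:

qps_list = [1, 4, 16, "inf"]
# float("inf") handles the sentinel, yielding an unbounded request rate.
request_rates = [float(q) for q in qps_list]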
32 changes: 32 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json
@@ -0,0 +1,32 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]
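Since all three new files are hand-maintained JSON, a quick local check that they parse and carry the expected fields can catch typos before CI; a minimal sketch, assuming it runs from the repo root:

import json
import pathlib

for path in pathlib.Path("vllm-benchmarks/benchmarks/cpu").glob("*-arm64-cpu.json"):
    for test in json.loads(path.read_text()):
        assert "test_name" in test
        assert "parameters" in test or "server_parameters" in test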