6 changes: 4 additions & 2 deletions .github/scripts/generate_vllm_benchmark_matrix.py
@@ -18,6 +18,7 @@
"linux.aws.h100",
"linux.rocm.gpu.gfx942.1",
"linux.24xl.spr-metal",
"linux.arm64.m7g.4xlarge",
"linux.dgx.b200",
],
# NB: There is no 2xH100 runner at the moment, so let's use the next one
@@ -50,6 +51,7 @@
"linux.rocm.gpu.gfx942.4": "rocm",
"linux.rocm.gpu.gfx942.8": "rocm",
"linux.24xl.spr-metal": "cpu",
"linux.arm64.m7g.4xlarge": "cpu",
}

# All the different names vLLM uses to refer to their benchmark configs
@@ -198,8 +200,8 @@ def generate_benchmark_matrix(
) -> Dict[str, Any]:
"""
Parse all the JSON files in vLLM benchmark configs directory to get the
model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
"""
model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes on Intel,
or CPUs on Arm)"""
benchmark_matrix: Dict[str, Any] = {
"include": [],
}
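For reference, the matrix this function emits pairs each runner with a comma-separated string of models, as the tests below exercise; a minimal sketch of the expected shape, with values copied from those tests:

benchmark_matrix = {
    "include": [
        {
            "runner": "linux.arm64.m7g.4xlarge",
            "models": "meta-llama/meta-llama-3.1-8b-instruct",
        },
        {
            "runner": "linux.24xl.spr-metal",
            "models": "meta-llama/meta-llama-3.1-8b-instruct",
        },
    ]
}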
32 changes: 30 additions & 2 deletions .github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():

# Select multiple runners
models = []
runners = ["h100", "spr"]
runners = ["h100", "spr", "m7g"]
output = json.dumps(
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
)
@@ -234,6 +254,10 @@
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
"meta-llama/meta-llama-3.1-8b-instruct",
"mistralai/mixtral-8x7b-instruct-v0.1",
]
runners = ["rocm", "spr"]
runners = ["rocm", "spr", "m7g"]
output = json.dumps(
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
)
@@ -365,6 +389,10 @@
"""\
{
"include": [
{
"runner": "linux.arm64.m7g.4xlarge",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
},
{
"runner": "linux.24xl.spr-metal",
"models": "meta-llama/meta-llama-3.1-8b-instruct"
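The tests select runners with short keywords ("h100", "spr", and now "m7g") rather than full names; a plausible sketch of that substring matching, where RUNNERS and select() are illustrative stand-ins for the real logic in generate_vllm_benchmark_matrix.py:

RUNNERS = [
    "linux.aws.h100",
    "linux.24xl.spr-metal",
    "linux.arm64.m7g.4xlarge",
]

def select(keywords):
    # Keep any runner whose full name contains one of the requested keywords.
    return [r for r in RUNNERS if any(k in r for k in keywords)]

assert select(["m7g"]) == ["linux.arm64.m7g.4xlarge"]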
32 changes: 24 additions & 8 deletions .github/workflows/vllm-benchmark.yml
@@ -25,7 +25,7 @@ on:
A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, defaults to running everything)
required: true
type: string
default: h100,rocm,spr,b200
default: h100,rocm,spr,b200,m7g
pull_request:
paths:
- .github/workflows/vllm-benchmark.yml
@@ -104,8 +104,17 @@ jobs:
elif command -v rocm-smi; then
DEVICE_NAME=rocm
rocm-smi
else
DEVICE_NAME=cpu
else
arch=$(uname -m)

case "$arch" in
aarch64|arm64)
DEVICE_NAME=arm64-cpu
;;
*)
DEVICE_NAME=cpu
;;
esac
lscpu
fi
echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -122,6 +131,8 @@ jobs:
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
fi
echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

@@ -157,6 +168,8 @@ jobs:
DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
DOCKER_IMAGE_SUFFIX=-cpu
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
DOCKER_IMAGE_SUFFIX=-arm64-cpu
fi
echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -266,11 +279,13 @@ jobs:
run: |
set -eux

if [[ "${DEVICE_NAME}" == "cpu" ]]; then
ON_CPU=1
else
ON_CPU=0
fi
ON_ARM64_CPU=0
ON_CPU=0

case "$DEVICE_NAME" in
cpu) ON_CPU=1 ;;
arm64-cpu) ON_ARM64_CPU=1 ;;
esac

container_name=$(docker run \
${GPU_FLAG:-} \
@@ -283,6 +298,7 @@
-e ENGINE_VERSION \
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-e ON_CPU="${ON_CPU}" \
-e ON_ARM64_CPU="${ON_ARM64_CPU}" \
--ipc=host \
--tty \
--detach \
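Inside the container, the benchmark harness can branch on the two flags the workflow now passes through docker run -e; a hypothetical consumer (the environment variable names match the workflow above, the branching itself is an assumption):

import os

on_cpu = os.environ.get("ON_CPU") == "1"
on_arm64_cpu = os.environ.get("ON_ARM64_CPU") == "1"

if on_arm64_cpu:
    # e.g. pick the *-arm64-cpu.json configs added below (illustrative only)
    config_suffix = "arm64-cpu"
elif on_cpu:
    config_suffix = "cpu"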
3 changes: 3 additions & 0 deletions LICENSE
@@ -2,6 +2,9 @@ MIT License

Copyright (c) Facebook, Inc. and its affiliates.

All contributions by Arm:
Copyright (c) 2025 Arm Limited and/or its affiliates

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
30 changes: 30 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/latency-tests-arm64-cpu.json
@@ -0,0 +1,30 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]
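Each key under "parameters" plausibly becomes a CLI flag for the benchmark script, with underscores turned into hyphens and an empty string marking a bare flag (as in the serving configs below); a hedged sketch of that conversion, with to_cli_args() as an illustrative helper rather than the repo's actual code:

def to_cli_args(params: dict) -> list:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            args.append(flag)               # bare flag, e.g. --trust-remote-code
        else:
            args.extend([flag, str(value)])
    return args

# to_cli_args({"tensor_parallel_size": 4, "load_format": "dummy"})
# -> ["--tensor-parallel-size", "4", "--load-format", "dummy"]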
121 changes: 121 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/serving-tests-arm64-cpu.json
@@ -0,0 +1,121 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"device": "cpu",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 16,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
}
}
]
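The "qps_list" entries mix integers with the string "inf" for an unthrottled run; Python's float() accepts "inf" directly, which is plausibly how a client normalizes the request rate; a short sketch under that assumption:

qps_list = [1, 4, 16, "inf"]
# float("inf") handles the sentinel, yielding an unbounded request rate.
request_rates = [float(q) for q in qps_list]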
32 changes: 32 additions & 0 deletions vllm-benchmarks/benchmarks/cpu/throughput-tests-arm64-cpu.json
@@ -0,0 +1,32 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]
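Since all three new files are hand-maintained JSON, a quick local check that they parse and carry the expected fields can catch typos before CI; a minimal sketch, assuming it runs from the repo root:

import json
import pathlib

for path in pathlib.Path("vllm-benchmarks/benchmarks/cpu").glob("*-arm64-cpu.json"):
    for test in json.loads(path.read_text()):
        assert "test_name" in test
        assert "parameters" in test or "server_parameters" in test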