From 4964a592a02de0eaea0f931d26d813d1be891f17 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 14:02:23 -0700 Subject: [PATCH 01/57] Added sglang workflow files --- .github/workflows/sglang-benchmark.yml | 340 ++++++++++++++++++ .../benchmarks/cuda/latency-tests.json | 12 + .../benchmarks/cuda/serving-tests.json | 21 ++ .../benchmarks/cuda/throughput-tests.json | 13 + 4 files changed, 386 insertions(+) create mode 100644 .github/workflows/sglang-benchmark.yml create mode 100644 sglang-benchmarks/benchmarks/cuda/latency-tests.json create mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json create mode 100644 sglang-benchmarks/benchmarks/cuda/throughput-tests.json diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml new file mode 100644 index 00000000..d1609425 --- /dev/null +++ b/.github/workflows/sglang-benchmark.yml @@ -0,0 +1,340 @@ +name: SGLang Benchmark + +on: + workflow_dispatch: + inputs: + vllm_branch: + description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request) + required: true + type: string + default: main + vllm_commit: + description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked) + required: false + type: string + sglang_branch: + description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER) + required: true + type: string + default: main + models: + description: | + A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything) + required: false + type: string + runners: + description: | + A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) + required: true + type: string + default: h100 + pull_request: + paths: + - .github/workflows/sglang-benchmark.yml + - sglang-benchmarks/** + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: ubuntu-latest + outputs: + benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Set parameters + id: set-parameters + shell: bash + env: + MODELS: ${{ inputs.models || '' }} + RUNNERS: ${{ inputs.runners || '' }} + run: | + set -eux + + # The generated matrix is grouped by model and runner + python .github/scripts/generate_vllm_benchmark_matrix.py \ + --benchmark-configs-dir sglang-benchmarks/benchmarks \ + --models "${MODELS}" \ + --runners "${RUNNERS}" + + benchmarks: + name: Run SGLang benchmarks + needs: set-parameters + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + environment: pytorch-x-vllm + permissions: + id-token: write + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Checkout vLLM repository + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: sglang-benchmarks/vllm + ref: ${{ inputs.vllm_branch || 'main' }} + fetch-depth: 0 + + - name: Checkout SGLang repository + uses: actions/checkout@v4 + with: + repository: sgl-project/sglang.git + path: sglang-benchmarks/sglang + ref: ${{ inputs.sglang_branch || 'main' }} + fetch-depth: 0 + + - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true + with: + python-version: '3.12' + cache: 'pip' + + - name: Check if the device is supported + shell: bash + run: | + set -eux + + if command -v nvidia-smi; then + DEVICE_NAME=cuda + nvidia-smi + elif command -v rocm-smi; then + DEVICE_NAME=rocm + rocm-smi + else + DEVICE_NAME=cpu + lscpu + fi + echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV + + - name: Set GPU name and type + working-directory: sglang-benchmarks + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "cuda" ]]; then + DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") + fi + echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV + + - name: Install dependencies + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/cu128 + fi + + - name: Install SGLang + working-directory: sglang-benchmarks/sglang + shell: bash + run: | + set -eux + pip install -e "python[all]" + + - name: Set Docker registry + shell: bash + env: + HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} + run: | + set -eux + + # Mimic the logic from vllm ci-infra test template + if [[ "${HEAD_BRANCH}" == "main" ]]; then + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + else + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo + fi + + DOCKER_IMAGE_SUFFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_SUFFIX=-cpu + fi + echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV + + - name: Authenticate with AWS + # Only need for DGX hosts + if: contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Login to public.ecr.aws + # Only need for DGX hosts + if: contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + with: + registry-type: public + + - name: Check for last benchmark commit + working-directory: sglang-benchmarks + env: + HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} + HEAD_SHA: ${{ inputs.vllm_commit || '' }} + MODELS: ${{ matrix.models }} + run: | + set -eux + + if [[ -z "${HEAD_SHA}" ]]; then + pushd vllm + # Looking back the latest 100 commits is enough + for i in {0..99} + do + # Check if the image is there, if it doesn't then check an older one + # because the commit is too recent + HEAD_SHA=$(git rev-parse --verify HEAD~${i}) + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" + + # No Docker image available yet because the commit is too recent + if ! docker manifest inspect "${DOCKER_IMAGE}"; then + continue + fi + + NOT_EXIST=0 + S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" + aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 + + if [[ ${NOT_EXIST} == "1" ]]; then + echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" + break + fi + done + popd + fi + + echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV + + # Print the benchmark commit for rereference + echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + + - name: Setup CUDA GPU_FLAG for docker run + if: env.DEVICE_NAME == 'cuda' + run: | + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + - name: Setup ROCm + if: env.DEVICE_NAME == 'rocm' + uses: pytorch/pytorch/./.github/actions/setup-rocm@main + + - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container + run: | + echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" + + - name: Setup benchmark tests + env: + MODELS: ${{ matrix.models }} + run: | + set -eux + + pushd sglang-benchmarks/vllm + git checkout "${HEAD_SHA}" + rm .buildkite/nightly-benchmarks/tests/*.json || true + popd + + # Set the list of benchmarks we want to cover in this runner + python3 .github/scripts/setup_vllm_benchmark.py \ + --from-benchmark-configs-dir sglang-benchmarks/benchmarks \ + --to-benchmark-configs-dir sglang-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ + --models "${MODELS}" \ + --device "${DEVICE_NAME}" + + pushd sglang-benchmarks/vllm + ls -lah .buildkite/nightly-benchmarks/tests + find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \; + popd + + - name: Run SGLang benchmark + env: + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + HF_TOKEN: ${{ secrets.HF_TOKEN }} + DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} + # SGLang-specific environment variables + HF_HUB_DISABLE_XET: 1 + NIGHTLY_BACKENDS: sglang + CURRENT_LLM_SERVING_ENGINE: sglang + ENGINE_VERSION: v1 + SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 + else + ON_CPU=0 + fi + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ + -e SCCACHE_BUCKET \ + -e SCCACHE_REGION \ + -e DEVICE_NAME \ + -e DEVICE_TYPE \ + -e HF_TOKEN \ + -e HF_HUB_DISABLE_XET \ + -e NIGHTLY_BACKENDS \ + -e CURRENT_LLM_SERVING_ENGINE \ + -e ENGINE_VERSION \ + -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${ON_CPU}" \ + --ipc=host \ + --tty \ + --detach \ + --security-opt seccomp=unconfined \ + --shm-size=4g \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + "${DOCKER_IMAGE}" + ) + + # Set VLLM_SOURCE_CODE inside the container and run SGLang benchmark + docker exec -t "${container_name}" bash -c " + export VLLM_SOURCE_CODE=/tmp/workspace/sglang-benchmarks/vllm + cd sglang-benchmarks/vllm && + bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + " + + - name: Authenticate with AWS + # AWS CUDA runners already have access to the bucket via its runner IAM role + if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results + # The max duration enforced by the server side + role-duration-seconds: 18000 + aws-region: us-east-1 + + # Keep a copy of the benchmark results on GitHub for reference + - uses: actions/upload-artifact@v4 + with: + name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} + path: vllm-benchmarks/vllm/benchmarks/results diff --git a/sglang-benchmarks/benchmarks/cuda/latency-tests.json b/sglang-benchmarks/benchmarks/cuda/latency-tests.json new file mode 100644 index 00000000..ace766b7 --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/latency-tests.json @@ -0,0 +1,12 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json new file mode 100644 index 00000000..e87b9212 --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json @@ -0,0 +1,21 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json new file mode 100644 index 00000000..f339ffef --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json @@ -0,0 +1,13 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] From e6a91f9f2e2b686718da80c76c9b96632ea27fda Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 14:28:24 -0700 Subject: [PATCH 02/57] fixing the source code location --- .github/workflows/sglang-benchmark.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index d1609425..c0e72907 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -283,6 +283,7 @@ jobs: CURRENT_LLM_SERVING_ENGINE: sglang ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + VLLM_SOURCE_CODE: $(pwd) run: | set -eux @@ -306,6 +307,7 @@ jobs: -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ -e ON_CPU="${ON_CPU}" \ + -e VLLM_SOURCE_CODE \ --ipc=host \ --tty \ --detach \ @@ -316,10 +318,10 @@ jobs: "${DOCKER_IMAGE}" ) - # Set VLLM_SOURCE_CODE inside the container and run SGLang benchmark + # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark docker exec -t "${container_name}" bash -c " - export VLLM_SOURCE_CODE=/tmp/workspace/sglang-benchmarks/vllm cd sglang-benchmarks/vllm && + export VLLM_SOURCE_CODE_LOC=$(pwd) bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh " From 843e7cc8f4e15025ef7fc668ff6e69e451622e74 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 15:15:46 -0700 Subject: [PATCH 03/57] fix source code location --- .github/workflows/sglang-benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index c0e72907..96d85c72 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -283,7 +283,7 @@ jobs: CURRENT_LLM_SERVING_ENGINE: sglang ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - VLLM_SOURCE_CODE: $(pwd) + VLLM_SOURCE_CODE: /tmp/workspace/sglang-benchmarks/vllm run: | set -eux @@ -321,7 +321,7 @@ jobs: # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark docker exec -t "${container_name}" bash -c " cd sglang-benchmarks/vllm && - export VLLM_SOURCE_CODE_LOC=$(pwd) + export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh " From 8d4eac45b14ccfe26c78f7d31ecec22a7d66f3da Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 16:02:21 -0700 Subject: [PATCH 04/57] add missing json files --- .github/workflows/sglang-benchmark.yml | 2 +- .../benchmarks/cuda/genai-perf-tests.json | 23 ++ .../benchmarks/cuda/nightly-tests.json | 323 ++++++++++++++++++ 3 files changed, 347 insertions(+), 1 deletion(-) create mode 100644 sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json create mode 100644 sglang-benchmarks/benchmarks/cuda/nightly-tests.json diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 96d85c72..81599caa 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -339,4 +339,4 @@ jobs: - uses: actions/upload-artifact@v4 with: name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} - path: vllm-benchmarks/vllm/benchmarks/results + path: sglang-benchmarks/vllm/benchmarks/results diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json new file mode 100644 index 00000000..a207dc93 --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json new file mode 100644 index 00000000..9fe7b5b1 --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json @@ -0,0 +1,323 @@ +[ + { + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tp": 1, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "enable_torch_compile": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sharegpt", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000, + "reuse_server": false + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_16", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 16, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4_sonnet_512_256", + "qps_list": [4,8,16,32,"inf"], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tp": 4, + "dataset_name": "sonnet", + "dataset_path": "./sonnet_4x.txt", + "num_prompts": 500, + "port": 8000, + "sonnet_input_len": 512, + "sonnet_output_len": 256, + "sonnet_prefix_len": 50, + "reuse_server": true + }, + "lmdeploy_server_parameters": { + "dtype": "bfloat16" + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "bfloat16", + "max_batch_size": 2048, + "max_input_len": 4096, + "max_seq_len": 6144, + "max_num_tokens": 16384, + "trt_llm_version": "v0.11.0" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "vllm_client_parameters": { + }, + "sglang_server_parameters": { + "disable_radix_cache": "", + "dtype": "bfloat16" + }, + "sglang_client_parameters": { + } + } +] From 2230a99c0b67a4a1f2632d1621799ba3390a19e1 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 16:22:14 -0700 Subject: [PATCH 05/57] update params --- sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json | 1 - sglang-benchmarks/benchmarks/cuda/nightly-tests.json | 6 ------ 2 files changed, 7 deletions(-) diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json index a207dc93..ca9027e6 100644 --- a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json @@ -13,7 +13,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json index 9fe7b5b1..9bdc2dee 100644 --- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json @@ -37,7 +37,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -92,7 +91,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -147,7 +145,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -199,7 +196,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -253,7 +249,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -307,7 +302,6 @@ "disable_log_stats": "", "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, From 66d328f84e0a79e55035dc3d6aa06e374d7e0579 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 16:50:32 -0700 Subject: [PATCH 06/57] only run on h100 for now --- .github/workflows/sglang-benchmark.yml | 26 ++- .../benchmarks/cuda/nightly-tests.json | 156 ------------------ 2 files changed, 22 insertions(+), 160 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 81599caa..423fd2cb 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -55,7 +55,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - RUNNERS: ${{ inputs.runners || '' }} + RUNNERS: ${{ inputs.runners || 'h100' }} run: | set -eux @@ -335,8 +335,26 @@ jobs: role-duration-seconds: 18000 aws-region: us-east-1 - # Keep a copy of the benchmark results on GitHub for reference - - uses: actions/upload-artifact@v4 + - name: Create results summary + if: always() + run: | + RESULTS_DIR="sglang-benchmarks/vllm/benchmarks/results" + if [ -d "$RESULTS_DIR" ]; then + echo "## Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY + echo "- Device: ${{ env.DEVICE_TYPE }}" >> $GITHUB_STEP_SUMMARY + echo "- Models: ${{ matrix.models }}" >> $GITHUB_STEP_SUMMARY + echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY + find "$RESULTS_DIR" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ No benchmark results found in $RESULTS_DIR" >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + if: always() with: - name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} + name: sglang-benchmark-results-${{ matrix.runner }}-${{ matrix.models }} path: sglang-benchmarks/vllm/benchmarks/results + retention-days: 30 diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json index 9bdc2dee..10bed8ab 100644 --- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json @@ -157,161 +157,5 @@ }, "sglang_client_parameters": { } - }, - { - "test_name": "llama70B_tp4_sharegpt", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tp": 4, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000, - "reuse_server": false - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } - }, - { - "test_name": "llama70B_tp4_sonnet_512_16", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tp": 4, - "dataset_name": "sonnet", - "dataset_path": "./sonnet_4x.txt", - "num_prompts": 500, - "port": 8000, - "sonnet_input_len": 512, - "sonnet_output_len": 16, - "sonnet_prefix_len": 50, - "reuse_server": true - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } - }, - { - "test_name": "llama70B_tp4_sonnet_512_256", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tp": 4, - "dataset_name": "sonnet", - "dataset_path": "./sonnet_4x.txt", - "num_prompts": 500, - "port": 8000, - "sonnet_input_len": 512, - "sonnet_output_len": 256, - "sonnet_prefix_len": 50, - "reuse_server": true - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } } ] From 5938df2d9b72da243f3b784deaf3a5cb849efe11 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 17:46:45 -0700 Subject: [PATCH 07/57] fix serving model engine to sglang --- .github/workflows/sglang-benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 423fd2cb..72fb4960 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -279,7 +279,6 @@ jobs: DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} # SGLang-specific environment variables HF_HUB_DISABLE_XET: 1 - NIGHTLY_BACKENDS: sglang CURRENT_LLM_SERVING_ENGINE: sglang ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 @@ -302,7 +301,6 @@ jobs: -e DEVICE_TYPE \ -e HF_TOKEN \ -e HF_HUB_DISABLE_XET \ - -e NIGHTLY_BACKENDS \ -e CURRENT_LLM_SERVING_ENGINE \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ @@ -318,9 +316,11 @@ jobs: "${DOCKER_IMAGE}" ) - # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark + # Run SGLang benchmark with proper environment variables docker exec -t "${container_name}" bash -c " + touch /sgl-workspace && cd sglang-benchmarks/vllm && + export CURRENT_LLM_SERVING_ENGINE=sglang && export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh " From 56558ec6ba96ec96e1e2f9b5665f4abd2e0c0259 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 17:53:22 -0700 Subject: [PATCH 08/57] sanitized results section --- .github/workflows/sglang-benchmark.yml | 36 ++++++++++++++++++-------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 72fb4960..da79f40e 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -335,26 +335,40 @@ jobs: role-duration-seconds: 18000 aws-region: us-east-1 - - name: Create results summary + - name: Upload the benchmark results if: always() + env: + BENCHMARK_RESULTS: sglang-benchmarks/vllm/benchmarks/results + MODELS: ${{ matrix.models }} run: | - RESULTS_DIR="sglang-benchmarks/vllm/benchmarks/results" - if [ -d "$RESULTS_DIR" ]; then - echo "## Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY - echo "- Device: ${{ env.DEVICE_TYPE }}" >> $GITHUB_STEP_SUMMARY - echo "- Models: ${{ matrix.models }}" >> $GITHUB_STEP_SUMMARY + set -eux + + sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true + ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found" + + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") + SANITIZED_MODELS="${MODELS//\//_}" + + # Create results summary + if [ -d "${BENCHMARK_RESULTS}" ]; then + echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY + echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY + echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY - find "$RESULTS_DIR" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY + find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY else - echo "⚠️ No benchmark results found in $RESULTS_DIR" >> $GITHUB_STEP_SUMMARY + echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY fi - - name: Upload benchmark results - uses: actions/upload-artifact@v4 + echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV + echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV + + # Keep a copy of the benchmark results on GitHub for reference + - uses: actions/upload-artifact@v4 if: always() with: - name: sglang-benchmark-results-${{ matrix.runner }}-${{ matrix.models }} + name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} path: sglang-benchmarks/vllm/benchmarks/results retention-days: 30 From 0e0998ae1a88c3971e897d4e6f55ec8a509d3e6c Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 19:31:56 -0700 Subject: [PATCH 09/57] fix sglang issues --- .github/workflows/sglang-benchmark.yml | 31 ++++++++++++++++---------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index da79f40e..983d2a40 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -148,13 +148,6 @@ jobs: --extra-index-url https://download.pytorch.org/whl/cu128 fi - - name: Install SGLang - working-directory: sglang-benchmarks/sglang - shell: bash - run: | - set -eux - pip install -e "python[all]" - - name: Set Docker registry shell: bash env: @@ -316,12 +309,26 @@ jobs: "${DOCKER_IMAGE}" ) - # Run SGLang benchmark with proper environment variables + echo "container_name=${container_name}" >> $GITHUB_ENV + + # Install SGLang inside the container and run benchmark docker exec -t "${container_name}" bash -c " - touch /sgl-workspace && - cd sglang-benchmarks/vllm && - export CURRENT_LLM_SERVING_ENGINE=sglang && - export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm && + set -eux + + # Install SGLang inside the container + cd /tmp/workspace/sglang-benchmarks/sglang + pip install -e 'python[all]' + + # (TODO: Remove this once verified) + python3 -c 'import sglang; print(\"SGLang installed successfully\")' + + # Create SGLang workspace marker and set environment + touch /sgl-workspace + export CURRENT_LLM_SERVING_ENGINE=sglang + export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm + + # Run the benchmark + cd /tmp/workspace/sglang-benchmarks/vllm bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh " From 5a18dc53c3380ae81fd8cdc8ffd207b262ccb621 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 20 Aug 2025 23:29:05 -0700 Subject: [PATCH 10/57] remove unused files for now --- .../benchmarks/cuda/latency-tests.json | 12 ----------- .../benchmarks/cuda/serving-tests.json | 21 ------------------- .../benchmarks/cuda/throughput-tests.json | 13 ------------ 3 files changed, 46 deletions(-) delete mode 100644 sglang-benchmarks/benchmarks/cuda/latency-tests.json delete mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json delete mode 100644 sglang-benchmarks/benchmarks/cuda/throughput-tests.json diff --git a/sglang-benchmarks/benchmarks/cuda/latency-tests.json b/sglang-benchmarks/benchmarks/cuda/latency-tests.json deleted file mode 100644 index ace766b7..00000000 --- a/sglang-benchmarks/benchmarks/cuda/latency-tests.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "test_name": "latency_llama8B_tp1", - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - } -] diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json deleted file mode 100644 index e87b9212..00000000 --- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json +++ /dev/null @@ -1,21 +0,0 @@ -[ - { - "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - } -] diff --git a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json deleted file mode 100644 index f339ffef..00000000 --- a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "test_name": "throughput_llama8B_tp1", - "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - } -] From 4b03ad0a5b890d7898cba7501944cc8727c3b8d7 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 19:53:48 -0700 Subject: [PATCH 11/57] updated workflow execution --- .../run-sglang-performance-benchmarks.sh | 346 ++++++++++++++++++ .github/workflows/sglang-benchmark.yml | 194 ++-------- .../benchmarks/cuda/serving-tests.json | 77 ++++ 3 files changed, 449 insertions(+), 168 deletions(-) create mode 100644 .github/scripts/run-sglang-performance-benchmarks.sh create mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh new file mode 100644 index 00000000..dc96ae0b --- /dev/null +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the sglang-benchmarks/benchmarks/ directory +# Benchmarking results will be available inside sglang-benchmarks/benchmarks/results/ + +# Do not set -e, as some models may crash occasionally +# and we still want to see other benchmarking results even when some models crash. +set -x +set -o pipefail + +check_gpus() { + if command -v nvidia-smi; then + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + elif command -v amd-smi; then + declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + fi + + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + if command -v nvidia-smi; then + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif command -v amd-smi; then + declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + fi + echo "GPU type is $gpu_type" +} + +check_cpus() { + # check the number of CPUs and NUMA Node and GPU type. + declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') + if [[ $numa_count -gt 0 ]]; then + echo "NUMA found." + echo $numa_count + else + echo "Need at least 1 NUMA to run benchmarking." + exit 1 + fi + declare -g gpu_type="cpu" + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +ensure_sharegpt_downloaded() { + local FILE=ShareGPT_V3_unfiltered_cleaned_split.json + if [ ! -f "$FILE" ]; then + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE + else + echo "$FILE already exists." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +json2envs() { + # transforms the JSON string to environment variables. + # example: + # input: { "SGLANG_DISABLE_CUDA_GRAPH": 1 } + # output: SGLANG_DISABLE_CUDA_GRAPH=1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map((.key ) + "=" + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for sglang server to start + # return 1 if sglang server crashes + timeout 1200 bash -c ' + until curl -s localhost:30000/v1/completions > /dev/null; do + sleep 1 + done' && return 0 || return 1 +} + +kill_processes_launched_by_current_bash() { + # Kill all python processes launched from current bash script + current_shell_pid=$$ + processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') + if [ -n "$processes" ]; then + echo "Killing the following processes matching '$1':" + echo "$processes" + echo "$processes" | xargs kill -9 + else + echo "No processes found matching '$1'." + fi +} + +kill_gpu_processes() { +# ps -aux + lsof -t -i:30000 | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + pgrep python | xargs -r kill -9 + pgrep -f "sglang" | xargs -r kill -9 + + # wait until GPU memory usage smaller than 1GB + if command -v nvidia-smi; then + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do + sleep 1 + done + elif command -v amd-smi; then + while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do + sleep 1 + done + fi +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent + if command -v buildkite-agent >/dev/null 2>&1; then + BUILDKITE_AGENT_COMMAND="buildkite-agent" + elif [ -f /workspace/buildkite-agent ]; then + BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" + else + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + + # Use the determined command to annotate and upload artifacts + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "sglang-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" + $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" +} + +run_serving_tests() { + # run serving tests using `sglang.bench_serving` command + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + server_envs=$(echo "$params" | jq -r '.server_environment_variables') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + server_envs=$(json2envs "$server_envs") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # Extract only specific SGLang server parameters + model_path=$(echo "$server_params" | jq -r '.model_path // .model') + context_length=$(echo "$server_params" | jq -r '.context_length // 4096') + + # check if there is enough resources to run the test + tp=$(echo "$server_params" | jq -r '.tp // 1') + if [ "$ON_CPU" == "1" ]; then + if [[ $numa_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + continue + fi + else + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + fi + + # check if server model and client model is aligned + server_model="$model_path" + client_model=$(echo "$client_params" | jq -r '.model // .model_path') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." + continue + fi + + server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + bash -c "$server_command" & + server_pid=$! + + # wait until the server is alive + if wait_for_server; then + echo "" + echo "SGLang server is up and running." + else + echo "" + echo "SGLang failed to start within the timeout period." + kill -9 $server_pid + continue + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + echo "new test name $new_test_name" + + # SGLang bench_serving command + client_command="python3 -m sglang.bench_serving \ + --backend sglang \ + --dataset-name sharegpt \ + --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ + --model $client_model \ + --request-rate $qps \ + --port 30000 \ + --output-file $RESULTS_FOLDER/${new_test_name}.json \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill -9 $server_pid + kill_gpu_processes + done +} + +main() { + local ARCH + ARCH='' + if [ "$ON_CPU" == "1" ];then + check_cpus + ARCH='-cpu' + else + check_gpus + fi + check_hf_token + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which lsof) || (apt-get update && apt-get install -y lsof) + + # get the current IP address, required by SGLang bench commands + export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}') + # turn off the reporting of the status of each request, to clean up the terminal output + export SGLANG_LOGGING_LEVEL="WARNING" + + # prepare for benchmarking + ensure_sharegpt_downloaded + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=tests/ + + # benchmarking - look for test files in the tests/ directory + if [ -f "$BENCHMARK_ROOT/serving-tests$ARCH.json" ]; then + run_serving_tests "$BENCHMARK_ROOT/serving-tests$ARCH.json" + elif [ -f "$BENCHMARK_ROOT/serving-tests.json" ]; then + run_serving_tests "$BENCHMARK_ROOT/serving-tests.json" + else + echo "No serving test file found" + fi + + # postprocess benchmarking results + pip install tabulate pandas + + # Create a simple markdown summary of results + echo "# SGLang Benchmark Results" > "$RESULTS_FOLDER/benchmark_results.md" + echo "" >> "$RESULTS_FOLDER/benchmark_results.md" + echo "## Test Results Summary" >> "$RESULTS_FOLDER/benchmark_results.md" + echo "" >> "$RESULTS_FOLDER/benchmark_results.md" + + # List all JSON result files + if ls "$RESULTS_FOLDER"/*.json 1> /dev/null 2>&1; then + echo "### Generated Result Files:" >> "$RESULTS_FOLDER/benchmark_results.md" + for file in "$RESULTS_FOLDER"/*.json; do + echo "- $(basename "$file")" >> "$RESULTS_FOLDER/benchmark_results.md" + done + else + echo "No JSON result files were generated." >> "$RESULTS_FOLDER/benchmark_results.md" + fi + + upload_to_buildkite +} + +main "$@" diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 983d2a40..0b4328fd 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -3,15 +3,6 @@ name: SGLang Benchmark on: workflow_dispatch: inputs: - vllm_branch: - description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request) - required: true - type: string - default: main - vllm_commit: - description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked) - required: false - type: string sglang_branch: description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER) required: true @@ -80,14 +71,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Checkout vLLM repository - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: sglang-benchmarks/vllm - ref: ${{ inputs.vllm_branch || 'main' }} - fetch-depth: 0 - - name: Checkout SGLang repository uses: actions/checkout@v4 with: @@ -148,29 +131,6 @@ jobs: --extra-index-url https://download.pytorch.org/whl/cu128 fi - - name: Set Docker registry - shell: bash - env: - HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} - run: | - set -eux - - # Mimic the logic from vllm ci-infra test template - if [[ "${HEAD_BRANCH}" == "main" ]]; then - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - else - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo - fi - - DOCKER_IMAGE_SUFFIX="" - if [[ "${DEVICE_NAME}" == "rocm" ]]; then - DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci - elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DOCKER_IMAGE_SUFFIX=-cpu - fi - echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV - echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - - name: Authenticate with AWS # Only need for DGX hosts if: contains(env.DEVICE_TYPE, 'B200') @@ -187,59 +147,23 @@ jobs: with: registry-type: public - - name: Check for last benchmark commit + - name: Install vLLM and SGLang working-directory: sglang-benchmarks - env: - HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} - HEAD_SHA: ${{ inputs.vllm_commit || '' }} - MODELS: ${{ matrix.models }} + shell: bash run: | set -eux - if [[ -z "${HEAD_SHA}" ]]; then - pushd vllm - # Looking back the latest 100 commits is enough - for i in {0..99} - do - # Check if the image is there, if it doesn't then check an older one - # because the commit is too recent - HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" - - # No Docker image available yet because the commit is too recent - if ! docker manifest inspect "${DOCKER_IMAGE}"; then - continue - fi - - NOT_EXIST=0 - S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" - aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 - - if [[ ${NOT_EXIST} == "1" ]]; then - echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" - break - fi - done - popd - fi - - echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV + # Install vLLM + pip install vllm - # Print the benchmark commit for rereference - echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + # Install SGLang from source + pushd sglang + pip install -e "python[all]" - - name: Setup CUDA GPU_FLAG for docker run - if: env.DEVICE_NAME == 'cuda' - run: | - echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" - - - name: Setup ROCm - if: env.DEVICE_NAME == 'rocm' - uses: pytorch/pytorch/./.github/actions/setup-rocm@main - - - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container - run: | - echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" + # Verify installations + python3 -c "import vllm; print('vLLM installed successfully')" + python3 -c "import sglang; print('SGLang installed successfully')" + popd - name: Setup benchmark tests env: @@ -247,105 +171,39 @@ jobs: run: | set -eux - pushd sglang-benchmarks/vllm - git checkout "${HEAD_SHA}" - rm .buildkite/nightly-benchmarks/tests/*.json || true - popd + # Create benchmarks directory structure + mkdir -p sglang-benchmarks/benchmarks/results + mkdir -p sglang-benchmarks/benchmarks/tests # Set the list of benchmarks we want to cover in this runner python3 .github/scripts/setup_vllm_benchmark.py \ --from-benchmark-configs-dir sglang-benchmarks/benchmarks \ - --to-benchmark-configs-dir sglang-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ + --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \ --models "${MODELS}" \ --device "${DEVICE_NAME}" - pushd sglang-benchmarks/vllm - ls -lah .buildkite/nightly-benchmarks/tests - find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \; - popd + ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found" + find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" - name: Run SGLang benchmark env: - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - SCCACHE_REGION: us-east-1 HF_TOKEN: ${{ secrets.HF_TOKEN }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} - # SGLang-specific environment variables - HF_HUB_DISABLE_XET: 1 - CURRENT_LLM_SERVING_ENGINE: sglang - ENGINE_VERSION: v1 - SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - VLLM_SOURCE_CODE: /tmp/workspace/sglang-benchmarks/vllm + MODELS: ${{ matrix.models }} run: | set -eux - if [[ "${DEVICE_NAME}" == "cpu" ]]; then - ON_CPU=1 - else - ON_CPU=0 - fi - - container_name=$(docker run \ - ${GPU_FLAG:-} \ - ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ - -e SCCACHE_BUCKET \ - -e SCCACHE_REGION \ - -e DEVICE_NAME \ - -e DEVICE_TYPE \ - -e HF_TOKEN \ - -e HF_HUB_DISABLE_XET \ - -e CURRENT_LLM_SERVING_ENGINE \ - -e ENGINE_VERSION \ - -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - -e ON_CPU="${ON_CPU}" \ - -e VLLM_SOURCE_CODE \ - --ipc=host \ - --tty \ - --detach \ - --security-opt seccomp=unconfined \ - --shm-size=4g \ - -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -w /tmp/workspace \ - "${DOCKER_IMAGE}" - ) - - echo "container_name=${container_name}" >> $GITHUB_ENV - - # Install SGLang inside the container and run benchmark - docker exec -t "${container_name}" bash -c " - set -eux - - # Install SGLang inside the container - cd /tmp/workspace/sglang-benchmarks/sglang - pip install -e 'python[all]' - - # (TODO: Remove this once verified) - python3 -c 'import sglang; print(\"SGLang installed successfully\")' - - # Create SGLang workspace marker and set environment - touch /sgl-workspace - export CURRENT_LLM_SERVING_ENGINE=sglang - export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm - - # Run the benchmark - cd /tmp/workspace/sglang-benchmarks/vllm - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - " + # Set environment variables for SGLang + export CURRENT_LLM_SERVING_ENGINE=sglang + export SGLANG_SOURCE_CODE_LOC=$(pwd)/sglang-benchmarks/sglang - - name: Authenticate with AWS - # AWS CUDA runners already have access to the bucket via its runner IAM role - if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200') - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results - # The max duration enforced by the server side - role-duration-seconds: 18000 - aws-region: us-east-1 + # Run the SGLang benchmark script + cd sglang-benchmarks/benchmarks + bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - name: Upload the benchmark results if: always() env: - BENCHMARK_RESULTS: sglang-benchmarks/vllm/benchmarks/results + BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results MODELS: ${{ matrix.models }} run: | set -eux @@ -377,5 +235,5 @@ jobs: if: always() with: name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} - path: sglang-benchmarks/vllm/benchmarks/results + path: sglang-benchmarks/benchmarks/results retention-days: 30 diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json new file mode 100644 index 00000000..6b786de8 --- /dev/null +++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json @@ -0,0 +1,77 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "sglang", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "sglang", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "sglang", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "sglang", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] From de0d0f84bf9d33e247c03016b92b7040fe807d3f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 19:54:52 -0700 Subject: [PATCH 12/57] remove comment --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index dc96ae0b..e35bec1a 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -122,7 +122,7 @@ kill_processes_launched_by_current_bash() { } kill_gpu_processes() { -# ps -aux + ps -aux lsof -t -i:30000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 pgrep python | xargs -r kill -9 From 2b4325a2d29829d4ec38d6db14cffd1791767e16 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 21:10:15 -0700 Subject: [PATCH 13/57] trying a different method --- .github/workflows/sglang-benchmark.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 0b4328fd..fbbbb54d 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -153,13 +153,13 @@ jobs: run: | set -eux - # Install vLLM - pip install vllm - # Install SGLang from source pushd sglang pip install -e "python[all]" + # Install vLLM + pip install vllm + # Verify installations python3 -c "import vllm; print('vLLM installed successfully')" python3 -c "import sglang; print('SGLang installed successfully')" @@ -194,7 +194,6 @@ jobs: # Set environment variables for SGLang export CURRENT_LLM_SERVING_ENGINE=sglang - export SGLANG_SOURCE_CODE_LOC=$(pwd)/sglang-benchmarks/sglang # Run the SGLang benchmark script cd sglang-benchmarks/benchmarks From 5ca157c71c605110984d6d4e69446bb4bbf0f293 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 22:02:20 -0700 Subject: [PATCH 14/57] fix numa installation issue --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index e35bec1a..e9895fb2 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -299,7 +299,7 @@ main() { # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) - (which lsof) || (apt-get update && apt-get install -y lsof) + (which lsof) || (apt-get update && apt-get install -y lsof libnuma-dev) # get the current IP address, required by SGLang bench commands export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}') From e40a38e75bd8efd5bb7cb74eb15291fd0b9a401f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 22:35:17 -0700 Subject: [PATCH 15/57] fix issues --- .github/scripts/run-sglang-performance-benchmarks.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index e9895fb2..9e9caf16 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -299,7 +299,8 @@ main() { # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) - (which lsof) || (apt-get update && apt-get install -y lsof libnuma-dev) + (which lsof) || (apt-get update && apt-get install -y lsof) + (apt-get install -y libnuma-dev) # get the current IP address, required by SGLang bench commands export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}') From 7b763ac9506a15215f582563bdfdebae3ac56782 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 22:55:12 -0700 Subject: [PATCH 16/57] fix package --- .github/scripts/run-sglang-performance-benchmarks.sh | 1 - .github/workflows/sglang-benchmark.yml | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 9e9caf16..e35bec1a 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -300,7 +300,6 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) (which lsof) || (apt-get update && apt-get install -y lsof) - (apt-get install -y libnuma-dev) # get the current IP address, required by SGLang bench commands export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}') diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index fbbbb54d..648a90af 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,6 +41,10 @@ jobs: with: python-version: '3.12' + - name: Install Packages for SGLang + run: | + sudo apt-get install -y libnuma-dev + - name: Set parameters id: set-parameters shell: bash From 79d4ccf82ecc3c7d71c82b6cdc89a1c1926c9c8f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 23:13:18 -0700 Subject: [PATCH 17/57] fix package --- .github/workflows/sglang-benchmark.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 648a90af..6ca63795 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,10 +41,6 @@ jobs: with: python-version: '3.12' - - name: Install Packages for SGLang - run: | - sudo apt-get install -y libnuma-dev - - name: Set parameters id: set-parameters shell: bash @@ -90,6 +86,10 @@ jobs: python-version: '3.12' cache: 'pip' + - name: Install Packages for SGLang + run: | + sudo apt-get install -y libnuma-dev + - name: Check if the device is supported shell: bash run: | From cd6456865fa3ff2926188677b99ac73a8bfcd767 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 23:19:06 -0700 Subject: [PATCH 18/57] fix package --- .github/workflows/sglang-benchmark.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 6ca63795..c8936981 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,6 +41,10 @@ jobs: with: python-version: '3.12' + - name: Install Packages for SGLang + run: | + sudo apt-get install -y libnuma-dev + - name: Set parameters id: set-parameters shell: bash @@ -74,7 +78,7 @@ jobs: - name: Checkout SGLang repository uses: actions/checkout@v4 with: - repository: sgl-project/sglang.git + repository: sgl-project/sglang path: sglang-benchmarks/sglang ref: ${{ inputs.sglang_branch || 'main' }} fetch-depth: 0 @@ -86,10 +90,6 @@ jobs: python-version: '3.12' cache: 'pip' - - name: Install Packages for SGLang - run: | - sudo apt-get install -y libnuma-dev - - name: Check if the device is supported shell: bash run: | From aeecd6ae03364cd1e2e6ac7dd2446ffb4290a4e3 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 23:31:57 -0700 Subject: [PATCH 19/57] fix package --- .github/workflows/sglang-benchmark.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index c8936981..3a2bb7b5 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -44,6 +44,7 @@ jobs: - name: Install Packages for SGLang run: | sudo apt-get install -y libnuma-dev + sudo apt install numactl - name: Set parameters id: set-parameters From d121b3419880a2697452169ba2311581b9c62a2c Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 23:38:16 -0700 Subject: [PATCH 20/57] fix package --- .github/workflows/sglang-benchmark.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 3a2bb7b5..74b6b9d1 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -43,8 +43,7 @@ jobs: - name: Install Packages for SGLang run: | - sudo apt-get install -y libnuma-dev - sudo apt install numactl + sudo apt-get install -y libnuma-dev numactl - name: Set parameters id: set-parameters From 2498860fe8eaf8857c20cb207261910c321eda5c Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Thu, 21 Aug 2025 23:57:01 -0700 Subject: [PATCH 21/57] fix package --- .github/workflows/sglang-benchmark.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 74b6b9d1..c82982b6 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,10 +41,6 @@ jobs: with: python-version: '3.12' - - name: Install Packages for SGLang - run: | - sudo apt-get install -y libnuma-dev numactl - - name: Set parameters id: set-parameters shell: bash @@ -75,6 +71,12 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Install system dependencies + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y libnuma-dev numactl + - name: Checkout SGLang repository uses: actions/checkout@v4 with: From b3800a2a2a70d152c724e09cdf77f5dbd3d9be41 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Fri, 22 Aug 2025 00:14:40 -0700 Subject: [PATCH 22/57] fix package --- .github/workflows/sglang-benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index c82982b6..060ccafc 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -159,13 +159,13 @@ jobs: run: | set -eux + # Install vLLM + pip install vllm + # Install SGLang from source pushd sglang pip install -e "python[all]" - # Install vLLM - pip install vllm - # Verify installations python3 -c "import vllm; print('vLLM installed successfully')" python3 -c "import sglang; print('SGLang installed successfully')" From 564c0b5a970a1bdb0c61411768ebcbacad0b06d7 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Fri, 22 Aug 2025 00:39:57 -0700 Subject: [PATCH 23/57] fix process --- .github/scripts/run-sglang-performance-benchmarks.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index e35bec1a..b96cb78d 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -126,7 +126,6 @@ kill_gpu_processes() { lsof -t -i:30000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 pgrep python | xargs -r kill -9 - pgrep -f "sglang" | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB if command -v nvidia-smi; then From 2e0eb3d5aea731388a81a643d907d1878fe92d29 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Fri, 22 Aug 2025 10:49:22 -0700 Subject: [PATCH 24/57] replace sglang benchmarking command with vllm bench serve --- .../run-sglang-performance-benchmarks.sh | 5 +- .../benchmarks/cuda/serving-tests.json | 61 +------------------ 2 files changed, 4 insertions(+), 62 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index b96cb78d..e4f49419 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -248,9 +248,8 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps echo "new test name $new_test_name" - # SGLang bench_serving command - client_command="python3 -m sglang.bench_serving \ - --backend sglang \ + # Bench serving command + client_command="vllm bench serve \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ --model $client_model \ diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json index 6b786de8..e2c30eca 100644 --- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json @@ -1,7 +1,7 @@ [ { "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": [1, 4], "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, @@ -11,64 +11,7 @@ }, "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "backend": "sglang", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama70B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "swap_space": 16, - "disable_log_stats": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "backend": "sglang", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_mixtral8x7B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tensor_parallel_size": 2, - "swap_space": 16, - "disable_log_stats": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "backend": "sglang", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama70B_tp4_sharegpt_specdecode", - "qps_list": [2], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "swap_space": 16, - "speculative_config": { - "model": "turboderp/Qwama-0.5B-Instruct", - "num_speculative_tokens": 4, - "draft_tensor_parallel_size": 1 - } - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "backend": "sglang", + "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 From 1dd83dc8e0aabca0813986894aa3b00e759904fc Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Fri, 22 Aug 2025 16:37:41 -0700 Subject: [PATCH 25/57] fix import --- .github/workflows/sglang-benchmark.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 060ccafc..56d27dc4 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -135,6 +135,7 @@ jobs: else pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/cu128 + pip install flash-attn --no-build-isolation fi - name: Authenticate with AWS From 5aa0db18e9d5f67033327662bbb050c1fd96d69f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Sun, 24 Aug 2025 20:52:33 -0700 Subject: [PATCH 26/57] running vllm through docker --- .github/workflows/sglang-benchmark.yml | 74 ++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 56d27dc4..680baf39 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,6 +41,9 @@ jobs: with: python-version: '3.12' + - name: Install uv + uses: astral-sh/setup-uv@v6 + - name: Set parameters id: set-parameters shell: bash @@ -135,8 +138,36 @@ jobs: else pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/cu128 - pip install flash-attn --no-build-isolation + pip install flashinfer-python + fi + + - name: Set Docker registry + shell: bash + run: | + set -eux + + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + DOCKER_IMAGE_SUFFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_SUFFIX=-cpu fi + echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV + + - name: Setup CUDA GPU_FLAG for docker run + if: env.DEVICE_NAME == 'cuda' + run: | + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + - name: Setup ROCm + if: env.DEVICE_NAME == 'rocm' + uses: pytorch/pytorch/./.github/actions/setup-rocm@main + + - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container + run: | + echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" - name: Authenticate with AWS # Only need for DGX hosts @@ -154,23 +185,18 @@ jobs: with: registry-type: public - - name: Install vLLM and SGLang + - name: Install SGLang working-directory: sglang-benchmarks shell: bash run: | set -eux - # Install vLLM - pip install vllm - # Install SGLang from source pushd sglang pip install -e "python[all]" # Verify installations - python3 -c "import vllm; print('vLLM installed successfully')" python3 -c "import sglang; print('SGLang installed successfully')" - popd - name: Setup benchmark tests env: @@ -196,15 +222,43 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} MODELS: ${{ matrix.models }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + ENGINE_VERSION: v1 + SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 run: | set -eux # Set environment variables for SGLang export CURRENT_LLM_SERVING_ENGINE=sglang - # Run the SGLang benchmark script - cd sglang-benchmarks/benchmarks - bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 + else + ON_CPU=0 + fi + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ + -e SCCACHE_BUCKET \ + -e SCCACHE_REGION \ + -e DEVICE_NAME \ + -e DEVICE_TYPE \ + -e HF_TOKEN \ + -e ENGINE_VERSION \ + -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${ON_CPU}" \ + --ipc=host \ + --tty \ + --detach \ + --security-opt seccomp=unconfined \ + --shm-size=4g \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" - name: Upload the benchmark results if: always() From 403e20da80ab543d60dacb0c4e4b8db044256cc5 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Sun, 24 Aug 2025 21:29:05 -0700 Subject: [PATCH 27/57] add docker image --- .github/workflows/sglang-benchmark.yml | 40 ++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 680baf39..ae5e5adc 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -185,6 +185,46 @@ jobs: with: registry-type: public + - name: Check for latest vLLM commit with Docker image + working-directory: sglang-benchmarks + env: + HEAD_BRANCH: main + HEAD_SHA: '' + MODELS: ${{ matrix.models }} + run: | + set -eux + + # Clone vLLM repository to get the latest commit + git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp + pushd vllm-temp + + # Looking back the latest 100 commits is enough + for i in {0..99} + do + # Check if the image is there, if it doesn't then check an older one + # because the commit is too recent + HEAD_SHA=$(git rev-parse --verify HEAD~${i}) + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" + + # No Docker image available yet because the commit is too recent + if ! docker manifest inspect "${DOCKER_IMAGE}"; then + continue + fi + + echo "Found vLLM commit ${HEAD_SHA} with available Docker image" + break + done + popd + + # Clean up temporary vLLM repo + rm -rf vllm-temp + + echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV + echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV + + # Print the benchmark commit for reference + echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + - name: Install SGLang working-directory: sglang-benchmarks shell: bash From f01f72fca43f9d11fedfa541c45639a52a4456ca Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Sun, 24 Aug 2025 22:22:57 -0700 Subject: [PATCH 28/57] test sglang docker image --- .github/workflows/sglang-benchmark.yml | 71 ++---- .github/workflows/sglang-v2.yml | 339 +++++++++++++++++++++++++ 2 files changed, 358 insertions(+), 52 deletions(-) create mode 100644 .github/workflows/sglang-v2.yml diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index ae5e5adc..3dd88bdb 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,9 +41,6 @@ jobs: with: python-version: '3.12' - - name: Install uv - uses: astral-sh/setup-uv@v6 - - name: Set parameters id: set-parameters shell: bash @@ -185,58 +182,16 @@ jobs: with: registry-type: public - - name: Check for latest vLLM commit with Docker image - working-directory: sglang-benchmarks - env: - HEAD_BRANCH: main - HEAD_SHA: '' - MODELS: ${{ matrix.models }} - run: | - set -eux - - # Clone vLLM repository to get the latest commit - git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp - pushd vllm-temp - - # Looking back the latest 100 commits is enough - for i in {0..99} - do - # Check if the image is there, if it doesn't then check an older one - # because the commit is too recent - HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" - - # No Docker image available yet because the commit is too recent - if ! docker manifest inspect "${DOCKER_IMAGE}"; then - continue - fi - - echo "Found vLLM commit ${HEAD_SHA} with available Docker image" - break - done - popd - - # Clean up temporary vLLM repo - rm -rf vllm-temp - - echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV - echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV - - # Print the benchmark commit for reference - echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" - - - name: Install SGLang - working-directory: sglang-benchmarks + - name: Setup SGLang Docker Environment shell: bash run: | set -eux - # Install SGLang from source - pushd sglang - pip install -e "python[all]" + # Set SGLang Docker image + echo "SGLANG_DOCKER_IMAGE=lmsysorg/sglang:latest" >> $GITHUB_ENV - # Verify installations - python3 -c "import sglang; print('SGLang installed successfully')" + # Pull SGLang image + docker pull lmsysorg/sglang:latest - name: Setup benchmark tests env: @@ -278,6 +233,7 @@ jobs: ON_CPU=0 fi + # Use SGLang Docker image instead of vLLM image container_name=$(docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ @@ -289,15 +245,26 @@ jobs: -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ -e ON_CPU="${ON_CPU}" \ + -e CURRENT_LLM_SERVING_ENGINE \ --ipc=host \ --tty \ --detach \ --security-opt seccomp=unconfined \ - --shm-size=4g \ + --shm-size=32g \ + -p 30000:30000 \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ -w /tmp/workspace \ - "${DOCKER_IMAGE}" + "${SGLANG_DOCKER_IMAGE}" ) + + # Install vLLM client tools inside SGLang container (needed for 'vllm bench serve') + docker exec -t "${container_name}" bash -c "pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128" + + # Install additional dependencies that might be needed + docker exec -t "${container_name}" bash -c "apt-get update && apt-get install -y wget curl jq lsof" + + # Run the benchmark script inside the SGLang container docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" - name: Upload the benchmark results diff --git a/.github/workflows/sglang-v2.yml b/.github/workflows/sglang-v2.yml new file mode 100644 index 00000000..ae5e5adc --- /dev/null +++ b/.github/workflows/sglang-v2.yml @@ -0,0 +1,339 @@ +name: SGLang Benchmark + +on: + workflow_dispatch: + inputs: + sglang_branch: + description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER) + required: true + type: string + default: main + models: + description: | + A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything) + required: false + type: string + runners: + description: | + A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) + required: true + type: string + default: h100 + pull_request: + paths: + - .github/workflows/sglang-benchmark.yml + - sglang-benchmarks/** + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: ubuntu-latest + outputs: + benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install uv + uses: astral-sh/setup-uv@v6 + + - name: Set parameters + id: set-parameters + shell: bash + env: + MODELS: ${{ inputs.models || '' }} + RUNNERS: ${{ inputs.runners || 'h100' }} + run: | + set -eux + + # The generated matrix is grouped by model and runner + python .github/scripts/generate_vllm_benchmark_matrix.py \ + --benchmark-configs-dir sglang-benchmarks/benchmarks \ + --models "${MODELS}" \ + --runners "${RUNNERS}" + + benchmarks: + name: Run SGLang benchmarks + needs: set-parameters + strategy: + matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + environment: pytorch-x-vllm + permissions: + id-token: write + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install system dependencies + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y libnuma-dev numactl + + - name: Checkout SGLang repository + uses: actions/checkout@v4 + with: + repository: sgl-project/sglang + path: sglang-benchmarks/sglang + ref: ${{ inputs.sglang_branch || 'main' }} + fetch-depth: 0 + + - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true + with: + python-version: '3.12' + cache: 'pip' + + - name: Check if the device is supported + shell: bash + run: | + set -eux + + if command -v nvidia-smi; then + DEVICE_NAME=cuda + nvidia-smi + elif command -v rocm-smi; then + DEVICE_NAME=rocm + rocm-smi + else + DEVICE_NAME=cpu + lscpu + fi + echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV + + - name: Set GPU name and type + working-directory: sglang-benchmarks + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "cuda" ]]; then + DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') + elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") + fi + echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV + + - name: Install dependencies + shell: bash + run: | + set -eux + + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt \ + --extra-index-url https://download.pytorch.org/whl/cu128 + pip install flashinfer-python + fi + + - name: Set Docker registry + shell: bash + run: | + set -eux + + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + DOCKER_IMAGE_SUFFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_SUFFIX=-cpu + fi + echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV + + - name: Setup CUDA GPU_FLAG for docker run + if: env.DEVICE_NAME == 'cuda' + run: | + echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + - name: Setup ROCm + if: env.DEVICE_NAME == 'rocm' + uses: pytorch/pytorch/./.github/actions/setup-rocm@main + + - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container + run: | + echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" + + - name: Authenticate with AWS + # Only need for DGX hosts + if: contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only + role-duration-seconds: 18000 + aws-region: us-east-1 + + - name: Login to public.ecr.aws + # Only need for DGX hosts + if: contains(env.DEVICE_TYPE, 'B200') + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 + with: + registry-type: public + + - name: Check for latest vLLM commit with Docker image + working-directory: sglang-benchmarks + env: + HEAD_BRANCH: main + HEAD_SHA: '' + MODELS: ${{ matrix.models }} + run: | + set -eux + + # Clone vLLM repository to get the latest commit + git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp + pushd vllm-temp + + # Looking back the latest 100 commits is enough + for i in {0..99} + do + # Check if the image is there, if it doesn't then check an older one + # because the commit is too recent + HEAD_SHA=$(git rev-parse --verify HEAD~${i}) + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" + + # No Docker image available yet because the commit is too recent + if ! docker manifest inspect "${DOCKER_IMAGE}"; then + continue + fi + + echo "Found vLLM commit ${HEAD_SHA} with available Docker image" + break + done + popd + + # Clean up temporary vLLM repo + rm -rf vllm-temp + + echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV + echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV + + # Print the benchmark commit for reference + echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + + - name: Install SGLang + working-directory: sglang-benchmarks + shell: bash + run: | + set -eux + + # Install SGLang from source + pushd sglang + pip install -e "python[all]" + + # Verify installations + python3 -c "import sglang; print('SGLang installed successfully')" + + - name: Setup benchmark tests + env: + MODELS: ${{ matrix.models }} + run: | + set -eux + + # Create benchmarks directory structure + mkdir -p sglang-benchmarks/benchmarks/results + mkdir -p sglang-benchmarks/benchmarks/tests + + # Set the list of benchmarks we want to cover in this runner + python3 .github/scripts/setup_vllm_benchmark.py \ + --from-benchmark-configs-dir sglang-benchmarks/benchmarks \ + --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \ + --models "${MODELS}" \ + --device "${DEVICE_NAME}" + + ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found" + find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" + + - name: Run SGLang benchmark + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + MODELS: ${{ matrix.models }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_REGION: us-east-1 + ENGINE_VERSION: v1 + SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + run: | + set -eux + + # Set environment variables for SGLang + export CURRENT_LLM_SERVING_ENGINE=sglang + + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 + else + ON_CPU=0 + fi + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ + -e SCCACHE_BUCKET \ + -e SCCACHE_REGION \ + -e DEVICE_NAME \ + -e DEVICE_TYPE \ + -e HF_TOKEN \ + -e ENGINE_VERSION \ + -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${ON_CPU}" \ + --ipc=host \ + --tty \ + --detach \ + --security-opt seccomp=unconfined \ + --shm-size=4g \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" + + - name: Upload the benchmark results + if: always() + env: + BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results + MODELS: ${{ matrix.models }} + run: | + set -eux + + sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true + ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found" + + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") + SANITIZED_MODELS="${MODELS//\//_}" + + # Create results summary + if [ -d "${BENCHMARK_RESULTS}" ]; then + echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY + echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY + echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY + echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY + find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY + fi + + echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV + echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV + + # Keep a copy of the benchmark results on GitHub for reference + - uses: actions/upload-artifact@v4 + if: always() + with: + name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} + path: sglang-benchmarks/benchmarks/results + retention-days: 30 From 0d0379cd433405c35076e78712cc584aee5b73b7 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Sun, 24 Aug 2025 23:33:35 -0700 Subject: [PATCH 29/57] test different approach - 1 --- .github/workflows/sglang-benchmark.yml | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 3dd88bdb..62f2a1db 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -258,12 +258,25 @@ jobs: "${SGLANG_DOCKER_IMAGE}" ) - # Install vLLM client tools inside SGLang container (needed for 'vllm bench serve') - docker exec -t "${container_name}" bash -c "pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128" - - # Install additional dependencies that might be needed - docker exec -t "${container_name}" bash -c "apt-get update && apt-get install -y wget curl jq lsof" - + # Install system dependencies and build tools + docker exec -t "${container_name}" bash -c " + apt-get update && + apt-get install -y wget curl jq lsof git build-essential python3-dev && + pip install uv + " + + # Check current PyTorch version + docker exec -t "${container_name}" bash -c "python3 -c 'import torch; print(f\"SGLang container PyTorch: {torch.__version__}\")'" + + # Clone vLLM and build with existing PyTorch + docker exec -t "${container_name}" bash -c " + cd /tmp && + git clone https://github.com/vllm-project/vllm.git && + cd vllm && + python use_existing_torch.py && + uv pip install -r requirements/build.txt && + uv pip install --no-build-isolation -e . + " # Run the benchmark script inside the SGLang container docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" From 3b99ff448f37badc2348b80d6f4213c355421b16 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 00:12:46 -0700 Subject: [PATCH 30/57] remove uv and use pip --- .github/workflows/sglang-benchmark.yml | 6 +- .github/workflows/sglang-v2.yml | 339 ------------------------- 2 files changed, 3 insertions(+), 342 deletions(-) delete mode 100644 .github/workflows/sglang-v2.yml diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 62f2a1db..40c23c28 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -274,10 +274,10 @@ jobs: git clone https://github.com/vllm-project/vllm.git && cd vllm && python use_existing_torch.py && - uv pip install -r requirements/build.txt && - uv pip install --no-build-isolation -e . + pip install -r requirements/build.txt && + pip install --no-build-isolation -e . " - # Run the benchmark script inside the SGLang container + # Run the benchmark script inside the SGLang container to run the benchmarks docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" - name: Upload the benchmark results diff --git a/.github/workflows/sglang-v2.yml b/.github/workflows/sglang-v2.yml deleted file mode 100644 index ae5e5adc..00000000 --- a/.github/workflows/sglang-v2.yml +++ /dev/null @@ -1,339 +0,0 @@ -name: SGLang Benchmark - -on: - workflow_dispatch: - inputs: - sglang_branch: - description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER) - required: true - type: string - default: main - models: - description: | - A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything) - required: false - type: string - runners: - description: | - A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything) - required: true - type: string - default: h100 - pull_request: - paths: - - .github/workflows/sglang-benchmark.yml - - sglang-benchmarks/** - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true - -jobs: - set-parameters: - runs-on: ubuntu-latest - outputs: - benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install uv - uses: astral-sh/setup-uv@v6 - - - name: Set parameters - id: set-parameters - shell: bash - env: - MODELS: ${{ inputs.models || '' }} - RUNNERS: ${{ inputs.runners || 'h100' }} - run: | - set -eux - - # The generated matrix is grouped by model and runner - python .github/scripts/generate_vllm_benchmark_matrix.py \ - --benchmark-configs-dir sglang-benchmarks/benchmarks \ - --models "${MODELS}" \ - --runners "${RUNNERS}" - - benchmarks: - name: Run SGLang benchmarks - needs: set-parameters - strategy: - matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - environment: pytorch-x-vllm - permissions: - id-token: write - contents: read - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install system dependencies - shell: bash - run: | - sudo apt-get update - sudo apt-get install -y libnuma-dev numactl - - - name: Checkout SGLang repository - uses: actions/checkout@v4 - with: - repository: sgl-project/sglang - path: sglang-benchmarks/sglang - ref: ${{ inputs.sglang_branch || 'main' }} - fetch-depth: 0 - - - uses: actions/setup-python@v5 - # Amazon Linux fails on this step - continue-on-error: true - with: - python-version: '3.12' - cache: 'pip' - - - name: Check if the device is supported - shell: bash - run: | - set -eux - - if command -v nvidia-smi; then - DEVICE_NAME=cuda - nvidia-smi - elif command -v rocm-smi; then - DEVICE_NAME=rocm - rocm-smi - else - DEVICE_NAME=cpu - lscpu - fi - echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV - - - name: Set GPU name and type - working-directory: sglang-benchmarks - shell: bash - run: | - set -eux - - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then - DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) - elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") - fi - echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV - - - name: Install dependencies - shell: bash - run: | - set -eux - - if [[ "${DEVICE_NAME}" == "rocm" ]]; then - pip install -r .github/scripts/requirements.txt \ - --extra-index-url https://download.pytorch.org/whl/rocm6.3 - else - pip install -r .github/scripts/requirements.txt \ - --extra-index-url https://download.pytorch.org/whl/cu128 - pip install flashinfer-python - fi - - - name: Set Docker registry - shell: bash - run: | - set -eux - - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - DOCKER_IMAGE_SUFFIX="" - if [[ "${DEVICE_NAME}" == "rocm" ]]; then - DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci - elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DOCKER_IMAGE_SUFFIX=-cpu - fi - echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV - echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - - - name: Setup CUDA GPU_FLAG for docker run - if: env.DEVICE_NAME == 'cuda' - run: | - echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" - - - name: Setup ROCm - if: env.DEVICE_NAME == 'rocm' - uses: pytorch/pytorch/./.github/actions/setup-rocm@main - - - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container - run: | - echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" - - - name: Authenticate with AWS - # Only need for DGX hosts - if: contains(env.DEVICE_TYPE, 'B200') - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only - role-duration-seconds: 18000 - aws-region: us-east-1 - - - name: Login to public.ecr.aws - # Only need for DGX hosts - if: contains(env.DEVICE_TYPE, 'B200') - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - with: - registry-type: public - - - name: Check for latest vLLM commit with Docker image - working-directory: sglang-benchmarks - env: - HEAD_BRANCH: main - HEAD_SHA: '' - MODELS: ${{ matrix.models }} - run: | - set -eux - - # Clone vLLM repository to get the latest commit - git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp - pushd vllm-temp - - # Looking back the latest 100 commits is enough - for i in {0..99} - do - # Check if the image is there, if it doesn't then check an older one - # because the commit is too recent - HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" - - # No Docker image available yet because the commit is too recent - if ! docker manifest inspect "${DOCKER_IMAGE}"; then - continue - fi - - echo "Found vLLM commit ${HEAD_SHA} with available Docker image" - break - done - popd - - # Clean up temporary vLLM repo - rm -rf vllm-temp - - echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV - echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV - - # Print the benchmark commit for reference - echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" - - - name: Install SGLang - working-directory: sglang-benchmarks - shell: bash - run: | - set -eux - - # Install SGLang from source - pushd sglang - pip install -e "python[all]" - - # Verify installations - python3 -c "import sglang; print('SGLang installed successfully')" - - - name: Setup benchmark tests - env: - MODELS: ${{ matrix.models }} - run: | - set -eux - - # Create benchmarks directory structure - mkdir -p sglang-benchmarks/benchmarks/results - mkdir -p sglang-benchmarks/benchmarks/tests - - # Set the list of benchmarks we want to cover in this runner - python3 .github/scripts/setup_vllm_benchmark.py \ - --from-benchmark-configs-dir sglang-benchmarks/benchmarks \ - --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \ - --models "${MODELS}" \ - --device "${DEVICE_NAME}" - - ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found" - find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" - - - name: Run SGLang benchmark - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - MODELS: ${{ matrix.models }} - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - SCCACHE_REGION: us-east-1 - ENGINE_VERSION: v1 - SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - run: | - set -eux - - # Set environment variables for SGLang - export CURRENT_LLM_SERVING_ENGINE=sglang - - if [[ "${DEVICE_NAME}" == "cpu" ]]; then - ON_CPU=1 - else - ON_CPU=0 - fi - - container_name=$(docker run \ - ${GPU_FLAG:-} \ - ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ - -e SCCACHE_BUCKET \ - -e SCCACHE_REGION \ - -e DEVICE_NAME \ - -e DEVICE_TYPE \ - -e HF_TOKEN \ - -e ENGINE_VERSION \ - -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - -e ON_CPU="${ON_CPU}" \ - --ipc=host \ - --tty \ - --detach \ - --security-opt seccomp=unconfined \ - --shm-size=4g \ - -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -w /tmp/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" - - - name: Upload the benchmark results - if: always() - env: - BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results - MODELS: ${{ matrix.models }} - run: | - set -eux - - sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true - ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found" - - SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") - SANITIZED_MODELS="${MODELS//\//_}" - - # Create results summary - if [ -d "${BENCHMARK_RESULTS}" ]; then - echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY - echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY - echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY - echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY - find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY - else - echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY - fi - - echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV - echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV - - # Keep a copy of the benchmark results on GitHub for reference - - uses: actions/upload-artifact@v4 - if: always() - with: - name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} - path: sglang-benchmarks/benchmarks/results - retention-days: 30 From 8fc7488005b8ef87109e35d1f59b76c2b5e6153f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 08:38:39 -0700 Subject: [PATCH 31/57] try different approach --- .github/workflows/sglang-benchmark.yml | 89 ++++++++++++++++---------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 40c23c28..c0f8b654 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -41,6 +41,9 @@ jobs: with: python-version: '3.12' + - name: Install uv + uses: astral-sh/setup-uv@v6 + - name: Set parameters id: set-parameters shell: bash @@ -135,7 +138,6 @@ jobs: else pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/cu128 - pip install flashinfer-python fi - name: Set Docker registry @@ -182,16 +184,58 @@ jobs: with: registry-type: public - - name: Setup SGLang Docker Environment - shell: bash + - name: Check for latest vLLM commit with Docker image + working-directory: sglang-benchmarks + env: + HEAD_BRANCH: main + HEAD_SHA: '' + MODELS: ${{ matrix.models }} run: | set -eux - # Set SGLang Docker image - echo "SGLANG_DOCKER_IMAGE=lmsysorg/sglang:latest" >> $GITHUB_ENV + # Clone vLLM repository to get the latest commit + git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp + pushd vllm-temp + + # Looking back the latest 100 commits is enough + for i in {0..99} + do + # Check if the image is there, if it doesn't then check an older one + # because the commit is too recent + HEAD_SHA=$(git rev-parse --verify HEAD~${i}) + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" + + # No Docker image available yet because the commit is too recent + if ! docker manifest inspect "${DOCKER_IMAGE}"; then + continue + fi - # Pull SGLang image - docker pull lmsysorg/sglang:latest + echo "Found vLLM commit ${HEAD_SHA} with available Docker image" + break + done + popd + + # Clean up temporary vLLM repo + rm -rf vllm-temp + + echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV + echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV + + # Print the benchmark commit for reference + echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + + # - name: Install SGLang + # working-directory: sglang-benchmarks + # shell: bash + # run: | + # set -eux + + # # Install SGLang from source + # pushd sglang + # pip install -e "python[all]" + + # # Verify installations + # python3 -c "import sglang; print('SGLang installed successfully')" - name: Setup benchmark tests env: @@ -233,7 +277,6 @@ jobs: ON_CPU=0 fi - # Use SGLang Docker image instead of vLLM image container_name=$(docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ @@ -245,40 +288,16 @@ jobs: -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ -e ON_CPU="${ON_CPU}" \ - -e CURRENT_LLM_SERVING_ENGINE \ --ipc=host \ --tty \ --detach \ --security-opt seccomp=unconfined \ - --shm-size=32g \ - -p 30000:30000 \ + --shm-size=4g \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ -w /tmp/workspace \ - "${SGLANG_DOCKER_IMAGE}" + "${DOCKER_IMAGE}" ) - - # Install system dependencies and build tools - docker exec -t "${container_name}" bash -c " - apt-get update && - apt-get install -y wget curl jq lsof git build-essential python3-dev && - pip install uv - " - - # Check current PyTorch version - docker exec -t "${container_name}" bash -c "python3 -c 'import torch; print(f\"SGLang container PyTorch: {torch.__version__}\")'" - - # Clone vLLM and build with existing PyTorch - docker exec -t "${container_name}" bash -c " - cd /tmp && - git clone https://github.com/vllm-project/vllm.git && - cd vllm && - python use_existing_torch.py && - pip install -r requirements/build.txt && - pip install --no-build-isolation -e . - " - # Run the benchmark script inside the SGLang container to run the benchmarks - docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" + docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" - name: Upload the benchmark results if: always() From 14d33f934cf4184910b080bfdef69108ec43a46e Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 08:55:06 -0700 Subject: [PATCH 32/57] remove uv and use pip --- .github/workflows/sglang-benchmark.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index c0f8b654..e8c076f7 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -298,6 +298,26 @@ jobs: "${DOCKER_IMAGE}" ) docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" + docker exec -t "${container_name}" bash -c " + # Install system dependencies + apt-get update && + apt-get install -y wget curl jq lsof git build-essential python3-dev && + + # Check current PyTorch version + python3 -c 'import torch; print(f\"vLLM container PyTorch: {torch.__version__}\")' && + + # Clone SGLang repository + cd /tmp && + git clone https://github.com/sgl-project/sglang.git && + cd sglang && + + # Install SGLang with existing PyTorch + pip install -e 'python[all]' && + + # Navigate to workspace and run benchmarks + cd /tmp/workspace/sglang-benchmarks/benchmarks && + bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + " - name: Upload the benchmark results if: always() From 017e2528d65f25bce54614cc92a155432fd73171 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 08:55:28 -0700 Subject: [PATCH 33/57] remove uv and use pip --- .github/workflows/sglang-benchmark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index e8c076f7..e486b973 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -297,7 +297,6 @@ jobs: -w /tmp/workspace \ "${DOCKER_IMAGE}" ) - docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh" docker exec -t "${container_name}" bash -c " # Install system dependencies apt-get update && From cac7fc1f2a66737de2397452787d658f4aa7236f Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 11:10:54 -0700 Subject: [PATCH 34/57] create diff venvs --- .../run-sglang-performance-benchmarks.sh | 14 ++ .github/workflows/sglang-benchmark.yml | 123 ++---------------- 2 files changed, 26 insertions(+), 111 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index e4f49419..b5511cfb 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -236,6 +236,15 @@ run_serving_tests() { continue fi + # Create a new uv environment for vllm client (once per test case) + echo "Creating new uv environment for vllm client..." + uv venv vllm_client_env + + # Activate the environment and install vllm + echo "Installing vllm in the new environment..." + source vllm_client_env/bin/activate + pip install vllm + # iterate over different QPS for qps in $qps_list; do # remove the surrounding single quote from qps @@ -261,6 +270,7 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" + # Run the vllm bench serve command in the activated environment bash -c "$client_command" # record the benchmarking commands @@ -277,6 +287,10 @@ run_serving_tests() { done + # Deactivate and clean up the environment after all QPS tests + deactivate + rm -rf vllm_client_env + # clean up kill -9 $server_pid kill_gpu_processes diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index e486b973..8c8a369b 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -95,6 +95,9 @@ jobs: python-version: '3.12' cache: 'pip' + - name: Install uv + uses: astral-sh/setup-uv@v6 + - name: Check if the device is supported shell: bash run: | @@ -140,21 +143,6 @@ jobs: --extra-index-url https://download.pytorch.org/whl/cu128 fi - - name: Set Docker registry - shell: bash - run: | - set -eux - - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - DOCKER_IMAGE_SUFFIX="" - if [[ "${DEVICE_NAME}" == "rocm" ]]; then - DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci - elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DOCKER_IMAGE_SUFFIX=-cpu - fi - echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV - echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - - name: Setup CUDA GPU_FLAG for docker run if: env.DEVICE_NAME == 'cuda' run: | @@ -184,58 +172,18 @@ jobs: with: registry-type: public - - name: Check for latest vLLM commit with Docker image + - name: Install SGLang working-directory: sglang-benchmarks - env: - HEAD_BRANCH: main - HEAD_SHA: '' - MODELS: ${{ matrix.models }} + shell: bash run: | set -eux - # Clone vLLM repository to get the latest commit - git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp - pushd vllm-temp - - # Looking back the latest 100 commits is enough - for i in {0..99} - do - # Check if the image is there, if it doesn't then check an older one - # because the commit is too recent - HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" - - # No Docker image available yet because the commit is too recent - if ! docker manifest inspect "${DOCKER_IMAGE}"; then - continue - fi - - echo "Found vLLM commit ${HEAD_SHA} with available Docker image" - break - done - popd - - # Clean up temporary vLLM repo - rm -rf vllm-temp - - echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV - echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV - - # Print the benchmark commit for reference - echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" + # Install SGLang from source + pushd sglang + pip install -e "python[all]" - # - name: Install SGLang - # working-directory: sglang-benchmarks - # shell: bash - # run: | - # set -eux - - # # Install SGLang from source - # pushd sglang - # pip install -e "python[all]" - - # # Verify installations - # python3 -c "import sglang; print('SGLang installed successfully')" + # Verify installations + python3 -c "import sglang; print('SGLang installed successfully')" - name: Setup benchmark tests env: @@ -260,10 +208,6 @@ jobs: - name: Run SGLang benchmark env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - MODELS: ${{ matrix.models }} - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - SCCACHE_REGION: us-east-1 - ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 run: | set -eux @@ -271,51 +215,8 @@ jobs: # Set environment variables for SGLang export CURRENT_LLM_SERVING_ENGINE=sglang - if [[ "${DEVICE_NAME}" == "cpu" ]]; then - ON_CPU=1 - else - ON_CPU=0 - fi - - container_name=$(docker run \ - ${GPU_FLAG:-} \ - ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ - -e SCCACHE_BUCKET \ - -e SCCACHE_REGION \ - -e DEVICE_NAME \ - -e DEVICE_TYPE \ - -e HF_TOKEN \ - -e ENGINE_VERSION \ - -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - -e ON_CPU="${ON_CPU}" \ - --ipc=host \ - --tty \ - --detach \ - --security-opt seccomp=unconfined \ - --shm-size=4g \ - -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -w /tmp/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c " - # Install system dependencies - apt-get update && - apt-get install -y wget curl jq lsof git build-essential python3-dev && - - # Check current PyTorch version - python3 -c 'import torch; print(f\"vLLM container PyTorch: {torch.__version__}\")' && - - # Clone SGLang repository - cd /tmp && - git clone https://github.com/sgl-project/sglang.git && - cd sglang && - - # Install SGLang with existing PyTorch - pip install -e 'python[all]' && - - # Navigate to workspace and run benchmarks - cd /tmp/workspace/sglang-benchmarks/benchmarks && - bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + cd sglang-benchmarks/benchmarks && + bash ../../.github/scripts/run-sglang-performance-benchmarks.sh " - name: Upload the benchmark results From 90f049398df9636dbc68db9a83829542a544ae15 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 11:44:40 -0700 Subject: [PATCH 35/57] update arguments --- .github/scripts/run-sglang-performance-benchmarks.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index b5511cfb..c88a50d4 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -257,14 +257,16 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps echo "new test name $new_test_name" - # Bench serving command + # Bench serving command with proper parameters for connecting to external server client_command="vllm bench serve \ + --port 30000 \ + --model $client_model \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ - --model $client_model \ --request-rate $qps \ - --port 30000 \ - --output-file $RESULTS_FOLDER/${new_test_name}.json \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ $client_args" echo "Running test case $test_name with qps $qps" From 5160576437a9e8d3e391233e32f5330894773876 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 12:35:51 -0700 Subject: [PATCH 36/57] added max concurrency --- .../run-sglang-performance-benchmarks.sh | 72 ++++++++++--------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index c88a50d4..962e1f9e 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -190,6 +190,13 @@ run_serving_tests() { qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') + if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" + fi + max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') + echo "Running over max concurrency list $max_concurrency_list" # Extract only specific SGLang server parameters model_path=$(echo "$server_params" | jq -r '.model_path // .model') @@ -254,39 +261,38 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps - echo "new test name $new_test_name" - - # Bench serving command with proper parameters for connecting to external server - client_command="vllm bench serve \ - --port 30000 \ - --model $client_model \ - --dataset-name sharegpt \ - --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ - --request-rate $qps \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - $client_args" - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - # Run the vllm bench serve command in the activated environment - bash -c "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - + for max_concurrency in $max_concurrency_list; do + new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency + echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --max-concurrency $max_concurrency \ + --metadata "tensor_parallel_size=$tp" \ + --port 30000 \ + $client_args " + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + done done # Deactivate and clean up the environment after all QPS tests From 110929b4c9b23f5ae7efa9f7dda073f0561bfb8b Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 14:29:10 -0700 Subject: [PATCH 37/57] add virtual env for sglang as well --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- .github/workflows/sglang-benchmark.yml | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 962e1f9e..506e768c 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -224,7 +224,7 @@ run_serving_tests() { continue fi - server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" + server_command="source sglang_env/bin/activate && python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" # run the server echo "Running test case $test_name" diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 8c8a369b..31dfdfa5 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -172,12 +172,16 @@ jobs: with: registry-type: public - - name: Install SGLang + - name: Install SGLang in virtual environment working-directory: sglang-benchmarks shell: bash run: | set -eux + # Create virtual environment for SGLang + uv venv sglang_env + source sglang_env/bin/activate + # Install SGLang from source pushd sglang pip install -e "python[all]" @@ -185,6 +189,9 @@ jobs: # Verify installations python3 -c "import sglang; print('SGLang installed successfully')" + # Deactivate for now - will be activated in the benchmark script + deactivate + - name: Setup benchmark tests env: MODELS: ${{ matrix.models }} @@ -212,9 +219,6 @@ jobs: run: | set -eux - # Set environment variables for SGLang - export CURRENT_LLM_SERVING_ENGINE=sglang - cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh " From 33f9cbd11c07d579b3984ada4bf2ce29d13ee5ee Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 14:57:39 -0700 Subject: [PATCH 38/57] test --- .../run-sglang-performance-benchmarks.sh | 70 ++++++++----------- .github/workflows/sglang-benchmark.yml | 9 +-- 2 files changed, 31 insertions(+), 48 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 506e768c..aa5dcaf4 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -190,13 +190,6 @@ run_serving_tests() { qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" - max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') - if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then - num_prompts=$(echo "$client_params" | jq -r '.num_prompts') - max_concurrency_list="[$num_prompts]" - fi - max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') - echo "Running over max concurrency list $max_concurrency_list" # Extract only specific SGLang server parameters model_path=$(echo "$server_params" | jq -r '.model_path // .model') @@ -224,7 +217,7 @@ run_serving_tests() { continue fi - server_command="source sglang_env/bin/activate && python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" + server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" # run the server echo "Running test case $test_name" @@ -261,38 +254,35 @@ run_serving_tests() { echo "now qps is $qps" fi - for max_concurrency in $max_concurrency_list; do - new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency - echo " new test name $new_test_name" - # pass the tensor parallel size to the client so that it can be displayed - # on the benchmark dashboard - client_command="vllm bench serve \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --max-concurrency $max_concurrency \ - --metadata "tensor_parallel_size=$tp" \ - --port 30000 \ - $client_args " - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - bash -c "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - done + new_test_name=$test_name"_qps_"$qps"_concurrency_" + echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --metadata "tensor_parallel_size=$tp" \ + --port 30000 \ + $client_args " + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" done # Deactivate and clean up the environment after all QPS tests diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 31dfdfa5..b1566dd2 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -172,16 +172,12 @@ jobs: with: registry-type: public - - name: Install SGLang in virtual environment + - name: Install SGLang working-directory: sglang-benchmarks shell: bash run: | set -eux - # Create virtual environment for SGLang - uv venv sglang_env - source sglang_env/bin/activate - # Install SGLang from source pushd sglang pip install -e "python[all]" @@ -189,9 +185,6 @@ jobs: # Verify installations python3 -c "import sglang; print('SGLang installed successfully')" - # Deactivate for now - will be activated in the benchmark script - deactivate - - name: Setup benchmark tests env: MODELS: ${{ matrix.models }} From 1bb0f3442f228746a46b318e3e277457db596a97 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 15:38:02 -0700 Subject: [PATCH 39/57] trying with env variables --- .github/workflows/sglang-benchmark.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index b1566dd2..80da7ff1 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -175,6 +175,10 @@ jobs: - name: Install SGLang working-directory: sglang-benchmarks shell: bash + env: + TORCH_COMPILE_DISABLE: "1" + TORCHDYNAMO_DISABLE: "1" + TRITON_DISABLE_LINE_INFO: "1" run: | set -eux @@ -209,6 +213,9 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + CUDA_VISIBLE_DEVICES: "0" + TORCH_COMPILE_DISABLE: "1" + TORCHDYNAMO_DISABLE: "1" run: | set -eux From fc897adea79587ae9bb31dc45b99c28123e2892b Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 16:02:37 -0700 Subject: [PATCH 40/57] final touches --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- .github/workflows/sglang-benchmark.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index aa5dcaf4..c0203a15 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -254,7 +254,7 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps"_concurrency_" + new_test_name=$test_name"_qps_"$qps" echo " new test name $new_test_name" # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 80da7ff1..6127d56d 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -221,7 +221,6 @@ jobs: cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - " - name: Upload the benchmark results if: always() From 747817dc7fd46e75d0cecb3da452b8f5d28d9d34 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 16:14:58 -0700 Subject: [PATCH 41/57] fix extra character --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index c0203a15..1de0312f 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -254,7 +254,7 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps" + new_test_name=$test_name"_qps_"$qps echo " new test name $new_test_name" # pass the tensor parallel size to the client so that it can be displayed # on the benchmark dashboard From 0fc1017e38b76393a1097fe33184cbd4db66b705 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 16:50:04 -0700 Subject: [PATCH 42/57] cleanup and adding more tests --- .github/workflows/sglang-benchmark.yml | 4 ---- .../benchmarks/cuda/serving-tests.json | 22 ++++++++++++++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 6127d56d..00490413 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -152,10 +152,6 @@ jobs: if: env.DEVICE_NAME == 'rocm' uses: pytorch/pytorch/./.github/actions/setup-rocm@main - - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container - run: | - echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" - - name: Authenticate with AWS # Only need for DGX hosts if: contains(env.DEVICE_TYPE, 'B200') diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json index e2c30eca..3b3981dd 100644 --- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json @@ -1,12 +1,13 @@ [ { "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4], + "qps_list": [1, 4, 16, "inf"], "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", + "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -16,5 +17,24 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] From 545c19b9e6d6bd8b3db94a3746a282b6f471cb7d Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 16:56:35 -0700 Subject: [PATCH 43/57] removing not needed files and tests --- .../run-sglang-performance-benchmarks.sh | 13 +- .../benchmarks/cuda/genai-perf-tests.json | 22 --- .../benchmarks/cuda/nightly-tests.json | 161 ------------------ 3 files changed, 2 insertions(+), 194 deletions(-) delete mode 100644 sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json delete mode 100644 sglang-benchmarks/benchmarks/cuda/nightly-tests.json diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 1de0312f..39372288 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -296,14 +296,7 @@ run_serving_tests() { } main() { - local ARCH - ARCH='' - if [ "$ON_CPU" == "1" ];then - check_cpus - ARCH='-cpu' - else - check_gpus - fi + check_gpus check_hf_token # dependencies @@ -323,9 +316,7 @@ main() { BENCHMARK_ROOT=tests/ # benchmarking - look for test files in the tests/ directory - if [ -f "$BENCHMARK_ROOT/serving-tests$ARCH.json" ]; then - run_serving_tests "$BENCHMARK_ROOT/serving-tests$ARCH.json" - elif [ -f "$BENCHMARK_ROOT/serving-tests.json" ]; then + if [ -f "$BENCHMARK_ROOT/serving-tests.json" ]; then run_serving_tests "$BENCHMARK_ROOT/serving-tests.json" else echo "No serving test file found" diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json deleted file mode 100644 index ca9027e6..00000000 --- a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json +++ /dev/null @@ -1,22 +0,0 @@ -[ - { - "test_name": "llama8B_tp1_genai_perf", - "qps_list": [4,8,16,32], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tp": 1, - "port": 8000, - "num_prompts": 500, - "reuse_server": false - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "genai_perf_input_parameters": { - } - } -] diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json deleted file mode 100644 index 10bed8ab..00000000 --- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json +++ /dev/null @@ -1,161 +0,0 @@ -[ - { - "test_name": "llama8B_tp1_sharegpt", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tp": 1, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000, - "reuse_server": false - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "enable_torch_compile": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } - }, - { - "test_name": "llama8B_tp1_sonnet_512_16", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tp": 1, - "dataset_name": "sonnet", - "dataset_path": "./sonnet_4x.txt", - "num_prompts": 500, - "port": 8000, - "sonnet_input_len": 512, - "sonnet_output_len": 16, - "sonnet_prefix_len": 50, - "reuse_server": true - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "enable_torch_compile": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } - }, - { - "test_name": "llama8B_tp1_sonnet_512_256", - "qps_list": [4,8,16,32,"inf"], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "tp": 1, - "dataset_name": "sonnet", - "dataset_path": "./sonnet_4x.txt", - "num_prompts": 500, - "port": 8000, - "sonnet_input_len": 512, - "sonnet_output_len": 256, - "sonnet_prefix_len": 50, - "reuse_server": true - }, - "lmdeploy_server_parameters": { - "dtype": "bfloat16" - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "bfloat16", - "max_batch_size": 2048, - "max_input_len": 4096, - "max_seq_len": 6144, - "max_num_tokens": 16384, - "trt_llm_version": "v0.11.0" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "", - "gpu_memory_utilization": 0.9, - "max_num_seqs": 512, - "dtype": "bfloat16" - }, - "vllm_client_parameters": { - }, - "sglang_server_parameters": { - "disable_radix_cache": "", - "enable_torch_compile": "", - "dtype": "bfloat16" - }, - "sglang_client_parameters": { - } - } -] From 5b3f9f9a3e174e0a861caf90665b3808b970645c Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 19:31:54 -0700 Subject: [PATCH 44/57] try running inside docker container --- .github/workflows/sglang-benchmark.yml | 37 ++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 00490413..3b4b4941 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -205,6 +205,20 @@ jobs: ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found" find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" + # - name: Run SGLang benchmark + # env: + # HF_TOKEN: ${{ secrets.HF_TOKEN }} + # SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + # CUDA_VISIBLE_DEVICES: "0" + # TORCH_COMPILE_DISABLE: "1" + # TORCHDYNAMO_DISABLE: "1" + # run: | + # set -eux + + # cd sglang-benchmarks/benchmarks && + # bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + + - name: Run SGLang benchmark env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -215,8 +229,27 @@ jobs: run: | set -eux - cd sglang-benchmarks/benchmarks && - bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + container_name=$(docker run \ + --gpus all \ + -e NVIDIA_DRIVER_CAPABILITIES=all \ + -e HF_TOKEN \ + -e CUDA_VISIBLE_DEVICES \ + -e TORCH_COMPILE_DISABLE \ + -e TORCHDYNAMO_DISABLE \ + -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + --ipc=host \ + --shm-size=4g \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + python:3.12 + ) + + docker exec -t "${container_name}" bash -c " + cd sglang-benchmarks/benchmarks && + bash ../../.github/scripts/run-sglang-performance-benchmarks.sh + " - name: Upload the benchmark results if: always() From f8bd1c8f456661ead113839eaf51f9bc55cb7f00 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 19:44:28 -0700 Subject: [PATCH 45/57] try sglang docker image --- .github/workflows/sglang-benchmark.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 3b4b4941..42b6bad3 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -231,19 +231,19 @@ jobs: container_name=$(docker run \ --gpus all \ - -e NVIDIA_DRIVER_CAPABILITIES=all \ + --shm-size 4g \ -e HF_TOKEN \ -e CUDA_VISIBLE_DEVICES \ -e TORCH_COMPILE_DISABLE \ -e TORCHDYNAMO_DISABLE \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ --ipc=host \ - --shm-size=4g \ --tty \ --detach \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ -w /tmp/workspace \ - python:3.12 + lmsysorg/sglang:latest ) docker exec -t "${container_name}" bash -c " From c3d6657a6441bc1cadde0bad8563133189ad0637 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 20:31:23 -0700 Subject: [PATCH 46/57] try with gpu cleaning --- .../run-sglang-performance-benchmarks.sh | 1 + .github/workflows/sglang-benchmark.yml | 37 +------------------ 2 files changed, 3 insertions(+), 35 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 39372288..fbe078f8 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -126,6 +126,7 @@ kill_gpu_processes() { lsof -t -i:30000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 pgrep python | xargs -r kill -9 + pgrep VLLM | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB if command -v nvidia-smi; then diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 42b6bad3..00490413 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -205,20 +205,6 @@ jobs: ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found" find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" - # - name: Run SGLang benchmark - # env: - # HF_TOKEN: ${{ secrets.HF_TOKEN }} - # SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - # CUDA_VISIBLE_DEVICES: "0" - # TORCH_COMPILE_DISABLE: "1" - # TORCHDYNAMO_DISABLE: "1" - # run: | - # set -eux - - # cd sglang-benchmarks/benchmarks && - # bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - - - name: Run SGLang benchmark env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -229,27 +215,8 @@ jobs: run: | set -eux - container_name=$(docker run \ - --gpus all \ - --shm-size 4g \ - -e HF_TOKEN \ - -e CUDA_VISIBLE_DEVICES \ - -e TORCH_COMPILE_DISABLE \ - -e TORCHDYNAMO_DISABLE \ - -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - --ipc=host \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -w /tmp/workspace \ - lmsysorg/sglang:latest - ) - - docker exec -t "${container_name}" bash -c " - cd sglang-benchmarks/benchmarks && - bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - " + cd sglang-benchmarks/benchmarks && + bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - name: Upload the benchmark results if: always() From f877d7b83f77de3d6bd80fe0a6ffbd66cc79c349 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Mon, 25 Aug 2025 20:56:15 -0700 Subject: [PATCH 47/57] remove cuda check --- .github/workflows/sglang-benchmark.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 00490413..1cb7bf45 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -209,7 +209,6 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - CUDA_VISIBLE_DEVICES: "0" TORCH_COMPILE_DISABLE: "1" TORCHDYNAMO_DISABLE: "1" run: | From f8912f4849a2493de1ab6e8eee24d8ef1fccc0e7 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 10:23:30 -0700 Subject: [PATCH 48/57] try using vllm docker image --- .github/scripts/common_utils.py | 0 .../run-sglang-performance-benchmarks.sh | 25 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) create mode 100644 .github/scripts/common_utils.py diff --git a/.github/scripts/common_utils.py b/.github/scripts/common_utils.py new file mode 100644 index 00000000..e69de29b diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index fbe078f8..a3dfdd23 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -238,13 +238,13 @@ run_serving_tests() { fi # Create a new uv environment for vllm client (once per test case) - echo "Creating new uv environment for vllm client..." + # echo "Creating new uv environment for vllm client..." uv venv vllm_client_env - - # Activate the environment and install vllm - echo "Installing vllm in the new environment..." + # echo "Installing vllm in the new environment..." source vllm_client_env/bin/activate - pip install vllm + + echo "Pulling official vLLM Docker image..." + docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:latest # iterate over different QPS for qps in $qps_list; do @@ -257,8 +257,7 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps echo " new test name $new_test_name" - # pass the tensor parallel size to the client so that it can be displayed - # on the benchmark dashboard + client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ @@ -271,7 +270,17 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - bash -c "$client_command" + # Run vLLM client inside Docker container + docker run --rm \ + --gpus all \ + -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ + -w /tmp/workspace \ + --ipc=host \ + -e HF_TOKEN="$HF_TOKEN" \ + --shm-size=4g \ + --security-opt seccomp=unconfined \ + public.ecr.aws/q9t5s3a7/vllm-release-repo:latest \ + $client_command # record the benchmarking commands jq_output=$(jq -n \ From 936bd02311b4d71813516270416323d62b8b9864 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 10:42:25 -0700 Subject: [PATCH 49/57] check valid docker image --- .github/scripts/run-sglang-performance-benchmarks.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index a3dfdd23..4d5fb38c 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -244,7 +244,6 @@ run_serving_tests() { source vllm_client_env/bin/activate echo "Pulling official vLLM Docker image..." - docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:latest # iterate over different QPS for qps in $qps_list; do @@ -279,7 +278,7 @@ run_serving_tests() { -e HF_TOKEN="$HF_TOKEN" \ --shm-size=4g \ --security-opt seccomp=unconfined \ - public.ecr.aws/q9t5s3a7/vllm-release-repo:latest \ + vllm/vllm-openai:latest \ $client_command # record the benchmarking commands From 53d83bbfb87027f57729a3e6f693c212f60de6fe Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 11:40:42 -0700 Subject: [PATCH 50/57] removing not needed tests and back to original implementation --- .../run-sglang-performance-benchmarks.sh | 23 +++++++------------ .github/workflows/sglang-benchmark.yml | 2 -- .../benchmarks/cuda/serving-tests.json | 19 --------------- 3 files changed, 8 insertions(+), 36 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 4d5fb38c..ac381961 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -238,12 +238,13 @@ run_serving_tests() { fi # Create a new uv environment for vllm client (once per test case) - # echo "Creating new uv environment for vllm client..." + echo "Creating new uv environment for vllm client..." uv venv vllm_client_env - # echo "Installing vllm in the new environment..." - source vllm_client_env/bin/activate - echo "Pulling official vLLM Docker image..." + # Activate the environment and install vllm + echo "Installing vllm in the new environment..." + source vllm_client_env/bin/activate + pip install vllm # iterate over different QPS for qps in $qps_list; do @@ -257,6 +258,8 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard client_command="vllm bench serve \ --save-result \ --result-dir $RESULTS_FOLDER \ @@ -269,17 +272,7 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - # Run vLLM client inside Docker container - docker run --rm \ - --gpus all \ - -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ - -w /tmp/workspace \ - --ipc=host \ - -e HF_TOKEN="$HF_TOKEN" \ - --shm-size=4g \ - --security-opt seccomp=unconfined \ - vllm/vllm-openai:latest \ - $client_command + bash -c "$client_command" # record the benchmarking commands jq_output=$(jq -n \ diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 1cb7bf45..fee1fd6c 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -172,7 +172,6 @@ jobs: working-directory: sglang-benchmarks shell: bash env: - TORCH_COMPILE_DISABLE: "1" TORCHDYNAMO_DISABLE: "1" TRITON_DISABLE_LINE_INFO: "1" run: | @@ -209,7 +208,6 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - TORCH_COMPILE_DISABLE: "1" TORCHDYNAMO_DISABLE: "1" run: | set -eux diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json index 3b3981dd..e87b9212 100644 --- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json +++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json @@ -17,24 +17,5 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } - }, - { - "test_name": "serving_llama70B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "tensor_parallel_size": 4, - "swap_space": 16, - "disable_log_stats": "", - "disable_log_requests": "", - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } } ] From 9edcfaa50d8fcf5f36b6bcab640a6c4e274ce48b Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 12:03:23 -0700 Subject: [PATCH 51/57] try after removing extra env variables --- .github/scripts/common_utils.py | 0 .github/workflows/sglang-benchmark.yml | 4 ---- 2 files changed, 4 deletions(-) delete mode 100644 .github/scripts/common_utils.py diff --git a/.github/scripts/common_utils.py b/.github/scripts/common_utils.py deleted file mode 100644 index e69de29b..00000000 diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index fee1fd6c..cad98280 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -171,9 +171,6 @@ jobs: - name: Install SGLang working-directory: sglang-benchmarks shell: bash - env: - TORCHDYNAMO_DISABLE: "1" - TRITON_DISABLE_LINE_INFO: "1" run: | set -eux @@ -208,7 +205,6 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - TORCHDYNAMO_DISABLE: "1" run: | set -eux From ca9c3d85052541895ee9bd90d9030cacd63e8c00 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 12:24:11 -0700 Subject: [PATCH 52/57] adding dynamo variable --- .github/scripts/run-sglang-performance-benchmarks.sh | 3 +++ .github/workflows/sglang-benchmark.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index ac381961..130f384c 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -9,6 +9,9 @@ set -x set -o pipefail +# The helper functions and their implementations are referred from the implementation +# of the run-performance-benchmarks.sh script in the official vllm repo +# Path:- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh check_gpus() { if command -v nvidia-smi; then # check the number of GPUs and GPU type. diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index cad98280..a333d24a 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -205,6 +205,7 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + TORCHDYNAMO_DISABLE: "1" run: | set -eux From a414e4bccaa6b01b572ab8778ae36b9ad6c13972 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 13:53:00 -0700 Subject: [PATCH 53/57] run sglang in a diff venv --- .github/scripts/run-sglang-performance-benchmarks.sh | 8 +++++--- .github/workflows/sglang-benchmark.yml | 8 +++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 130f384c..e8c1478b 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -125,7 +125,7 @@ kill_processes_launched_by_current_bash() { } kill_gpu_processes() { - ps -aux + # ps -aux lsof -t -i:30000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 pgrep python | xargs -r kill -9 @@ -221,9 +221,11 @@ run_serving_tests() { continue fi - server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" + # Use SGLang environment's Python directly for complete isolation + sglang_python="../sglang_env/bin/python3" + server_command="$sglang_python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" - # run the server + # run the server in a completely separate process with its own environment echo "Running test case $test_name" echo "Server command: $server_command" bash -c "$server_command" & diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index a333d24a..417dab43 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -174,6 +174,10 @@ jobs: run: | set -eux + # Create a virtual environment for SGLang + uv venv sglang_env + source sglang_env/bin/activate + # Install SGLang from source pushd sglang pip install -e "python[all]" @@ -181,6 +185,9 @@ jobs: # Verify installations python3 -c "import sglang; print('SGLang installed successfully')" + # Deactivate the environment + deactivate + - name: Setup benchmark tests env: MODELS: ${{ matrix.models }} @@ -205,7 +212,6 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - TORCHDYNAMO_DISABLE: "1" run: | set -eux From 0b7f1cf5ed722aab95dd5ca6f4767ee371bf82e2 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 14:11:59 -0700 Subject: [PATCH 54/57] debug issue --- .github/scripts/run-sglang-performance-benchmarks.sh | 2 +- .github/workflows/sglang-benchmark.yml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index e8c1478b..6cbcb8c4 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -125,7 +125,7 @@ kill_processes_launched_by_current_bash() { } kill_gpu_processes() { - # ps -aux + ps -aux lsof -t -i:30000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 pgrep python | xargs -r kill -9 diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 417dab43..462cd380 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -181,9 +181,7 @@ jobs: # Install SGLang from source pushd sglang pip install -e "python[all]" - - # Verify installations - python3 -c "import sglang; print('SGLang installed successfully')" + popd # Deactivate the environment deactivate From 81503c4fd31c90b020fcf9b0dce49c2fd481aca1 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Tue, 26 Aug 2025 14:29:59 -0700 Subject: [PATCH 55/57] revert the changes --- .github/scripts/run-sglang-performance-benchmarks.sh | 6 ++---- .github/workflows/sglang-benchmark.yml | 10 +++------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 6cbcb8c4..130f384c 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -221,11 +221,9 @@ run_serving_tests() { continue fi - # Use SGLang environment's Python directly for complete isolation - sglang_python="../sglang_env/bin/python3" - server_command="$sglang_python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" + server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp" - # run the server in a completely separate process with its own environment + # run the server echo "Running test case $test_name" echo "Server command: $server_command" bash -c "$server_command" & diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 462cd380..a333d24a 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -174,17 +174,12 @@ jobs: run: | set -eux - # Create a virtual environment for SGLang - uv venv sglang_env - source sglang_env/bin/activate - # Install SGLang from source pushd sglang pip install -e "python[all]" - popd - # Deactivate the environment - deactivate + # Verify installations + python3 -c "import sglang; print('SGLang installed successfully')" - name: Setup benchmark tests env: @@ -210,6 +205,7 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + TORCHDYNAMO_DISABLE: "1" run: | set -eux From c1c13bae7e5841a6e1abe7029d43945d3deef23e Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 27 Aug 2025 11:01:35 -0700 Subject: [PATCH 56/57] address review comments --- .../run-sglang-performance-benchmarks.sh | 21 ------------------- .github/workflows/sglang-benchmark.yml | 3 +-- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh index 130f384c..7900becf 100644 --- a/.github/scripts/run-sglang-performance-benchmarks.sh +++ b/.github/scripts/run-sglang-performance-benchmarks.sh @@ -143,25 +143,6 @@ kill_gpu_processes() { fi } -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent - if command -v buildkite-agent >/dev/null 2>&1; then - BUILDKITE_AGENT_COMMAND="buildkite-agent" - elif [ -f /workspace/buildkite-agent ]; then - BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" - else - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - - # Use the determined command to annotate and upload artifacts - $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "sglang-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" - $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" -} - run_serving_tests() { # run serving tests using `sglang.bench_serving` command # $1: a json file specifying serving test cases @@ -345,8 +326,6 @@ main() { else echo "No JSON result files were generated." >> "$RESULTS_FOLDER/benchmark_results.md" fi - - upload_to_buildkite } main "$@" diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index a333d24a..5b6ec574 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -202,14 +202,13 @@ jobs: find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display" - name: Run SGLang benchmark + working-directory: sglang-benchmarks/benchmarks env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 TORCHDYNAMO_DISABLE: "1" run: | set -eux - - cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh - name: Upload the benchmark results From e2e6af0f4ddd96ac6391bbeaafb7908680db4586 Mon Sep 17 00:00:00 2001 From: Naman Lalit Date: Wed, 27 Aug 2025 11:39:57 -0700 Subject: [PATCH 57/57] add a todo for env variable --- .github/workflows/sglang-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml index 5b6ec574..d3bcb7d2 100644 --- a/.github/workflows/sglang-benchmark.yml +++ b/.github/workflows/sglang-benchmark.yml @@ -206,7 +206,7 @@ jobs: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - TORCHDYNAMO_DISABLE: "1" + TORCHDYNAMO_DISABLE: "1" #TODO: remove this variable in future. As of now, this is a workaround to fix cuda errors to avoid breaking the sglang server. run: | set -eux bash ../../.github/scripts/run-sglang-performance-benchmarks.sh