From 4964a592a02de0eaea0f931d26d813d1be891f17 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 14:02:23 -0700
Subject: [PATCH 01/57] Added sglang workflow files

---
 .github/workflows/sglang-benchmark.yml        | 340 ++++++++++++++++++
 .../benchmarks/cuda/latency-tests.json        |  12 +
 .../benchmarks/cuda/serving-tests.json        |  21 ++
 .../benchmarks/cuda/throughput-tests.json     |  13 +
 4 files changed, 386 insertions(+)
 create mode 100644 .github/workflows/sglang-benchmark.yml
 create mode 100644 sglang-benchmarks/benchmarks/cuda/latency-tests.json
 create mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json
 create mode 100644 sglang-benchmarks/benchmarks/cuda/throughput-tests.json

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
new file mode 100644
index 00000000..d1609425
--- /dev/null
+++ b/.github/workflows/sglang-benchmark.yml
@@ -0,0 +1,340 @@
+name: SGLang Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      vllm_branch:
+        description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request)
+        required: true
+        type: string
+        default: main
+      vllm_commit:
+        description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked)
+        required: false
+        type: string
+      sglang_branch:
+        description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER)
+        required: true
+        type: string
+        default: main
+      models:
+        description: |
+          A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything)
+        required: false
+        type: string
+      runners:
+        description: |
+          A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
+        required: true
+        type: string
+        default: h100
+  pull_request:
+    paths:
+      - .github/workflows/sglang-benchmark.yml
+      - sglang-benchmarks/**
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  set-parameters:
+    runs-on: ubuntu-latest
+    outputs:
+      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Set parameters
+        id: set-parameters
+        shell: bash
+        env:
+          MODELS: ${{ inputs.models || '' }}
+          RUNNERS: ${{ inputs.runners || '' }}
+        run: |
+          set -eux
+
+          # The generated matrix is grouped by model and runner
+          python .github/scripts/generate_vllm_benchmark_matrix.py \
+            --benchmark-configs-dir sglang-benchmarks/benchmarks \
+            --models "${MODELS}" \
+            --runners "${RUNNERS}"
+
+  benchmarks:
+    name: Run SGLang benchmarks
+    needs: set-parameters
+    strategy:
+      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    environment: pytorch-x-vllm
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Checkout vLLM repository
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: sglang-benchmarks/vllm
+          ref: ${{ inputs.vllm_branch || 'main' }}
+          fetch-depth: 0
+
+      - name: Checkout SGLang repository
+        uses: actions/checkout@v4
+        with:
+          repository: sgl-project/sglang.git
+          path: sglang-benchmarks/sglang
+          ref: ${{ inputs.sglang_branch || 'main' }}
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        # Amazon Linux fails on this step
+        continue-on-error: true
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Check if the device is supported
+        shell: bash
+        run: |
+          set -eux
+
+          if command -v nvidia-smi; then
+            DEVICE_NAME=cuda
+            nvidia-smi
+          elif command -v rocm-smi; then
+            DEVICE_NAME=rocm
+            rocm-smi
+          else
+            DEVICE_NAME=cpu
+            lscpu
+          fi
+          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
+
+      - name: Set GPU name and type
+        working-directory: sglang-benchmarks
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
+          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+          fi
+          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          else
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/cu128
+          fi
+
+      - name: Install SGLang
+        working-directory: sglang-benchmarks/sglang
+        shell: bash
+        run: |
+          set -eux
+          pip install -e "python[all]"
+
+      - name: Set Docker registry
+        shell: bash
+        env:
+          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
+        run: |
+          set -eux
+
+          # Mimic the logic from vllm ci-infra test template
+          if [[ "${HEAD_BRANCH}" == "main" ]]; then
+            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
+          else
+            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo
+          fi
+
+          DOCKER_IMAGE_SUFFIX=""
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-cpu
+          fi
+          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
+
+      - name: Authenticate with AWS
+        # Only needed for DGX hosts
+        if: contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      - name: Login to public.ecr.aws
+        # Only needed for DGX hosts
+        if: contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+        with:
+          registry-type: public
+
+      - name: Check for last benchmark commit
+        working-directory: sglang-benchmarks
+        env:
+          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
+          HEAD_SHA: ${{ inputs.vllm_commit || '' }}
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          if [[ -z "${HEAD_SHA}" ]]; then
+            pushd vllm
+            # Looking back the latest 100 commits is enough
+            for i in {0..99}
+            do
+              # Check whether the image exists; if it doesn't, fall back to an older
+              # commit, because this one is too recent to have an image yet
+              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+              # No Docker image available yet because the commit is too recent
+              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+                continue
+              fi
+
+              NOT_EXIST=0
+              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
+              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
+
+              if [[ ${NOT_EXIST} == "1" ]]; then
+                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
+                break
+              fi
+            done
+            popd
+          fi
+
+          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
+
+          # Print the benchmark commit for reference
+          echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Setup CUDA GPU_FLAG for docker run
+        if: env.DEVICE_NAME == 'cuda'
+        run: |
+          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup ROCm
+        if: env.DEVICE_NAME == 'rocm'
+        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        run: |
+          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+
+      - name: Setup benchmark tests
+        env:
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          pushd sglang-benchmarks/vllm
+          git checkout "${HEAD_SHA}"
+          rm .buildkite/nightly-benchmarks/tests/*.json || true
+          popd
+
+          # Set the list of benchmarks we want to cover in this runner
+          python3 .github/scripts/setup_vllm_benchmark.py \
+            --from-benchmark-configs-dir sglang-benchmarks/benchmarks \
+            --to-benchmark-configs-dir sglang-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
+            --models "${MODELS}" \
+            --device "${DEVICE_NAME}"
+
+          pushd sglang-benchmarks/vllm
+          ls -lah .buildkite/nightly-benchmarks/tests
+          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
+          popd
+
+      - name: Run SGLang benchmark
+        env:
+          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+          SCCACHE_REGION: us-east-1
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }}
+          # SGLang-specific environment variables
+          HF_HUB_DISABLE_XET: 1
+          NIGHTLY_BACKENDS: sglang
+          CURRENT_LLM_SERVING_ENGINE: sglang
+          ENGINE_VERSION: v1
+          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            ON_CPU=1
+          else
+            ON_CPU=0
+          fi
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
+            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e HF_TOKEN \
+            -e HF_HUB_DISABLE_XET \
+            -e NIGHTLY_BACKENDS \
+            -e CURRENT_LLM_SERVING_ENGINE \
+            -e ENGINE_VERSION \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            -e ON_CPU="${ON_CPU}" \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=4g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+
+          # Set VLLM_SOURCE_CODE inside the container and run SGLang benchmark
+          docker exec -t "${container_name}" bash -c "
+            export VLLM_SOURCE_CODE=/tmp/workspace/sglang-benchmarks/vllm
+            cd sglang-benchmarks/vllm &&
+            bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+          "
+
+      - name: Authenticate with AWS
+        # AWS CUDA runners already have access to the bucket via its runner IAM role
+        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      # Keep a copy of the benchmark results on GitHub for reference
+      - uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
+          path: vllm-benchmarks/vllm/benchmarks/results
diff --git a/sglang-benchmarks/benchmarks/cuda/latency-tests.json b/sglang-benchmarks/benchmarks/cuda/latency-tests.json
new file mode 100644
index 00000000..ace766b7
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/latency-tests.json
@@ -0,0 +1,12 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
new file mode 100644
index 00000000..e87b9212
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
@@ -0,0 +1,21 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]
diff --git a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json
new file mode 100644
index 00000000..f339ffef
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json
@@ -0,0 +1,13 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]

From e6a91f9f2e2b686718da80c76c9b96632ea27fda Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 14:28:24 -0700
Subject: [PATCH 02/57] fixing the source code location

---
 .github/workflows/sglang-benchmark.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index d1609425..c0e72907 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -283,6 +283,7 @@ jobs:
           CURRENT_LLM_SERVING_ENGINE: sglang
           ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          VLLM_SOURCE_CODE: $(pwd)
         run: |
           set -eux
 
@@ -306,6 +307,7 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
+            -e VLLM_SOURCE_CODE \
             --ipc=host \
             --tty \
             --detach \
@@ -316,10 +318,10 @@ jobs:
             "${DOCKER_IMAGE}"
           )
 
-          # Set VLLM_SOURCE_CODE inside the container and run SGLang benchmark
+          # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark
           docker exec -t "${container_name}" bash -c "
-            export VLLM_SOURCE_CODE=/tmp/workspace/sglang-benchmarks/vllm
             cd sglang-benchmarks/vllm &&
+            export VLLM_SOURCE_CODE_LOC=$(pwd)
             bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
           "
 

From 843e7cc8f4e15025ef7fc668ff6e69e451622e74 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 15:15:46 -0700
Subject: [PATCH 03/57] fix source code location

---
 .github/workflows/sglang-benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index c0e72907..96d85c72 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -283,7 +283,7 @@ jobs:
           CURRENT_LLM_SERVING_ENGINE: sglang
           ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          VLLM_SOURCE_CODE: $(pwd)
+          VLLM_SOURCE_CODE: /tmp/workspace/sglang-benchmarks/vllm
         run: |
           set -eux
 
@@ -321,7 +321,7 @@ jobs:
           # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark
           docker exec -t "${container_name}" bash -c "
             cd sglang-benchmarks/vllm &&
-            export VLLM_SOURCE_CODE_LOC=$(pwd)
+            export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm &&
             bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
           "
 

From 8d4eac45b14ccfe26c78f7d31ecec22a7d66f3da Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 16:02:21 -0700
Subject: [PATCH 04/57] add missing json files

---
 .github/workflows/sglang-benchmark.yml        |   2 +-
 .../benchmarks/cuda/genai-perf-tests.json     |  23 ++
 .../benchmarks/cuda/nightly-tests.json        | 323 ++++++++++++++++++
 3 files changed, 347 insertions(+), 1 deletion(-)
 create mode 100644 sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
 create mode 100644 sglang-benchmarks/benchmarks/cuda/nightly-tests.json

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 96d85c72..81599caa 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -339,4 +339,4 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
-          path: vllm-benchmarks/vllm/benchmarks/results
+          path: sglang-benchmarks/vllm/benchmarks/results
diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
new file mode 100644
index 00000000..a207dc93
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
new file mode 100644
index 00000000..9fe7b5b1
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
@@ -0,0 +1,323 @@
+[
+    {
+        "test_name": "llama8B_tp1_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000,
+            "reuse_server": false
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tp": 1,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "enable_torch_compile": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sharegpt",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000,
+            "reuse_server": false
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sonnet_512_16",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 16,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4_sonnet_512_256",
+        "qps_list": [4,8,16,32,"inf"],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sonnet",
+            "dataset_path": "./sonnet_4x.txt",
+            "num_prompts": 500,
+            "port": 8000,
+            "sonnet_input_len": 512,
+            "sonnet_output_len": 256,
+            "sonnet_prefix_len": 50,
+            "reuse_server": true
+        },
+        "lmdeploy_server_parameters": {
+            "dtype": "bfloat16"
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "bfloat16",
+            "max_batch_size": 2048,
+            "max_input_len": 4096,
+            "max_seq_len": 6144,
+            "max_num_tokens": 16384,
+            "trt_llm_version": "v0.11.0"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "vllm_client_parameters": {
+        },
+        "sglang_server_parameters": {
+            "disable_radix_cache": "",
+            "dtype": "bfloat16"
+        },
+        "sglang_client_parameters": {
+        }
+    }
+]

From 2230a99c0b67a4a1f2632d1621799ba3390a19e1 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 16:22:14 -0700
Subject: [PATCH 05/57] update params

---
 sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json | 1 -
 sglang-benchmarks/benchmarks/cuda/nightly-tests.json    | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
index a207dc93..ca9027e6 100644
--- a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
@@ -13,7 +13,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
index 9fe7b5b1..9bdc2dee 100644
--- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
@@ -37,7 +37,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
@@ -92,7 +91,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
@@ -147,7 +145,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
@@ -199,7 +196,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
@@ -253,7 +249,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },
@@ -307,7 +302,6 @@
             "disable_log_stats": "",
             "disable_log_requests": "",
             "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
             "max_num_seqs": 512,
             "dtype": "bfloat16"
         },

From 66d328f84e0a79e55035dc3d6aa06e374d7e0579 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 16:50:32 -0700
Subject: [PATCH 06/57] only run on h100 for now

---
 .github/workflows/sglang-benchmark.yml        |  26 ++-
 .../benchmarks/cuda/nightly-tests.json        | 156 ------------------
 2 files changed, 22 insertions(+), 160 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 81599caa..423fd2cb 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -55,7 +55,7 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || '' }}
+          RUNNERS: ${{ inputs.runners || 'h100' }}
         run: |
           set -eux
 
@@ -335,8 +335,26 @@ jobs:
           role-duration-seconds: 18000
           aws-region: us-east-1
 
-    # Keep a copy of the benchmark results on GitHub for reference
-      - uses: actions/upload-artifact@v4
+      - name: Create results summary
+        if: always()
+        run: |
+          RESULTS_DIR="sglang-benchmarks/vllm/benchmarks/results"
+          if [ -d "$RESULTS_DIR" ]; then
+            echo "## Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
+            echo "- Device: ${{ env.DEVICE_TYPE }}" >> $GITHUB_STEP_SUMMARY
+            echo "- Models: ${{ matrix.models }}" >> $GITHUB_STEP_SUMMARY
+            echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY
+            echo "" >> $GITHUB_STEP_SUMMARY
+            echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY
+            find "$RESULTS_DIR" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY
+          else
+            echo "⚠️ No benchmark results found in $RESULTS_DIR" >> $GITHUB_STEP_SUMMARY
+          fi
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        if: always()
         with:
-          name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
+          name: sglang-benchmark-results-${{ matrix.runner }}-${{ matrix.models }}
           path: sglang-benchmarks/vllm/benchmarks/results
+          retention-days: 30
diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
index 9bdc2dee..10bed8ab 100644
--- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
@@ -157,161 +157,5 @@
         },
         "sglang_client_parameters": {
         }
-    },
-    {
-        "test_name": "llama70B_tp4_sharegpt",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000,
-            "reuse_server": false
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4_sonnet_512_16",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 16,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4_sonnet_512_256",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 256,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
     }
 ]

From 5938df2d9b72da243f3b784deaf3a5cb849efe11 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 17:46:45 -0700
Subject: [PATCH 07/57] fix serving model engine to sglang

---
 .github/workflows/sglang-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 423fd2cb..72fb4960 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -279,7 +279,6 @@ jobs:
           DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }}
           # SGLang-specific environment variables
           HF_HUB_DISABLE_XET: 1
-          NIGHTLY_BACKENDS: sglang
           CURRENT_LLM_SERVING_ENGINE: sglang
           ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
@@ -302,7 +301,6 @@ jobs:
             -e DEVICE_TYPE \
             -e HF_TOKEN \
             -e HF_HUB_DISABLE_XET \
-            -e NIGHTLY_BACKENDS \
             -e CURRENT_LLM_SERVING_ENGINE \
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
@@ -318,9 +316,11 @@ jobs:
             "${DOCKER_IMAGE}"
           )
 
-          # Set VLLM_SOURCE_CODE_LOC inside the container and run SGLang benchmark
+          # Run SGLang benchmark with proper environment variables
           docker exec -t "${container_name}" bash -c "
+            touch /sgl-workspace &&
             cd sglang-benchmarks/vllm &&
+            export CURRENT_LLM_SERVING_ENGINE=sglang &&
             export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm &&
             bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
           "

From 56558ec6ba96ec96e1e2f9b5665f4abd2e0c0259 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 17:53:22 -0700
Subject: [PATCH 08/57] sanitized results section

---
 .github/workflows/sglang-benchmark.yml | 36 ++++++++++++++++++--------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 72fb4960..da79f40e 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -335,26 +335,40 @@ jobs:
           role-duration-seconds: 18000
           aws-region: us-east-1
 
-      - name: Create results summary
+      - name: Upload the benchmark results
         if: always()
+        env:
+          BENCHMARK_RESULTS: sglang-benchmarks/vllm/benchmarks/results
+          MODELS: ${{ matrix.models }}
         run: |
-          RESULTS_DIR="sglang-benchmarks/vllm/benchmarks/results"
-          if [ -d "$RESULTS_DIR" ]; then
-            echo "## Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
-            echo "- Device: ${{ env.DEVICE_TYPE }}" >> $GITHUB_STEP_SUMMARY
-            echo "- Models: ${{ matrix.models }}" >> $GITHUB_STEP_SUMMARY
+          set -eux
+
+          sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true
+          ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found"
+
+          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
+          SANITIZED_MODELS="${MODELS//\//_}"
+
+          # Create results summary
+          if [ -d "${BENCHMARK_RESULTS}" ]; then
+            echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
+            echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY
+            echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY
             echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY
             echo "" >> $GITHUB_STEP_SUMMARY
             echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY
-            find "$RESULTS_DIR" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY
+            find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY
           else
-            echo "⚠️ No benchmark results found in $RESULTS_DIR" >> $GITHUB_STEP_SUMMARY
+            echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY
           fi
 
-      - name: Upload benchmark results
-        uses: actions/upload-artifact@v4
+          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
+          echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV
+
+      # Keep a copy of the benchmark results on GitHub for reference
+      - uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: sglang-benchmark-results-${{ matrix.runner }}-${{ matrix.models }}
+          name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
           path: sglang-benchmarks/vllm/benchmarks/results
           retention-days: 30

From 0e0998ae1a88c3971e897d4e6f55ec8a509d3e6c Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 19:31:56 -0700
Subject: [PATCH 09/57] fix sglang issues

---
 .github/workflows/sglang-benchmark.yml | 31 ++++++++++++++++----------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index da79f40e..983d2a40 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -148,13 +148,6 @@ jobs:
               --extra-index-url https://download.pytorch.org/whl/cu128
           fi
 
-      - name: Install SGLang
-        working-directory: sglang-benchmarks/sglang
-        shell: bash
-        run: |
-          set -eux
-          pip install -e "python[all]"
-
       - name: Set Docker registry
         shell: bash
         env:
@@ -316,12 +309,26 @@ jobs:
             "${DOCKER_IMAGE}"
           )
 
-          # Run SGLang benchmark with proper environment variables
+          echo "container_name=${container_name}" >> $GITHUB_ENV
+
+          # Install SGLang inside the container and run benchmark
           docker exec -t "${container_name}" bash -c "
-            touch /sgl-workspace &&
-            cd sglang-benchmarks/vllm &&
-            export CURRENT_LLM_SERVING_ENGINE=sglang &&
-            export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm &&
+            set -eux
+
+            # Install SGLang inside the container
+            cd /tmp/workspace/sglang-benchmarks/sglang
+            pip install -e 'python[all]'
+
+            # (TODO: Remove this once verified)
+            python3 -c 'import sglang; print(\"SGLang installed successfully\")'
+
+            # Create SGLang workspace marker and set environment
+            touch /sgl-workspace
+            export CURRENT_LLM_SERVING_ENGINE=sglang
+            export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm
+
+            # Run the benchmark
+            cd /tmp/workspace/sglang-benchmarks/vllm
             bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
           "
 

From 5a18dc53c3380ae81fd8cdc8ffd207b262ccb621 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 20 Aug 2025 23:29:05 -0700
Subject: [PATCH 10/57] remove unused files for now

---
 .../benchmarks/cuda/latency-tests.json        | 12 -----------
 .../benchmarks/cuda/serving-tests.json        | 21 -------------------
 .../benchmarks/cuda/throughput-tests.json     | 13 ------------
 3 files changed, 46 deletions(-)
 delete mode 100644 sglang-benchmarks/benchmarks/cuda/latency-tests.json
 delete mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json
 delete mode 100644 sglang-benchmarks/benchmarks/cuda/throughput-tests.json

diff --git a/sglang-benchmarks/benchmarks/cuda/latency-tests.json b/sglang-benchmarks/benchmarks/cuda/latency-tests.json
deleted file mode 100644
index ace766b7..00000000
--- a/sglang-benchmarks/benchmarks/cuda/latency-tests.json
+++ /dev/null
@@ -1,12 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    }
-]
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
deleted file mode 100644
index e87b9212..00000000
--- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json
+++ /dev/null
@@ -1,21 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    }
-]
diff --git a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json b/sglang-benchmarks/benchmarks/cuda/throughput-tests.json
deleted file mode 100644
index f339ffef..00000000
--- a/sglang-benchmarks/benchmarks/cuda/throughput-tests.json
+++ /dev/null
@@ -1,13 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]

From 4b03ad0a5b890d7898cba7501944cc8727c3b8d7 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 19:53:48 -0700
Subject: [PATCH 11/57] updated workflow execution

---
 .../run-sglang-performance-benchmarks.sh      | 346 ++++++++++++++++++
 .github/workflows/sglang-benchmark.yml        | 194 ++--------
 .../benchmarks/cuda/serving-tests.json        |  77 ++++
 3 files changed, 449 insertions(+), 168 deletions(-)
 create mode 100644 .github/scripts/run-sglang-performance-benchmarks.sh
 create mode 100644 sglang-benchmarks/benchmarks/cuda/serving-tests.json

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
new file mode 100644
index 00000000..dc96ae0b
--- /dev/null
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# This script should be run inside the CI process
+# This script assumes that we are already inside the sglang-benchmarks/benchmarks/ directory
+# Benchmarking results will be available inside sglang-benchmarks/benchmarks/results/
+
+# Do not set -e, as some models may crash occasionally
+# and we still want to see other benchmarking results even when some models crash.
+set -x
+set -o pipefail
+
+check_gpus() {
+  # Set the globals gpu_count and gpu_type; exit 1 when no GPU is detected.
+  if command -v nvidia-smi; then
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  elif command -v amd-smi; then
+    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  fi
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  if command -v nvidia-smi; then
+    # nvidia-smi prints one name per GPU; keep the first so gpu_type is one word.
+    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk 'NR==1 {print $2}')
+  elif command -v amd-smi; then
+    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  fi
+  echo "GPU type is $gpu_type"
+}
+
+check_cpus() {
+  # Store the NUMA node count in the global numa_count and exit 1 if there is
+  # none; the device label (global gpu_type) is then fixed to "cpu".
+  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
+  [[ $numa_count -gt 0 ]] || {
+    echo "Need at least 1 NUMA to run benchmarking."
+    exit 1
+  }
+  echo "NUMA found."
+  echo $numa_count
+  declare -g gpu_type="cpu"
+  echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+  # Exit 1 unless HF_TOKEN is non-empty and begins with "hf_".
+  case "$HF_TOKEN" in
+    "")
+      echo "Error: HF_TOKEN is not set."
+      exit 1 ;;
+    hf_*) echo "HF_TOKEN is set and valid." ;;
+    *)
+      echo "Error: HF_TOKEN does not start with 'hf_'."
+      exit 1 ;;
+  esac
+}
+
+ensure_sharegpt_downloaded() {
+  # Download the ShareGPT dataset into the current directory unless it is already there.
+  local dataset=ShareGPT_V3_unfiltered_cleaned_split.json
+  if [ -f "$dataset" ]; then
+    echo "$dataset already exists."; return
+  fi
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$dataset
+}
+
+json2args() {
+  # Turn a flat JSON object into CLI flags: "--<key> <value>" pairs joined by spaces, with '_' in keys replaced by '-'.
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local flags
+  flags=$(
+    jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    ' <<<"$1"
+  )
+  echo "$flags"
+}
+
+json2envs() {
+  # Turn a flat JSON object into space-separated KEY=value pairs; a JSON null (absent section) yields an empty string.
+  # example:
+  # input: { "SGLANG_DISABLE_CUDA_GRAPH": 1 }
+  # output: SGLANG_DISABLE_CUDA_GRAPH=1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      (. // {}) | to_entries |
+      map((.key ) + "=" + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # Poll the SGLang endpoint on port 30000 once per second for up to 1200s.
+  # Returns 0 as soon as curl succeeds and 1 if the timeout elapses first.
+  if timeout 1200 bash -c '
+    until curl -s localhost:30000/v1/completions > /dev/null; do
+      sleep 1
+    done'; then return 0; else return 1; fi
+}
+
+kill_processes_launched_by_current_bash() {
+  # Force-kill direct children of this script whose command (third ps column) matches the regex in $1.
+  current_shell_pid=$$
+  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
+  if [ -z "$processes" ]; then
+    echo "No processes found matching '$1'."
+    return
+  fi
+  echo "Killing the following processes matching '$1':"
+  echo "$processes"
+  xargs kill -9 <<<"$processes"
+}
+
+kill_gpu_processes() {
+  # Kill the listener on port 30000 and every python/sglang process, then wait for GPU memory to be released.
+  lsof -t -i:30000 | xargs -r kill -9
+  pgrep python3 | xargs -r kill -9
+  pgrep python | xargs -r kill -9
+  local self_pids="$$|$BASHPID|$PPID"  # "sglang" also matches this script's own command line; spare this shell and its parent
+  pgrep -f "sglang" | grep -vxE "$self_pids" | xargs -r kill -9
+  # wait until the first reported GPU shows less than 1000 used memory (about 1GB)
+  if command -v nvidia-smi; then
+    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+      sleep 1
+    done
+  elif command -v amd-smi; then
+    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
+      sleep 1
+    done
+  fi
+}
+
+upload_to_buildkite() {
+  # Annotate the Buildkite build with $RESULTS_FOLDER/benchmark_results.md and
+  # upload everything under $RESULTS_FOLDER as artifacts. Returns 0 without
+  # uploading when buildkite-agent is neither in PATH nor at /workspace/buildkite-agent.
+  if ! command -v buildkite-agent >/dev/null 2>&1 && [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # An agent found in PATH takes precedence over the copy under /workspace.
+  if command -v buildkite-agent >/dev/null 2>&1; then
+    BUILDKITE_AGENT_COMMAND="buildkite-agent"
+  else
+    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+  fi
+  # The annotation body is read from stdin; the artifact glob is passed through quoted.
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "sglang-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
+  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
+}
+
+run_serving_tests() {
+  # For each test case in the given JSON file: launch an SGLang server, run
+  # `sglang.bench_serving` once per QPS value, then tear the server down.
+  # $1: a json file specifying serving test cases
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests. NOTE(review): as the tail of a pipeline this loop presumably runs in a subshell, so the `exit 1` below would end only the loop - confirm.
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # the test name must start with "serving_"
+    test_name=$(echo "$params" | jq -r '.test_name')
+    if [[ ! "$test_name" =~ ^serving_ ]]; then
+      echo "In serving-test.json, test_name must start with \"serving_\"."
+      exit 1
+    fi
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments (server_args and server_envs are built but not referenced again in this function)
+    server_params=$(echo "$params" | jq -r '.server_parameters')
+    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
+    client_params=$(echo "$params" | jq -r '.client_parameters')
+    server_args=$(json2args "$server_params")
+    server_envs=$(json2envs "$server_envs")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # Only these server parameters are used: model path (falls back to .model) and context length (default 4096)
+    model_path=$(echo "$server_params" | jq -r '.model_path // .model')
+    context_length=$(echo "$server_params" | jq -r '.context_length // 4096')
+
+    # check if there are enough resources to run the test (tp defaults to 1)
+    tp=$(echo "$server_params" | jq -r '.tp // 1')
+    if [ "$ON_CPU" == "1" ]; then
+      if [[ $numa_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+        continue
+      fi
+    else
+      if [[ $gpu_count -lt $tp ]]; then
+        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+        continue
+      fi
+    fi
+
+    # check if server model and client model are aligned
+    server_model="$model_path"
+    client_model=$(echo "$client_params" | jq -r '.model // .model_path')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
+      continue
+    fi
+
+    server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
+
+    # run the server in the background and remember its PID
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+    server_pid=$!
+
+    # wait until the server is alive; on timeout kill it and move on to the next test case
+    if wait_for_server; then
+      echo ""
+      echo "SGLang server is up and running."
+    else
+      echo ""
+      echo "SGLang failed to start within the timeout period."
+      kill -9 $server_pid
+      continue
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # qps values arrive shell-quoted (jq @sh), e.g. 'inf'; strip the quotes from inf
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      echo "new test name $new_test_name"
+
+      # SGLang bench_serving command; per-test client_args are appended after the fixed flags
+      client_command="python3 -m sglang.bench_serving \
+        --backend sglang \
+        --dataset-name sharegpt \
+        --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
+        --model $client_model \
+        --request-rate $qps \
+        --port 30000 \
+        --output-file $RESULTS_FOLDER/${new_test_name}.json \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+
+      # record the benchmarking commands and GPU type next to the result file
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up: kill the server, then any leftover GPU processes
+    kill -9 $server_pid
+    kill_gpu_processes
+  done
+}
+
+main() {
+    # Entry point: check hardware and HF_TOKEN, install missing tools, run the
+    # serving tests, write a markdown summary and hand it to upload_to_buildkite.
+    local ARCH=''
+    if [ "$ON_CPU" == "1" ]; then
+        check_cpus
+        ARCH='-cpu'
+    else
+        check_gpus
+    fi
+    check_hf_token
+
+    # install whichever command-line tools are missing
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which lsof) || (apt-get update && apt-get install -y lsof)
+
+    # export the first address reported by `hostname -I` and a WARNING log level
+    export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
+    export SGLANG_LOGGING_LEVEL="WARNING"
+
+    # prepare for benchmarking: dataset, results folder, test-definition folder
+    ensure_sharegpt_downloaded
+    declare -g RESULTS_FOLDER=results/
+    mkdir -p $RESULTS_FOLDER
+    BENCHMARK_ROOT=tests/
+
+    # benchmarking - the arch-specific test file wins over the generic one
+    local candidate test_file=''
+    for candidate in "$BENCHMARK_ROOT/serving-tests$ARCH.json" "$BENCHMARK_ROOT/serving-tests.json"; do
+        [ -f "$candidate" ] && { test_file=$candidate; break; }
+    done
+    if [ -n "$test_file" ]; then
+        run_serving_tests "$test_file"
+    else
+        echo "No serving test file found"
+    fi
+    # postprocess benchmarking results (NOTE(review): nothing later in this script uses these packages)
+    pip install tabulate pandas
+
+    # Create a simple markdown summary listing every JSON result file
+    {
+        echo "# SGLang Benchmark Results"
+        echo ""
+        echo "## Test Results Summary"
+        echo ""
+        if ls "$RESULTS_FOLDER"/*.json 1> /dev/null 2>&1; then
+            echo "### Generated Result Files:"
+            for file in "$RESULTS_FOLDER"/*.json; do
+                echo "- $(basename "$file")"
+            done
+        else
+            echo "No JSON result files were generated."
+        fi
+    } > "$RESULTS_FOLDER/benchmark_results.md"
+    upload_to_buildkite
+}
+
+main "$@"
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 983d2a40..0b4328fd 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -3,15 +3,6 @@ name: SGLang Benchmark
 on:
   workflow_dispatch:
     inputs:
-      vllm_branch:
-        description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request)
-        required: true
-        type: string
-        default: main
-      vllm_commit:
-        description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked)
-        required: false
-        type: string
       sglang_branch:
         description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER)
         required: true
@@ -80,14 +71,6 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Checkout vLLM repository
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          path: sglang-benchmarks/vllm
-          ref: ${{ inputs.vllm_branch || 'main' }}
-          fetch-depth: 0
-
       - name: Checkout SGLang repository
         uses: actions/checkout@v4
         with:
@@ -148,29 +131,6 @@ jobs:
               --extra-index-url https://download.pytorch.org/whl/cu128
           fi
 
-      - name: Set Docker registry
-        shell: bash
-        env:
-          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
-        run: |
-          set -eux
-
-          # Mimic the logic from vllm ci-infra test template
-          if [[ "${HEAD_BRANCH}" == "main" ]]; then
-            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
-          else
-            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo
-          fi
-
-          DOCKER_IMAGE_SUFFIX=""
-          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
-          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            DOCKER_IMAGE_SUFFIX=-cpu
-          fi
-          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
-
       - name: Authenticate with AWS
         # Only need for DGX hosts
         if: contains(env.DEVICE_TYPE, 'B200')
@@ -187,59 +147,23 @@ jobs:
         with:
           registry-type: public
 
-      - name: Check for last benchmark commit
+      - name: Install vLLM and SGLang
         working-directory: sglang-benchmarks
-        env:
-          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
-          HEAD_SHA: ${{ inputs.vllm_commit || '' }}
-          MODELS: ${{ matrix.models }}
+        shell: bash
         run: |
           set -eux
 
-          if [[ -z "${HEAD_SHA}" ]]; then
-            pushd vllm
-            # Looking back the latest 100 commits is enough
-            for i in {0..99}
-            do
-              # Check if the image is there, if it doesn't then check an older one
-              # because the commit is too recent
-              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-              # No Docker image available yet because the commit is too recent
-              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-                continue
-              fi
-
-              NOT_EXIST=0
-              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
-              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1
-
-              if [[ ${NOT_EXIST} == "1" ]]; then
-                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
-                break
-              fi
-            done
-            popd
-          fi
-
-          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
+          # Install vLLM
+          pip install vllm
 
-          # Print the benchmark commit for rereference
-          echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+          # Install SGLang from source
+          pushd sglang
+          pip install -e "python[all]"
 
-      - name: Setup CUDA GPU_FLAG for docker run
-        if: env.DEVICE_NAME == 'cuda'
-        run: |
-          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-
-      - name: Setup ROCm
-        if: env.DEVICE_NAME == 'rocm'
-        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
-
-      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
-        run: |
-          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+          # Verify installations
+          python3 -c "import vllm; print('vLLM installed successfully')"
+          python3 -c "import sglang; print('SGLang installed successfully')"
+          popd
 
       - name: Setup benchmark tests
         env:
@@ -247,105 +171,39 @@ jobs:
         run: |
           set -eux
 
-          pushd sglang-benchmarks/vllm
-          git checkout "${HEAD_SHA}"
-          rm .buildkite/nightly-benchmarks/tests/*.json || true
-          popd
+          # Create benchmarks directory structure
+          mkdir -p sglang-benchmarks/benchmarks/results
+          mkdir -p sglang-benchmarks/benchmarks/tests
 
           # Set the list of benchmarks we want to cover in this runner
           python3 .github/scripts/setup_vllm_benchmark.py \
             --from-benchmark-configs-dir sglang-benchmarks/benchmarks \
-            --to-benchmark-configs-dir sglang-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
+            --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \
             --models "${MODELS}" \
             --device "${DEVICE_NAME}"
 
-          pushd sglang-benchmarks/vllm
-          ls -lah .buildkite/nightly-benchmarks/tests
-          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
-          popd
+          ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found"
+          find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
       - name: Run SGLang benchmark
         env:
-          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-          SCCACHE_REGION: us-east-1
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }}
-          # SGLang-specific environment variables
-          HF_HUB_DISABLE_XET: 1
-          CURRENT_LLM_SERVING_ENGINE: sglang
-          ENGINE_VERSION: v1
-          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          VLLM_SOURCE_CODE: /tmp/workspace/sglang-benchmarks/vllm
+          MODELS: ${{ matrix.models }}
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
-
-          container_name=$(docker run \
-            ${GPU_FLAG:-} \
-            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-            -e SCCACHE_BUCKET \
-            -e SCCACHE_REGION \
-            -e DEVICE_NAME \
-            -e DEVICE_TYPE \
-            -e HF_TOKEN \
-            -e HF_HUB_DISABLE_XET \
-            -e CURRENT_LLM_SERVING_ENGINE \
-            -e ENGINE_VERSION \
-            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-            -e ON_CPU="${ON_CPU}" \
-            -e VLLM_SOURCE_CODE \
-            --ipc=host \
-            --tty \
-            --detach \
-            --security-opt seccomp=unconfined \
-            --shm-size=4g \
-            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-            -w /tmp/workspace \
-            "${DOCKER_IMAGE}"
-          )
-
-          echo "container_name=${container_name}" >> $GITHUB_ENV
-
-          # Install SGLang inside the container and run benchmark
-          docker exec -t "${container_name}" bash -c "
-            set -eux
-
-            # Install SGLang inside the container
-            cd /tmp/workspace/sglang-benchmarks/sglang
-            pip install -e 'python[all]'
-
-            # (TODO: Remove this once verified)
-            python3 -c 'import sglang; print(\"SGLang installed successfully\")'
-
-            # Create SGLang workspace marker and set environment
-            touch /sgl-workspace
-            export CURRENT_LLM_SERVING_ENGINE=sglang
-            export VLLM_SOURCE_CODE_LOC=/tmp/workspace/sglang-benchmarks/vllm
-
-            # Run the benchmark
-            cd /tmp/workspace/sglang-benchmarks/vllm
-            bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-          "
+          # Set environment variables for SGLang
+          export CURRENT_LLM_SERVING_ENGINE=sglang
+          export SGLANG_SOURCE_CODE_LOC=$(pwd)/sglang-benchmarks/sglang
 
-      - name: Authenticate with AWS
-        # AWS CUDA runners already have access to the bucket via its runner IAM role
-        if: env.DEVICE_NAME == 'rocm' || contains(env.DEVICE_TYPE, 'B200')
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
-          # The max duration enforced by the server side
-          role-duration-seconds: 18000
-          aws-region: us-east-1
+          # Run the SGLang benchmark script
+          cd sglang-benchmarks/benchmarks
+          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
 
       - name: Upload the benchmark results
         if: always()
         env:
-          BENCHMARK_RESULTS: sglang-benchmarks/vllm/benchmarks/results
+          BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results
           MODELS: ${{ matrix.models }}
         run: |
           set -eux
@@ -377,5 +235,5 @@ jobs:
         if: always()
         with:
           name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
-          path: sglang-benchmarks/vllm/benchmarks/results
+          path: sglang-benchmarks/benchmarks/results
           retention-days: 30
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
new file mode 100644
index 00000000..6b786de8
--- /dev/null
+++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
@@ -0,0 +1,77 @@
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "backend": "sglang",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "sglang",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "sglang",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
+        "qps_list": [2],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "speculative_config": {
+                "model": "turboderp/Qwama-0.5B-Instruct",
+                "num_speculative_tokens": 4,
+                "draft_tensor_parallel_size": 1
+            }
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "sglang",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    }
+]

From de0d0f84bf9d33e247c03016b92b7040fe807d3f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 19:54:52 -0700
Subject: [PATCH 12/57] uncomment ps -aux in kill_gpu_processes

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index dc96ae0b..e35bec1a 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -122,7 +122,7 @@ kill_processes_launched_by_current_bash() {
 }
 
 kill_gpu_processes() {
-#   ps -aux
+  ps -aux
   lsof -t -i:30000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
   pgrep python | xargs -r kill -9

From 2b4325a2d29829d4ec38d6db14cffd1791767e16 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 21:10:15 -0700
Subject: [PATCH 13/57] install vllm after sglang; drop SGLANG_SOURCE_CODE_LOC

---
 .github/workflows/sglang-benchmark.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 0b4328fd..fbbbb54d 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -153,13 +153,13 @@ jobs:
         run: |
           set -eux
 
-          # Install vLLM
-          pip install vllm
-
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
 
+          # Install vLLM
+          pip install vllm
+
           # Verify installations
           python3 -c "import vllm; print('vLLM installed successfully')"
           python3 -c "import sglang; print('SGLang installed successfully')"
@@ -194,7 +194,6 @@ jobs:
 
           # Set environment variables for SGLang
           export CURRENT_LLM_SERVING_ENGINE=sglang
-          export SGLANG_SOURCE_CODE_LOC=$(pwd)/sglang-benchmarks/sglang
 
           # Run the SGLang benchmark script
           cd sglang-benchmarks/benchmarks

From 5ca157c71c605110984d6d4e69446bb4bbf0f293 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 22:02:20 -0700
Subject: [PATCH 14/57] fix numa installation issue

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index e35bec1a..e9895fb2 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -299,7 +299,7 @@ main() {
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
-    (which lsof) || (apt-get update && apt-get install -y lsof)
+    (which lsof) || (apt-get update && apt-get install -y lsof libnuma-dev)
 
     # get the current IP address, required by SGLang bench commands
     export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')

From e40a38e75bd8efd5bb7cb74eb15291fd0b9a401f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 22:35:17 -0700
Subject: [PATCH 15/57] install libnuma-dev even when lsof is present

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index e9895fb2..9e9caf16 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -299,7 +299,8 @@ main() {
     # dependencies
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
-    (which lsof) || (apt-get update && apt-get install -y lsof libnuma-dev)
+    (which lsof) || (apt-get update && apt-get install -y lsof)
+    (apt-get install -y libnuma-dev)
 
     # get the current IP address, required by SGLang bench commands
     export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')

From 7b763ac9506a15215f582563bdfdebae3ac56782 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 22:55:12 -0700
Subject: [PATCH 16/57] fix package: install libnuma-dev in workflow, not script

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 1 -
 .github/workflows/sglang-benchmark.yml               | 4 ++++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 9e9caf16..e35bec1a 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -300,7 +300,6 @@ main() {
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
     (which lsof) || (apt-get update && apt-get install -y lsof)
-    (apt-get install -y libnuma-dev)
 
     # get the current IP address, required by SGLang bench commands
     export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index fbbbb54d..648a90af 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,6 +41,10 @@ jobs:
         with:
           python-version: '3.12'
 
+      - name: Install Packages for SGLang
+        run: |
+          sudo apt-get install -y libnuma-dev
+
       - name: Set parameters
         id: set-parameters
         shell: bash

From 79d4ccf82ecc3c7d71c82b6cdc89a1c1926c9c8f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 23:13:18 -0700
Subject: [PATCH 17/57] fix package: move libnuma-dev install step later

---
 .github/workflows/sglang-benchmark.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 648a90af..6ca63795 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,10 +41,6 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install Packages for SGLang
-        run: |
-          sudo apt-get install -y libnuma-dev
-
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -90,6 +86,10 @@ jobs:
           python-version: '3.12'
           cache: 'pip'
 
+      - name: Install Packages for SGLang
+        run: |
+          sudo apt-get install -y libnuma-dev
+
       - name: Check if the device is supported
         shell: bash
         run: |

From cd6456865fa3ff2926188677b99ac73a8bfcd767 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 23:19:06 -0700
Subject: [PATCH 18/57] fix sglang checkout repo name; move libnuma-dev step back

---
 .github/workflows/sglang-benchmark.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 6ca63795..c8936981 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,6 +41,10 @@ jobs:
         with:
           python-version: '3.12'
 
+      - name: Install Packages for SGLang
+        run: |
+          sudo apt-get install -y libnuma-dev
+
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -74,7 +78,7 @@ jobs:
       - name: Checkout SGLang repository
         uses: actions/checkout@v4
         with:
-          repository: sgl-project/sglang.git
+          repository: sgl-project/sglang
           path: sglang-benchmarks/sglang
           ref: ${{ inputs.sglang_branch || 'main' }}
           fetch-depth: 0
@@ -86,10 +90,6 @@ jobs:
           python-version: '3.12'
           cache: 'pip'
 
-      - name: Install Packages for SGLang
-        run: |
-          sudo apt-get install -y libnuma-dev
-
       - name: Check if the device is supported
         shell: bash
         run: |

From aeecd6ae03364cd1e2e6ac7dd2446ffb4290a4e3 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 23:31:57 -0700
Subject: [PATCH 19/57] fix package: also install numactl

---
 .github/workflows/sglang-benchmark.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index c8936981..3a2bb7b5 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -44,6 +44,7 @@ jobs:
       - name: Install Packages for SGLang
         run: |
           sudo apt-get install -y libnuma-dev
+          sudo apt install numactl
 
       - name: Set parameters
         id: set-parameters

From d121b3419880a2697452169ba2311581b9c62a2c Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 23:38:16 -0700
Subject: [PATCH 20/57] fix package: install numactl non-interactively

---
 .github/workflows/sglang-benchmark.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 3a2bb7b5..74b6b9d1 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -43,8 +43,7 @@ jobs:
 
       - name: Install Packages for SGLang
         run: |
-          sudo apt-get install -y libnuma-dev
-          sudo apt install numactl
+          sudo apt-get install -y libnuma-dev numactl
 
       - name: Set parameters
         id: set-parameters

From 2498860fe8eaf8857c20cb207261910c321eda5c Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Thu, 21 Aug 2025 23:57:01 -0700
Subject: [PATCH 21/57] fix package: apt-get update, install numa deps after checkout

---
 .github/workflows/sglang-benchmark.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 74b6b9d1..c82982b6 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,10 +41,6 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install Packages for SGLang
-        run: |
-          sudo apt-get install -y libnuma-dev numactl
-
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -75,6 +71,12 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Install system dependencies
+        shell: bash
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libnuma-dev numactl
+
       - name: Checkout SGLang repository
         uses: actions/checkout@v4
         with:

From b3800a2a2a70d152c724e09cdf77f5dbd3d9be41 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Fri, 22 Aug 2025 00:14:40 -0700
Subject: [PATCH 22/57] fix package: install vllm before sglang

---
 .github/workflows/sglang-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index c82982b6..060ccafc 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -159,13 +159,13 @@ jobs:
         run: |
           set -eux
 
+          # Install vLLM
+          pip install vllm
+
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
 
-          # Install vLLM
-          pip install vllm
-
           # Verify installations
           python3 -c "import vllm; print('vLLM installed successfully')"
           python3 -c "import sglang; print('SGLang installed successfully')"

From 564c0b5a970a1bdb0c61411768ebcbacad0b06d7 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Fri, 22 Aug 2025 00:39:57 -0700
Subject: [PATCH 23/57] fix process: stop killing processes matching "sglang"

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index e35bec1a..b96cb78d 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -126,7 +126,6 @@ kill_gpu_processes() {
   lsof -t -i:30000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
   pgrep python | xargs -r kill -9
-  pgrep -f "sglang" | xargs -r kill -9
 
   # wait until GPU memory usage smaller than 1GB
   if command -v nvidia-smi; then

From 2e0eb3d5aea731388a81a643d907d1878fe92d29 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Fri, 22 Aug 2025 10:49:22 -0700
Subject: [PATCH 24/57] replace sglang benchmarking command with vllm bench
 serve

---
 .../run-sglang-performance-benchmarks.sh      |  5 +-
 .../benchmarks/cuda/serving-tests.json        | 61 +------------------
 2 files changed, 4 insertions(+), 62 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index b96cb78d..e4f49419 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -248,9 +248,8 @@ run_serving_tests() {
       new_test_name=$test_name"_qps_"$qps
       echo "new test name $new_test_name"
 
-      # SGLang bench_serving command
-      client_command="python3 -m sglang.bench_serving \
-        --backend sglang \
+      # Bench serving command
+      client_command="vllm bench serve \
         --dataset-name sharegpt \
         --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
         --model $client_model \
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
index 6b786de8..e2c30eca 100644
--- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
@@ -1,7 +1,7 @@
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": [1, 4],
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
@@ -11,64 +11,7 @@
         },
         "client_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "backend": "sglang",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "sglang",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "backend": "sglang",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
-        "qps_list": [2],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "sglang",
+            "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200

From 1dd83dc8e0aabca0813986894aa3b00e759904fc Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Fri, 22 Aug 2025 16:37:41 -0700
Subject: [PATCH 25/57] fix import: install flash-attn

---
 .github/workflows/sglang-benchmark.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 060ccafc..56d27dc4 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -135,6 +135,7 @@ jobs:
           else
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/cu128
+            pip install flash-attn --no-build-isolation
           fi
 
       - name: Authenticate with AWS

From 5aa0db18e9d5f67033327662bbb050c1fd96d69f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Sun, 24 Aug 2025 20:52:33 -0700
Subject: [PATCH 26/57] running vllm through docker

---
 .github/workflows/sglang-benchmark.yml | 74 ++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 56d27dc4..680baf39 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,6 +41,9 @@ jobs:
         with:
           python-version: '3.12'
 
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -135,8 +138,36 @@ jobs:
           else
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/cu128
-            pip install flash-attn --no-build-isolation
+            pip install flashinfer-python
+          fi
+
+      - name: Set Docker registry
+        shell: bash
+        run: |
+          set -eux
+
+          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
+          DOCKER_IMAGE_SUFFIX=""
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-cpu
           fi
+          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
+
+      - name: Setup CUDA GPU_FLAG for docker run
+        if: env.DEVICE_NAME == 'cuda'
+        run: |
+          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup ROCm
+        if: env.DEVICE_NAME == 'rocm'
+        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        run: |
+          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
 
       - name: Authenticate with AWS
         # Only need for DGX hosts
@@ -154,23 +185,18 @@ jobs:
         with:
           registry-type: public
 
-      - name: Install vLLM and SGLang
+      - name: Install SGLang
         working-directory: sglang-benchmarks
         shell: bash
         run: |
           set -eux
 
-          # Install vLLM
-          pip install vllm
-
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
 
           # Verify installations
-          python3 -c "import vllm; print('vLLM installed successfully')"
           python3 -c "import sglang; print('SGLang installed successfully')"
-          popd
 
       - name: Setup benchmark tests
         env:
@@ -196,15 +222,43 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           MODELS: ${{ matrix.models }}
+          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+          SCCACHE_REGION: us-east-1
+          ENGINE_VERSION: v1
+          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
         run: |
           set -eux
 
           # Set environment variables for SGLang
           export CURRENT_LLM_SERVING_ENGINE=sglang
 
-          # Run the SGLang benchmark script
-          cd sglang-benchmarks/benchmarks
-          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            ON_CPU=1
+          else
+            ON_CPU=0
+          fi
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
+            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e HF_TOKEN \
+            -e ENGINE_VERSION \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            -e ON_CPU="${ON_CPU}" \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=4g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
 
       - name: Upload the benchmark results
         if: always()

From 403e20da80ab543d60dacb0c4e4b8db044256cc5 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Sun, 24 Aug 2025 21:29:05 -0700
Subject: [PATCH 27/57] Add step to find latest vLLM commit with a Docker image

---
 .github/workflows/sglang-benchmark.yml | 40 ++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 680baf39..ae5e5adc 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -185,6 +185,46 @@ jobs:
         with:
           registry-type: public
 
+      - name: Check for latest vLLM commit with Docker image
+        working-directory: sglang-benchmarks
+        env:
+          HEAD_BRANCH: main
+          HEAD_SHA: ''
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          # Clone vLLM repository to get the latest commit
+          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
+          pushd vllm-temp
+
+          # Looking back the latest 100 commits is enough
+          for i in {0..99}
+          do
+            # Check if the image is there, if it doesn't then check an older one
+            # because the commit is too recent
+            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+            # No Docker image available yet because the commit is too recent
+            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+              continue
+            fi
+
+            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
+            break
+          done
+          popd
+
+          # Clean up temporary vLLM repo
+          rm -rf vllm-temp
+
+          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
+
+          # Print the benchmark commit for reference
+          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+
       - name: Install SGLang
         working-directory: sglang-benchmarks
         shell: bash

From f01f72fca43f9d11fedfa541c45639a52a4456ca Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Sun, 24 Aug 2025 22:22:57 -0700
Subject: [PATCH 28/57] Run benchmarks inside the lmsysorg/sglang Docker image

---
 .github/workflows/sglang-benchmark.yml |  71 ++----
 .github/workflows/sglang-v2.yml        | 339 +++++++++++++++++++++++++
 2 files changed, 358 insertions(+), 52 deletions(-)
 create mode 100644 .github/workflows/sglang-v2.yml

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index ae5e5adc..3dd88bdb 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,9 +41,6 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v6
-
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -185,58 +182,16 @@ jobs:
         with:
           registry-type: public
 
-      - name: Check for latest vLLM commit with Docker image
-        working-directory: sglang-benchmarks
-        env:
-          HEAD_BRANCH: main
-          HEAD_SHA: ''
-          MODELS: ${{ matrix.models }}
-        run: |
-          set -eux
-
-          # Clone vLLM repository to get the latest commit
-          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
-          pushd vllm-temp
-
-          # Looking back the latest 100 commits is enough
-          for i in {0..99}
-          do
-            # Check if the image is there, if it doesn't then check an older one
-            # because the commit is too recent
-            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-            # No Docker image available yet because the commit is too recent
-            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-              continue
-            fi
-
-            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
-            break
-          done
-          popd
-
-          # Clean up temporary vLLM repo
-          rm -rf vllm-temp
-
-          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
-
-          # Print the benchmark commit for reference
-          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Install SGLang
-        working-directory: sglang-benchmarks
+      - name: Setup SGLang Docker Environment
         shell: bash
         run: |
           set -eux
 
-          # Install SGLang from source
-          pushd sglang
-          pip install -e "python[all]"
+          # Set SGLang Docker image
+          echo "SGLANG_DOCKER_IMAGE=lmsysorg/sglang:latest" >> $GITHUB_ENV
 
-          # Verify installations
-          python3 -c "import sglang; print('SGLang installed successfully')"
+          # Pull SGLang image
+          docker pull lmsysorg/sglang:latest
 
       - name: Setup benchmark tests
         env:
@@ -278,6 +233,7 @@ jobs:
             ON_CPU=0
           fi
 
+          # Use SGLang Docker image instead of vLLM image
           container_name=$(docker run \
             ${GPU_FLAG:-} \
             ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -289,15 +245,26 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
+            -e CURRENT_LLM_SERVING_ENGINE \
             --ipc=host \
             --tty \
             --detach \
             --security-opt seccomp=unconfined \
-            --shm-size=4g \
+            --shm-size=32g \
+            -p 30000:30000 \
             -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -v ~/.cache/huggingface:/root/.cache/huggingface \
             -w /tmp/workspace \
-            "${DOCKER_IMAGE}"
+            "${SGLANG_DOCKER_IMAGE}"
           )
+
+          # Install vLLM client tools inside SGLang container (needed for 'vllm bench serve')
+          docker exec -t "${container_name}" bash -c "pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128"
+
+          # Install additional dependencies that might be needed
+          docker exec -t "${container_name}" bash -c "apt-get update && apt-get install -y wget curl jq lsof"
+
+          # Run the benchmark script inside the SGLang container
           docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
 
       - name: Upload the benchmark results
diff --git a/.github/workflows/sglang-v2.yml b/.github/workflows/sglang-v2.yml
new file mode 100644
index 00000000..ae5e5adc
--- /dev/null
+++ b/.github/workflows/sglang-v2.yml
@@ -0,0 +1,339 @@
+name: SGLang Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      sglang_branch:
+        description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER)
+        required: true
+        type: string
+        default: main
+      models:
+        description: |
+          A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything)
+        required: false
+        type: string
+      runners:
+        description: |
+          A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
+        required: true
+        type: string
+        default: h100
+  pull_request:
+    paths:
+      - .github/workflows/sglang-benchmark.yml
+      - sglang-benchmarks/**
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+  set-parameters:
+    runs-on: ubuntu-latest
+    outputs:
+      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Set parameters
+        id: set-parameters
+        shell: bash
+        env:
+          MODELS: ${{ inputs.models || '' }}
+          RUNNERS: ${{ inputs.runners || 'h100' }}
+        run: |
+          set -eux
+
+          # The generated matrix is grouped by model and runner
+          python .github/scripts/generate_vllm_benchmark_matrix.py \
+            --benchmark-configs-dir sglang-benchmarks/benchmarks \
+            --models "${MODELS}" \
+            --runners "${RUNNERS}"
+
+  benchmarks:
+    name: Run SGLang benchmarks
+    needs: set-parameters
+    strategy:
+      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }}
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    environment: pytorch-x-vllm
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install system dependencies
+        shell: bash
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libnuma-dev numactl
+
+      - name: Checkout SGLang repository
+        uses: actions/checkout@v4
+        with:
+          repository: sgl-project/sglang
+          path: sglang-benchmarks/sglang
+          ref: ${{ inputs.sglang_branch || 'main' }}
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        # Amazon Linux fails on this step
+        continue-on-error: true
+        with:
+          python-version: '3.12'
+          cache: 'pip'
+
+      - name: Check if the device is supported
+        shell: bash
+        run: |
+          set -eux
+
+          if command -v nvidia-smi; then
+            DEVICE_NAME=cuda
+            nvidia-smi
+          elif command -v rocm-smi; then
+            DEVICE_NAME=rocm
+            rocm-smi
+          else
+            DEVICE_NAME=cpu
+            lscpu
+          fi
+          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
+
+      - name: Set GPU name and type
+        working-directory: sglang-benchmarks
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
+            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
+          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+          fi
+          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          set -eux
+
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          else
+            pip install -r .github/scripts/requirements.txt \
+              --extra-index-url https://download.pytorch.org/whl/cu128
+            pip install flashinfer-python
+          fi
+
+      - name: Set Docker registry
+        shell: bash
+        run: |
+          set -eux
+
+          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
+          DOCKER_IMAGE_SUFFIX=""
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
+            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-cpu
+          fi
+          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
+
+      - name: Setup CUDA GPU_FLAG for docker run
+        if: env.DEVICE_NAME == 'cuda'
+        run: |
+          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Setup ROCm
+        if: env.DEVICE_NAME == 'rocm'
+        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        run: |
+          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+
+      - name: Authenticate with AWS
+        # Only need for DGX hosts
+        if: contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      - name: Login to public.ecr.aws
+        # Only need for DGX hosts
+        if: contains(env.DEVICE_TYPE, 'B200')
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+        with:
+          registry-type: public
+
+      - name: Check for latest vLLM commit with Docker image
+        working-directory: sglang-benchmarks
+        env:
+          HEAD_BRANCH: main
+          HEAD_SHA: ''
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          # Clone vLLM repository to get the latest commit
+          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
+          pushd vllm-temp
+
+          # Looking back the latest 100 commits is enough
+          for i in {0..99}
+          do
+            # Check if the image is there, if it doesn't then check an older one
+            # because the commit is too recent
+            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+            # No Docker image available yet because the commit is too recent
+            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+              continue
+            fi
+
+            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
+            break
+          done
+          popd
+
+          # Clean up temporary vLLM repo
+          rm -rf vllm-temp
+
+          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
+
+          # Print the benchmark commit for reference
+          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Install SGLang
+        working-directory: sglang-benchmarks
+        shell: bash
+        run: |
+          set -eux
+
+          # Install SGLang from source
+          pushd sglang
+          pip install -e "python[all]"
+
+          # Verify installations
+          python3 -c "import sglang; print('SGLang installed successfully')"
+
+      - name: Setup benchmark tests
+        env:
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          # Create benchmarks directory structure
+          mkdir -p sglang-benchmarks/benchmarks/results
+          mkdir -p sglang-benchmarks/benchmarks/tests
+
+          # Set the list of benchmarks we want to cover in this runner
+          python3 .github/scripts/setup_vllm_benchmark.py \
+            --from-benchmark-configs-dir sglang-benchmarks/benchmarks \
+            --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \
+            --models "${MODELS}" \
+            --device "${DEVICE_NAME}"
+
+          ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found"
+          find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
+
+      - name: Run SGLang benchmark
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          MODELS: ${{ matrix.models }}
+          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+          SCCACHE_REGION: us-east-1
+          ENGINE_VERSION: v1
+          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+        run: |
+          set -eux
+
+          # Set environment variables for SGLang
+          export CURRENT_LLM_SERVING_ENGINE=sglang
+
+          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            ON_CPU=1
+          else
+            ON_CPU=0
+          fi
+
+          container_name=$(docker run \
+            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
+            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
+            -e DEVICE_NAME \
+            -e DEVICE_TYPE \
+            -e HF_TOKEN \
+            -e ENGINE_VERSION \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            -e ON_CPU="${ON_CPU}" \
+            --ipc=host \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=4g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
+
+      - name: Upload the benchmark results
+        if: always()
+        env:
+          BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results
+          MODELS: ${{ matrix.models }}
+        run: |
+          set -eux
+
+          sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true
+          ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found"
+
+          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
+          SANITIZED_MODELS="${MODELS//\//_}"
+
+          # Create results summary
+          if [ -d "${BENCHMARK_RESULTS}" ]; then
+            echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
+            echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY
+            echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY
+            echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY
+            echo "" >> $GITHUB_STEP_SUMMARY
+            echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY
+            find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY
+          else
+            echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY
+          fi
+
+          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
+          echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV
+
+      # Keep a copy of the benchmark results on GitHub for reference
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
+          path: sglang-benchmarks/benchmarks/results
+          retention-days: 30

From 0d0379cd433405c35076e78712cc584aee5b73b7 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Sun, 24 Aug 2025 23:33:35 -0700
Subject: [PATCH 29/57] Build vLLM from source in the SGLang container using its PyTorch

---
 .github/workflows/sglang-benchmark.yml | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 3dd88bdb..62f2a1db 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -258,12 +258,25 @@ jobs:
             "${SGLANG_DOCKER_IMAGE}"
           )
 
-          # Install vLLM client tools inside SGLang container (needed for 'vllm bench serve')
-          docker exec -t "${container_name}" bash -c "pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128"
-
-          # Install additional dependencies that might be needed
-          docker exec -t "${container_name}" bash -c "apt-get update && apt-get install -y wget curl jq lsof"
-
+          # Install system dependencies and build tools
+          docker exec -t "${container_name}" bash -c "
+            apt-get update &&
+            apt-get install -y wget curl jq lsof git build-essential python3-dev &&
+            pip install uv
+          "
+
+          # Check current PyTorch version
+          docker exec -t "${container_name}" bash -c "python3 -c 'import torch; print(f\"SGLang container PyTorch: {torch.__version__}\")'"
+
+          # Clone vLLM and build with existing PyTorch
+          docker exec -t "${container_name}" bash -c "
+            cd /tmp &&
+            git clone https://github.com/vllm-project/vllm.git &&
+            cd vllm &&
+            python use_existing_torch.py &&
+            uv pip install -r requirements/build.txt &&
+            uv pip install --no-build-isolation -e .
+          "
           # Run the benchmark script inside the SGLang container
           docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
 

From 3b99ff448f37badc2348b80d6f4213c355421b16 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 00:12:46 -0700
Subject: [PATCH 30/57] remove uv and use pip

---
 .github/workflows/sglang-benchmark.yml |   6 +-
 .github/workflows/sglang-v2.yml        | 339 -------------------------
 2 files changed, 3 insertions(+), 342 deletions(-)
 delete mode 100644 .github/workflows/sglang-v2.yml

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 62f2a1db..40c23c28 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -274,10 +274,10 @@ jobs:
             git clone https://github.com/vllm-project/vllm.git &&
             cd vllm &&
             python use_existing_torch.py &&
-            uv pip install -r requirements/build.txt &&
-            uv pip install --no-build-isolation -e .
+            pip install -r requirements/build.txt &&
+            pip install --no-build-isolation -e .
           "
-          # Run the benchmark script inside the SGLang container
+          # Run the benchmark script inside the SGLang container to run the benchmarks
           docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
 
       - name: Upload the benchmark results
diff --git a/.github/workflows/sglang-v2.yml b/.github/workflows/sglang-v2.yml
deleted file mode 100644
index ae5e5adc..00000000
--- a/.github/workflows/sglang-v2.yml
+++ /dev/null
@@ -1,339 +0,0 @@
-name: SGLang Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      sglang_branch:
-        description: SGLang branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER)
-        required: true
-        type: string
-        default: main
-      models:
-        description: |
-          A comma-separated list of models from sglang-benchmarks/benchmarks (optional, default to run everything)
-        required: false
-        type: string
-      runners:
-        description: |
-          A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
-        required: true
-        type: string
-        default: h100
-  pull_request:
-    paths:
-      - .github/workflows/sglang-benchmark.yml
-      - sglang-benchmarks/**
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-jobs:
-  set-parameters:
-    runs-on: ubuntu-latest
-    outputs:
-      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v6
-
-      - name: Set parameters
-        id: set-parameters
-        shell: bash
-        env:
-          MODELS: ${{ inputs.models || '' }}
-          RUNNERS: ${{ inputs.runners || 'h100' }}
-        run: |
-          set -eux
-
-          # The generated matrix is grouped by model and runner
-          python .github/scripts/generate_vllm_benchmark_matrix.py \
-            --benchmark-configs-dir sglang-benchmarks/benchmarks \
-            --models "${MODELS}" \
-            --runners "${RUNNERS}"
-
-  benchmarks:
-    name: Run SGLang benchmarks
-    needs: set-parameters
-    strategy:
-      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    environment: pytorch-x-vllm
-    permissions:
-      id-token: write
-      contents: read
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        shell: bash
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y libnuma-dev numactl
-
-      - name: Checkout SGLang repository
-        uses: actions/checkout@v4
-        with:
-          repository: sgl-project/sglang
-          path: sglang-benchmarks/sglang
-          ref: ${{ inputs.sglang_branch || 'main' }}
-          fetch-depth: 0
-
-      - uses: actions/setup-python@v5
-        # Amazon Linux fails on this step
-        continue-on-error: true
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Check if the device is supported
-        shell: bash
-        run: |
-          set -eux
-
-          if command -v nvidia-smi; then
-            DEVICE_NAME=cuda
-            nvidia-smi
-          elif command -v rocm-smi; then
-            DEVICE_NAME=rocm
-            rocm-smi
-          else
-            DEVICE_NAME=cpu
-            lscpu
-          fi
-          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
-
-      - name: Set GPU name and type
-        working-directory: sglang-benchmarks
-        shell: bash
-        run: |
-          set -eux
-
-          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
-            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
-          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
-            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
-          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
-          fi
-          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
-
-      - name: Install dependencies
-        shell: bash
-        run: |
-          set -eux
-
-          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-            pip install -r .github/scripts/requirements.txt \
-              --extra-index-url https://download.pytorch.org/whl/rocm6.3
-          else
-            pip install -r .github/scripts/requirements.txt \
-              --extra-index-url https://download.pytorch.org/whl/cu128
-            pip install flashinfer-python
-          fi
-
-      - name: Set Docker registry
-        shell: bash
-        run: |
-          set -eux
-
-          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
-          DOCKER_IMAGE_SUFFIX=""
-          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
-          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            DOCKER_IMAGE_SUFFIX=-cpu
-          fi
-          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
-
-      - name: Setup CUDA GPU_FLAG for docker run
-        if: env.DEVICE_NAME == 'cuda'
-        run: |
-          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-
-      - name: Setup ROCm
-        if: env.DEVICE_NAME == 'rocm'
-        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
-
-      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
-        run: |
-          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
-
-      - name: Authenticate with AWS
-        # Only need for DGX hosts
-        if: contains(env.DEVICE_TYPE, 'B200')
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only
-          role-duration-seconds: 18000
-          aws-region: us-east-1
-
-      - name: Login to public.ecr.aws
-        # Only need for DGX hosts
-        if: contains(env.DEVICE_TYPE, 'B200')
-        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
-        with:
-          registry-type: public
-
-      - name: Check for latest vLLM commit with Docker image
-        working-directory: sglang-benchmarks
-        env:
-          HEAD_BRANCH: main
-          HEAD_SHA: ''
-          MODELS: ${{ matrix.models }}
-        run: |
-          set -eux
-
-          # Clone vLLM repository to get the latest commit
-          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
-          pushd vllm-temp
-
-          # Looking back the latest 100 commits is enough
-          for i in {0..99}
-          do
-            # Check if the image is there, if it doesn't then check an older one
-            # because the commit is too recent
-            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-            # No Docker image available yet because the commit is too recent
-            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-              continue
-            fi
-
-            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
-            break
-          done
-          popd
-
-          # Clean up temporary vLLM repo
-          rm -rf vllm-temp
-
-          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
-
-          # Print the benchmark commit for reference
-          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Install SGLang
-        working-directory: sglang-benchmarks
-        shell: bash
-        run: |
-          set -eux
-
-          # Install SGLang from source
-          pushd sglang
-          pip install -e "python[all]"
-
-          # Verify installations
-          python3 -c "import sglang; print('SGLang installed successfully')"
-
-      - name: Setup benchmark tests
-        env:
-          MODELS: ${{ matrix.models }}
-        run: |
-          set -eux
-
-          # Create benchmarks directory structure
-          mkdir -p sglang-benchmarks/benchmarks/results
-          mkdir -p sglang-benchmarks/benchmarks/tests
-
-          # Set the list of benchmarks we want to cover in this runner
-          python3 .github/scripts/setup_vllm_benchmark.py \
-            --from-benchmark-configs-dir sglang-benchmarks/benchmarks \
-            --to-benchmark-configs-dir sglang-benchmarks/benchmarks/tests \
-            --models "${MODELS}" \
-            --device "${DEVICE_NAME}"
-
-          ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found"
-          find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
-
-      - name: Run SGLang benchmark
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          MODELS: ${{ matrix.models }}
-          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-          SCCACHE_REGION: us-east-1
-          ENGINE_VERSION: v1
-          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-        run: |
-          set -eux
-
-          # Set environment variables for SGLang
-          export CURRENT_LLM_SERVING_ENGINE=sglang
-
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
-
-          container_name=$(docker run \
-            ${GPU_FLAG:-} \
-            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-            -e SCCACHE_BUCKET \
-            -e SCCACHE_REGION \
-            -e DEVICE_NAME \
-            -e DEVICE_TYPE \
-            -e HF_TOKEN \
-            -e ENGINE_VERSION \
-            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-            -e ON_CPU="${ON_CPU}" \
-            --ipc=host \
-            --tty \
-            --detach \
-            --security-opt seccomp=unconfined \
-            --shm-size=4g \
-            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-            -w /tmp/workspace \
-            "${DOCKER_IMAGE}"
-          )
-          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
-
-      - name: Upload the benchmark results
-        if: always()
-        env:
-          BENCHMARK_RESULTS: sglang-benchmarks/benchmarks/results
-          MODELS: ${{ matrix.models }}
-        run: |
-          set -eux
-
-          sudo chown -R ${UID} "${BENCHMARK_RESULTS}" || true
-          ls -lah "${BENCHMARK_RESULTS}" || echo "Results directory not found"
-
-          SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g")
-          SANITIZED_MODELS="${MODELS//\//_}"
-
-          # Create results summary
-          if [ -d "${BENCHMARK_RESULTS}" ]; then
-            echo "## SGLang Benchmark Results Summary" >> $GITHUB_STEP_SUMMARY
-            echo "- Device: ${DEVICE_TYPE}" >> $GITHUB_STEP_SUMMARY
-            echo "- Models: ${MODELS}" >> $GITHUB_STEP_SUMMARY
-            echo "- Runner: ${{ matrix.runner }}" >> $GITHUB_STEP_SUMMARY
-            echo "" >> $GITHUB_STEP_SUMMARY
-            echo "### Files Generated:" >> $GITHUB_STEP_SUMMARY
-            find "${BENCHMARK_RESULTS}" -type f -name "*.json" -exec echo "- {}" \; >> $GITHUB_STEP_SUMMARY || echo "- No JSON files found" >> $GITHUB_STEP_SUMMARY
-          else
-            echo "⚠️ No benchmark results found in ${BENCHMARK_RESULTS}" >> $GITHUB_STEP_SUMMARY
-          fi
-
-          echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV
-          echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV
-
-      # Keep a copy of the benchmark results on GitHub for reference
-      - uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: sglang-benchmark-results-${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }}
-          path: sglang-benchmarks/benchmarks/results
-          retention-days: 30

From 8fc7488005b8ef87109e35d1f59b76c2b5e6153f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 08:38:39 -0700
Subject: [PATCH 31/57] Run SGLang benchmark inside the vLLM CI Docker image

---
 .github/workflows/sglang-benchmark.yml | 89 ++++++++++++++++----------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 40c23c28..c0f8b654 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -41,6 +41,9 @@ jobs:
         with:
           python-version: '3.12'
 
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
       - name: Set parameters
         id: set-parameters
         shell: bash
@@ -135,7 +138,6 @@ jobs:
           else
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/cu128
-            pip install flashinfer-python
           fi
 
       - name: Set Docker registry
@@ -182,16 +184,58 @@ jobs:
         with:
           registry-type: public
 
-      - name: Setup SGLang Docker Environment
-        shell: bash
+      - name: Check for latest vLLM commit with Docker image
+        working-directory: sglang-benchmarks
+        env:
+          HEAD_BRANCH: main
+          HEAD_SHA: ''
+          MODELS: ${{ matrix.models }}
         run: |
           set -eux
 
-          # Set SGLang Docker image
-          echo "SGLANG_DOCKER_IMAGE=lmsysorg/sglang:latest" >> $GITHUB_ENV
+          # Clone vLLM repository to get the latest commit
+          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
+          pushd vllm-temp
+
+          # Looking back the latest 100 commits is enough
+          for i in {0..99}
+          do
+            # Check if the image is there, if it doesn't then check an older one
+            # because the commit is too recent
+            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
+            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
+
+            # No Docker image available yet because the commit is too recent
+            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
+              continue
+            fi
 
-          # Pull SGLang image
-          docker pull lmsysorg/sglang:latest
+            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
+            break
+          done
+          popd
+
+          # Clean up temporary vLLM repo
+          rm -rf vllm-temp
+
+          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
+
+          # Print the benchmark commit for reference
+          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+
+      # - name: Install SGLang
+      #   working-directory: sglang-benchmarks
+      #   shell: bash
+      #   run: |
+      #     set -eux
+
+      #     # Install SGLang from source
+      #     pushd sglang
+      #     pip install -e "python[all]"
+
+      #     # Verify installations
+      #     python3 -c "import sglang; print('SGLang installed successfully')"
 
       - name: Setup benchmark tests
         env:
@@ -233,7 +277,6 @@ jobs:
             ON_CPU=0
           fi
 
-          # Use SGLang Docker image instead of vLLM image
           container_name=$(docker run \
             ${GPU_FLAG:-} \
             ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -245,40 +288,16 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
-            -e CURRENT_LLM_SERVING_ENGINE \
             --ipc=host \
             --tty \
             --detach \
             --security-opt seccomp=unconfined \
-            --shm-size=32g \
-            -p 30000:30000 \
+            --shm-size=4g \
             -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-            -v ~/.cache/huggingface:/root/.cache/huggingface \
             -w /tmp/workspace \
-            "${SGLANG_DOCKER_IMAGE}"
+            "${DOCKER_IMAGE}"
           )
-
-          # Install system dependencies and build tools
-          docker exec -t "${container_name}" bash -c "
-            apt-get update &&
-            apt-get install -y wget curl jq lsof git build-essential python3-dev &&
-            pip install uv
-          "
-
-          # Check current PyTorch version
-          docker exec -t "${container_name}" bash -c "python3 -c 'import torch; print(f\"SGLang container PyTorch: {torch.__version__}\")'"
-
-          # Clone vLLM and build with existing PyTorch
-          docker exec -t "${container_name}" bash -c "
-            cd /tmp &&
-            git clone https://github.com/vllm-project/vllm.git &&
-            cd vllm &&
-            python use_existing_torch.py &&
-            pip install -r requirements/build.txt &&
-            pip install --no-build-isolation -e .
-          "
-          # Run the benchmark script inside the SGLang container to run the benchmarks
-          docker exec -t "${container_name}" bash -c "cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
+          docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
 
       - name: Upload the benchmark results
         if: always()

From 14d33f934cf4184910b080bfdef69108ec43a46e Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 08:55:06 -0700
Subject: [PATCH 32/57] Install SGLang from source with pip in the container

---
 .github/workflows/sglang-benchmark.yml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index c0f8b654..e8c076f7 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -298,6 +298,26 @@ jobs:
             "${DOCKER_IMAGE}"
           )
           docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
+          docker exec -t "${container_name}" bash -c "
+            # Install system dependencies
+            apt-get update &&
+            apt-get install -y wget curl jq lsof git build-essential python3-dev &&
+
+            # Check current PyTorch version
+            python3 -c 'import torch; print(f\"vLLM container PyTorch: {torch.__version__}\")' &&
+
+            # Clone SGLang repository
+            cd /tmp &&
+            git clone https://github.com/sgl-project/sglang.git &&
+            cd sglang &&
+
+            # Install SGLang with existing PyTorch
+            pip install -e 'python[all]' &&
+
+            # Navigate to workspace and run benchmarks
+            cd /tmp/workspace/sglang-benchmarks/benchmarks &&
+            bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+          "
 
       - name: Upload the benchmark results
         if: always()

From 017e2528d65f25bce54614cc92a155432fd73171 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 08:55:28 -0700
Subject: [PATCH 33/57] Remove redundant 'pip install sglang' docker exec

---
 .github/workflows/sglang-benchmark.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index e8c076f7..e486b973 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -297,7 +297,6 @@ jobs:
             -w /tmp/workspace \
             "${DOCKER_IMAGE}"
           )
-          docker exec -t "${container_name}" bash -c "pip install sglang && cd sglang-benchmarks/benchmarks && bash ../../.github/scripts/run-sglang-performance-benchmarks.sh"
           docker exec -t "${container_name}" bash -c "
             # Install system dependencies
             apt-get update &&

From cac7fc1f2a66737de2397452787d658f4aa7236f Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 11:10:54 -0700
Subject: [PATCH 34/57] Create a separate uv venv for the vLLM client

---
 .../run-sglang-performance-benchmarks.sh      |  14 ++
 .github/workflows/sglang-benchmark.yml        | 123 ++----------------
 2 files changed, 26 insertions(+), 111 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index e4f49419..b5511cfb 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -236,6 +236,15 @@ run_serving_tests() {
       continue
     fi
 
+    # Create a new uv environment for vllm client (once per test case)
+    echo "Creating new uv environment for vllm client..."
+    uv venv vllm_client_env
+
+    # Activate the environment and install vllm
+    echo "Installing vllm in the new environment..."
+    source vllm_client_env/bin/activate
+    pip install vllm
+
     # iterate over different QPS
     for qps in $qps_list; do
       # remove the surrounding single quote from qps
@@ -261,6 +270,7 @@ run_serving_tests() {
       echo "Running test case $test_name with qps $qps"
       echo "Client command: $client_command"
 
+      # Run the vllm bench serve command in the activated environment
       bash -c "$client_command"
 
       # record the benchmarking commands
@@ -277,6 +287,10 @@ run_serving_tests() {
 
     done
 
+    # Deactivate and clean up the environment after all QPS tests
+    deactivate
+    rm -rf vllm_client_env
+
     # clean up
     kill -9 $server_pid
     kill_gpu_processes
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index e486b973..8c8a369b 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -95,6 +95,9 @@ jobs:
           python-version: '3.12'
           cache: 'pip'
 
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
       - name: Check if the device is supported
         shell: bash
         run: |
@@ -140,21 +143,6 @@ jobs:
               --extra-index-url https://download.pytorch.org/whl/cu128
           fi
 
-      - name: Set Docker registry
-        shell: bash
-        run: |
-          set -eux
-
-          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
-          DOCKER_IMAGE_SUFFIX=""
-          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
-            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
-          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            DOCKER_IMAGE_SUFFIX=-cpu
-          fi
-          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
-
       - name: Setup CUDA GPU_FLAG for docker run
         if: env.DEVICE_NAME == 'cuda'
         run: |
@@ -184,58 +172,18 @@ jobs:
         with:
           registry-type: public
 
-      - name: Check for latest vLLM commit with Docker image
+      - name: Install SGLang
         working-directory: sglang-benchmarks
-        env:
-          HEAD_BRANCH: main
-          HEAD_SHA: ''
-          MODELS: ${{ matrix.models }}
+        shell: bash
         run: |
           set -eux
 
-          # Clone vLLM repository to get the latest commit
-          git clone --depth 100 https://github.com/vllm-project/vllm.git vllm-temp
-          pushd vllm-temp
-
-          # Looking back the latest 100 commits is enough
-          for i in {0..99}
-          do
-            # Check if the image is there, if it doesn't then check an older one
-            # because the commit is too recent
-            HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-            DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"
-
-            # No Docker image available yet because the commit is too recent
-            if ! docker manifest inspect "${DOCKER_IMAGE}"; then
-              continue
-            fi
-
-            echo "Found vLLM commit ${HEAD_SHA} with available Docker image"
-            break
-          done
-          popd
-
-          # Clean up temporary vLLM repo
-          rm -rf vllm-temp
-
-          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
-          echo "DOCKER_IMAGE=$DOCKER_IMAGE" >> $GITHUB_ENV
-
-          # Print the benchmark commit for reference
-          echo "### Using vLLM Docker image for commit [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}"
+          # Install SGLang from source
+          pushd sglang
+          pip install -e "python[all]"
 
-      # - name: Install SGLang
-      #   working-directory: sglang-benchmarks
-      #   shell: bash
-      #   run: |
-      #     set -eux
-
-      #     # Install SGLang from source
-      #     pushd sglang
-      #     pip install -e "python[all]"
-
-      #     # Verify installations
-      #     python3 -c "import sglang; print('SGLang installed successfully')"
+          # Verify installations
+          python3 -c "import sglang; print('SGLang installed successfully')"
 
       - name: Setup benchmark tests
         env:
@@ -260,10 +208,6 @@ jobs:
       - name: Run SGLang benchmark
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          MODELS: ${{ matrix.models }}
-          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-          SCCACHE_REGION: us-east-1
-          ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
         run: |
           set -eux
@@ -271,51 +215,8 @@ jobs:
           # Set environment variables for SGLang
           export CURRENT_LLM_SERVING_ENGINE=sglang
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
-
-          container_name=$(docker run \
-            ${GPU_FLAG:-} \
-            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-            -e SCCACHE_BUCKET \
-            -e SCCACHE_REGION \
-            -e DEVICE_NAME \
-            -e DEVICE_TYPE \
-            -e HF_TOKEN \
-            -e ENGINE_VERSION \
-            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-            -e ON_CPU="${ON_CPU}" \
-            --ipc=host \
-            --tty \
-            --detach \
-            --security-opt seccomp=unconfined \
-            --shm-size=4g \
-            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-            -w /tmp/workspace \
-            "${DOCKER_IMAGE}"
-          )
-          docker exec -t "${container_name}" bash -c "
-            # Install system dependencies
-            apt-get update &&
-            apt-get install -y wget curl jq lsof git build-essential python3-dev &&
-
-            # Check current PyTorch version
-            python3 -c 'import torch; print(f\"vLLM container PyTorch: {torch.__version__}\")' &&
-
-            # Clone SGLang repository
-            cd /tmp &&
-            git clone https://github.com/sgl-project/sglang.git &&
-            cd sglang &&
-
-            # Install SGLang with existing PyTorch
-            pip install -e 'python[all]' &&
-
-            # Navigate to workspace and run benchmarks
-            cd /tmp/workspace/sglang-benchmarks/benchmarks &&
-            bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+          cd sglang-benchmarks/benchmarks &&
+          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
           "
 
       - name: Upload the benchmark results

From 90f049398df9636dbc68db9a83829542a544ae15 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 11:44:40 -0700
Subject: [PATCH 35/57] Update vllm bench serve arguments

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index b5511cfb..c88a50d4 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -257,14 +257,16 @@ run_serving_tests() {
       new_test_name=$test_name"_qps_"$qps
       echo "new test name $new_test_name"
 
-      # Bench serving command
+      # Bench serving command with proper parameters for connecting to external server
       client_command="vllm bench serve \
+        --port 30000 \
+        --model $client_model \
         --dataset-name sharegpt \
         --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
-        --model $client_model \
         --request-rate $qps \
-        --port 30000 \
-        --output-file $RESULTS_FOLDER/${new_test_name}.json \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
         $client_args"
 
       echo "Running test case $test_name with qps $qps"

From 5160576437a9e8d3e391233e32f5330894773876 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 12:35:51 -0700
Subject: [PATCH 36/57] added max concurrency

---
 .../run-sglang-performance-benchmarks.sh      | 72 ++++++++++---------
 1 file changed, 39 insertions(+), 33 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index c88a50d4..962e1f9e 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -190,6 +190,13 @@ run_serving_tests() {
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
+    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
+    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
+        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+        max_concurrency_list="[$num_prompts]"
+    fi
+    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
+    echo "Running over max concurrency list $max_concurrency_list"
 
     # Extract only specific SGLang server parameters
     model_path=$(echo "$server_params" | jq -r '.model_path // .model')
@@ -254,39 +261,38 @@ run_serving_tests() {
         echo "now qps is $qps"
       fi
 
-      new_test_name=$test_name"_qps_"$qps
-      echo "new test name $new_test_name"
-
-      # Bench serving command with proper parameters for connecting to external server
-      client_command="vllm bench serve \
-        --port 30000 \
-        --model $client_model \
-        --dataset-name sharegpt \
-        --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
-        --request-rate $qps \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      # Run the vllm bench serve command in the activated environment
-      bash -c "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
+      for max_concurrency in $max_concurrency_list; do
+        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+        echo " new test name $new_test_name"
+        # pass the tensor parallel size to the client so that it can be displayed
+        # on the benchmark dashboard
+        client_command="vllm bench serve \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --max-concurrency $max_concurrency \
+          --metadata "tensor_parallel_size=$tp" \
+          --port 30000 \
+          $client_args "
+
+        echo "Running test case $test_name with qps $qps"
+        echo "Client command: $client_command"
+
+        bash -c "$client_command"
+
+        # record the benchmarking commands
+        jq_output=$(jq -n \
+          --arg server "$server_command" \
+          --arg client "$client_command" \
+          --arg gpu "$gpu_type" \
+          '{
+            server_command: $server,
+            client_command: $client,
+            gpu_type: $gpu
+          }')
+        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+      done
     done
 
     # Deactivate and clean up the environment after all QPS tests

From 110929b4c9b23f5ae7efa9f7dda073f0561bfb8b Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 14:29:10 -0700
Subject: [PATCH 37/57] add virtual env for sglang as well

---
 .github/scripts/run-sglang-performance-benchmarks.sh |  2 +-
 .github/workflows/sglang-benchmark.yml               | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 962e1f9e..506e768c 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -224,7 +224,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
+    server_command="source sglang_env/bin/activate && python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
 
     # run the server
     echo "Running test case $test_name"
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 8c8a369b..31dfdfa5 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -172,12 +172,16 @@ jobs:
         with:
           registry-type: public
 
-      - name: Install SGLang
+      - name: Install SGLang in virtual environment
         working-directory: sglang-benchmarks
         shell: bash
         run: |
           set -eux
 
+          # Create virtual environment for SGLang
+          uv venv sglang_env
+          source sglang_env/bin/activate
+
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
@@ -185,6 +189,9 @@ jobs:
           # Verify installations
           python3 -c "import sglang; print('SGLang installed successfully')"
 
+          # Deactivate for now - will be activated in the benchmark script
+          deactivate
+
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}
@@ -212,9 +219,6 @@ jobs:
         run: |
           set -eux
 
-          # Set environment variables for SGLang
-          export CURRENT_LLM_SERVING_ENGINE=sglang
-
           cd sglang-benchmarks/benchmarks &&
           bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
           "

From 33f9cbd11c07d579b3984ada4bf2ce29d13ee5ee Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 14:57:39 -0700
Subject: [PATCH 38/57] Remove max-concurrency loop and SGLang venv

---
 .../run-sglang-performance-benchmarks.sh      | 70 ++++++++-----------
 .github/workflows/sglang-benchmark.yml        |  9 +--
 2 files changed, 31 insertions(+), 48 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 506e768c..aa5dcaf4 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -190,13 +190,6 @@ run_serving_tests() {
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
-    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
-    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
-        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
-        max_concurrency_list="[$num_prompts]"
-    fi
-    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
-    echo "Running over max concurrency list $max_concurrency_list"
 
     # Extract only specific SGLang server parameters
     model_path=$(echo "$server_params" | jq -r '.model_path // .model')
@@ -224,7 +217,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="source sglang_env/bin/activate && python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
+    server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
 
     # run the server
     echo "Running test case $test_name"
@@ -261,38 +254,35 @@ run_serving_tests() {
         echo "now qps is $qps"
       fi
 
-      for max_concurrency in $max_concurrency_list; do
-        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
-        echo " new test name $new_test_name"
-        # pass the tensor parallel size to the client so that it can be displayed
-        # on the benchmark dashboard
-        client_command="vllm bench serve \
-          --save-result \
-          --result-dir $RESULTS_FOLDER \
-          --result-filename ${new_test_name}.json \
-          --request-rate $qps \
-          --max-concurrency $max_concurrency \
-          --metadata "tensor_parallel_size=$tp" \
-          --port 30000 \
-          $client_args "
-
-        echo "Running test case $test_name with qps $qps"
-        echo "Client command: $client_command"
-
-        bash -c "$client_command"
-
-        # record the benchmarking commands
-        jq_output=$(jq -n \
-          --arg server "$server_command" \
-          --arg client "$client_command" \
-          --arg gpu "$gpu_type" \
-          '{
-            server_command: $server,
-            client_command: $client,
-            gpu_type: $gpu
-          }')
-        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-      done
+      new_test_name=$test_name"_qps_"$qps"_concurrency_"
+      echo " new test name $new_test_name"
+      # pass the tensor parallel size to the client so that it can be displayed
+      # on the benchmark dashboard
+      client_command="vllm bench serve \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --metadata "tensor_parallel_size=$tp" \
+        --port 30000 \
+        $client_args "
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      bash -c "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
     done
 
     # Deactivate and clean up the environment after all QPS tests
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 31dfdfa5..b1566dd2 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -172,16 +172,12 @@ jobs:
         with:
           registry-type: public
 
-      - name: Install SGLang in virtual environment
+      - name: Install SGLang
         working-directory: sglang-benchmarks
         shell: bash
         run: |
           set -eux
 
-          # Create virtual environment for SGLang
-          uv venv sglang_env
-          source sglang_env/bin/activate
-
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
@@ -189,9 +185,6 @@ jobs:
           # Verify installations
           python3 -c "import sglang; print('SGLang installed successfully')"
 
-          # Deactivate for now - will be activated in the benchmark script
-          deactivate
-
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}

From 1bb0f3442f228746a46b318e3e277457db596a97 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 15:38:02 -0700
Subject: [PATCH 39/57] trying with env variables

---
 .github/workflows/sglang-benchmark.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index b1566dd2..80da7ff1 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -175,6 +175,10 @@ jobs:
       - name: Install SGLang
         working-directory: sglang-benchmarks
         shell: bash
+        env:
+          TORCH_COMPILE_DISABLE: "1"
+          TORCHDYNAMO_DISABLE: "1"
+          TRITON_DISABLE_LINE_INFO: "1"
         run: |
           set -eux
 
@@ -209,6 +213,9 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          CUDA_VISIBLE_DEVICES: "0"
+          TORCH_COMPILE_DISABLE: "1"
+          TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
 

From fc897adea79587ae9bb31dc45b99c28123e2892b Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 16:02:37 -0700
Subject: [PATCH 40/57] final touches

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 2 +-
 .github/workflows/sglang-benchmark.yml               | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index aa5dcaf4..c0203a15 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -254,7 +254,7 @@ run_serving_tests() {
         echo "now qps is $qps"
       fi
 
-      new_test_name=$test_name"_qps_"$qps"_concurrency_"
+      new_test_name=$test_name"_qps_"$qps"
       echo " new test name $new_test_name"
       # pass the tensor parallel size to the client so that it can be displayed
       # on the benchmark dashboard
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 80da7ff1..6127d56d 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -221,7 +221,6 @@ jobs:
 
           cd sglang-benchmarks/benchmarks &&
           bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
-          "
 
       - name: Upload the benchmark results
         if: always()

From 747817dc7fd46e75d0cecb3da452b8f5d28d9d34 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 16:14:58 -0700
Subject: [PATCH 41/57] fix extra character

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index c0203a15..1de0312f 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -254,7 +254,7 @@ run_serving_tests() {
         echo "now qps is $qps"
       fi
 
-      new_test_name=$test_name"_qps_"$qps"
+      new_test_name=$test_name"_qps_"$qps
       echo " new test name $new_test_name"
       # pass the tensor parallel size to the client so that it can be displayed
       # on the benchmark dashboard

From 0fc1017e38b76393a1097fe33184cbd4db66b705 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 16:50:04 -0700
Subject: [PATCH 42/57] cleanup and adding more tests

---
 .github/workflows/sglang-benchmark.yml        |  4 ----
 .../benchmarks/cuda/serving-tests.json        | 22 ++++++++++++++++++-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 6127d56d..00490413 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -152,10 +152,6 @@ jobs:
         if: env.DEVICE_NAME == 'rocm'
         uses: pytorch/pytorch/./.github/actions/setup-rocm@main
 
-      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
-        run: |
-          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
-
       - name: Authenticate with AWS
         # Only need for DGX hosts
         if: contains(env.DEVICE_TYPE, 'B200')
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
index e2c30eca..3b3981dd 100644
--- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
@@ -1,12 +1,13 @@
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4],
+        "qps_list": [1, 4, 16, "inf"],
         "server_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
             "tensor_parallel_size": 1,
             "swap_space": 16,
             "disable_log_stats": "",
+            "disable_log_requests": "",
             "load_format": "dummy"
         },
         "client_parameters": {
@@ -16,5 +17,24 @@
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
     }
 ]

From 545c19b9e6d6bd8b3db94a3746a282b6f471cb7d Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 16:56:35 -0700
Subject: [PATCH 43/57] removing not needed files and tests

---
 .../run-sglang-performance-benchmarks.sh      |  13 +-
 .../benchmarks/cuda/genai-perf-tests.json     |  22 ---
 .../benchmarks/cuda/nightly-tests.json        | 161 ------------------
 3 files changed, 2 insertions(+), 194 deletions(-)
 delete mode 100644 sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
 delete mode 100644 sglang-benchmarks/benchmarks/cuda/nightly-tests.json

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 1de0312f..39372288 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -296,14 +296,7 @@ run_serving_tests() {
 }
 
 main() {
-    local ARCH
-    ARCH=''
-    if [ "$ON_CPU" == "1" ];then
-        check_cpus
-        ARCH='-cpu'
-    else
-        check_gpus
-    fi
+    check_gpus
     check_hf_token
 
     # dependencies
@@ -323,9 +316,7 @@ main() {
     BENCHMARK_ROOT=tests/
 
     # benchmarking - look for test files in the tests/ directory
-    if [ -f "$BENCHMARK_ROOT/serving-tests$ARCH.json" ]; then
-    run_serving_tests "$BENCHMARK_ROOT/serving-tests$ARCH.json"
-    elif [ -f "$BENCHMARK_ROOT/serving-tests.json" ]; then
+    if [ -f "$BENCHMARK_ROOT/serving-tests.json" ]; then
     run_serving_tests "$BENCHMARK_ROOT/serving-tests.json"
     else
     echo "No serving test file found"
diff --git a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json b/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
deleted file mode 100644
index ca9027e6..00000000
--- a/sglang-benchmarks/benchmarks/cuda/genai-perf-tests.json
+++ /dev/null
@@ -1,22 +0,0 @@
-[
-    {
-        "test_name": "llama8B_tp1_genai_perf",
-        "qps_list": [4,8,16,32],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tp": 1,
-            "port": 8000,
-            "num_prompts": 500,
-            "reuse_server": false
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "genai_perf_input_parameters": {
-        }
-    }
-]
diff --git a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json b/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
deleted file mode 100644
index 10bed8ab..00000000
--- a/sglang-benchmarks/benchmarks/cuda/nightly-tests.json
+++ /dev/null
@@ -1,161 +0,0 @@
-[
-    {
-        "test_name": "llama8B_tp1_sharegpt",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tp": 1,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000,
-            "reuse_server": false
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama8B_tp1_sonnet_512_16",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tp": 1,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 16,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama8B_tp1_sonnet_512_256",
-        "qps_list": [4,8,16,32,"inf"],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
-            "tp": 1,
-            "dataset_name": "sonnet",
-            "dataset_path": "./sonnet_4x.txt",
-            "num_prompts": 500,
-            "port": 8000,
-            "sonnet_input_len": 512,
-            "sonnet_output_len": 256,
-            "sonnet_prefix_len": 50,
-            "reuse_server": true
-        },
-        "lmdeploy_server_parameters": {
-            "dtype": "bfloat16"
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "bfloat16",
-            "max_batch_size": 2048,
-            "max_input_len": 4096,
-            "max_seq_len": 6144,
-            "max_num_tokens": 16384,
-            "trt_llm_version": "v0.11.0"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "gpu_memory_utilization": 0.9,
-            "max_num_seqs": 512,
-            "dtype": "bfloat16"
-        },
-        "vllm_client_parameters": {
-        },
-        "sglang_server_parameters": {
-            "disable_radix_cache": "",
-            "enable_torch_compile": "",
-            "dtype": "bfloat16"
-        },
-        "sglang_client_parameters": {
-        }
-    }
-]

From 5b3f9f9a3e174e0a861caf90665b3808b970645c Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 19:31:54 -0700
Subject: [PATCH 44/57] try running inside docker container

---
 .github/workflows/sglang-benchmark.yml | 37 ++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 00490413..3b4b4941 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -205,6 +205,20 @@ jobs:
           ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found"
           find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
+      # - name: Run SGLang benchmark
+      #   env:
+      #     HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      #     SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+      #     CUDA_VISIBLE_DEVICES: "0"
+      #     TORCH_COMPILE_DISABLE: "1"
+      #     TORCHDYNAMO_DISABLE: "1"
+      #   run: |
+      #     set -eux
+
+      #     cd sglang-benchmarks/benchmarks &&
+      #     bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+
+
       - name: Run SGLang benchmark
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -215,8 +229,27 @@ jobs:
         run: |
           set -eux
 
-          cd sglang-benchmarks/benchmarks &&
-          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+          container_name=$(docker run \
+            --gpus all \
+            -e NVIDIA_DRIVER_CAPABILITIES=all \
+            -e HF_TOKEN \
+            -e CUDA_VISIBLE_DEVICES \
+            -e TORCH_COMPILE_DISABLE \
+            -e TORCHDYNAMO_DISABLE \
+            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            --ipc=host \
+            --shm-size=4g \
+            --tty \
+            --detach \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            python:3.12
+          )
+
+          docker exec -t "${container_name}" bash -c "
+            cd sglang-benchmarks/benchmarks &&
+            bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
+          "
 
       - name: Upload the benchmark results
         if: always()

From f8bd1c8f456661ead113839eaf51f9bc55cb7f00 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 19:44:28 -0700
Subject: [PATCH 45/57] try sglang docker image

---
 .github/workflows/sglang-benchmark.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 3b4b4941..42b6bad3 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -231,19 +231,19 @@ jobs:
 
           container_name=$(docker run \
             --gpus all \
-            -e NVIDIA_DRIVER_CAPABILITIES=all \
+            --shm-size 4g \
             -e HF_TOKEN \
             -e CUDA_VISIBLE_DEVICES \
             -e TORCH_COMPILE_DISABLE \
             -e TORCHDYNAMO_DISABLE \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             --ipc=host \
-            --shm-size=4g \
             --tty \
             --detach \
             -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -v ~/.cache/huggingface:/root/.cache/huggingface \
             -w /tmp/workspace \
-            python:3.12
+            lmsysorg/sglang:latest
           )
 
           docker exec -t "${container_name}" bash -c "

From c3d6657a6441bc1cadde0bad8563133189ad0637 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 20:31:23 -0700
Subject: [PATCH 46/57] try with gpu cleaning

---
 .../run-sglang-performance-benchmarks.sh      |  1 +
 .github/workflows/sglang-benchmark.yml        | 37 +------------------
 2 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 39372288..fbe078f8 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -126,6 +126,7 @@ kill_gpu_processes() {
   lsof -t -i:30000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
   pgrep python | xargs -r kill -9
+  pgrep VLLM | xargs -r kill -9
 
   # wait until GPU memory usage smaller than 1GB
   if command -v nvidia-smi; then
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 42b6bad3..00490413 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -205,20 +205,6 @@ jobs:
           ls -lah sglang-benchmarks/benchmarks/tests || echo "No test files found"
           find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
-      # - name: Run SGLang benchmark
-      #   env:
-      #     HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      #     SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-      #     CUDA_VISIBLE_DEVICES: "0"
-      #     TORCH_COMPILE_DISABLE: "1"
-      #     TORCHDYNAMO_DISABLE: "1"
-      #   run: |
-      #     set -eux
-
-      #     cd sglang-benchmarks/benchmarks &&
-      #     bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
-
-
       - name: Run SGLang benchmark
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -229,27 +215,8 @@ jobs:
         run: |
           set -eux
 
-          container_name=$(docker run \
-            --gpus all \
-            --shm-size 4g \
-            -e HF_TOKEN \
-            -e CUDA_VISIBLE_DEVICES \
-            -e TORCH_COMPILE_DISABLE \
-            -e TORCHDYNAMO_DISABLE \
-            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
-            --ipc=host \
-            --tty \
-            --detach \
-            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-            -v ~/.cache/huggingface:/root/.cache/huggingface \
-            -w /tmp/workspace \
-            lmsysorg/sglang:latest
-          )
-
-          docker exec -t "${container_name}" bash -c "
-            cd sglang-benchmarks/benchmarks &&
-            bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
-          "
+          cd sglang-benchmarks/benchmarks &&
+          bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
 
       - name: Upload the benchmark results
         if: always()

From f877d7b83f77de3d6bd80fe0a6ffbd66cc79c349 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Mon, 25 Aug 2025 20:56:15 -0700
Subject: [PATCH 47/57] remove cuda check

---
 .github/workflows/sglang-benchmark.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 00490413..1cb7bf45 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -209,7 +209,6 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          CUDA_VISIBLE_DEVICES: "0"
           TORCH_COMPILE_DISABLE: "1"
           TORCHDYNAMO_DISABLE: "1"
         run: |

From f8912f4849a2493de1ab6e8eee24d8ef1fccc0e7 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 10:23:30 -0700
Subject: [PATCH 48/57] try using vllm docker image

---
 .github/scripts/common_utils.py               |  0
 .../run-sglang-performance-benchmarks.sh      | 25 +++++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)
 create mode 100644 .github/scripts/common_utils.py

diff --git a/.github/scripts/common_utils.py b/.github/scripts/common_utils.py
new file mode 100644
index 00000000..e69de29b
diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index fbe078f8..a3dfdd23 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -238,13 +238,13 @@ run_serving_tests() {
     fi
 
     # Create a new uv environment for vllm client (once per test case)
-    echo "Creating new uv environment for vllm client..."
+    # echo "Creating new uv environment for vllm client..."
     uv venv vllm_client_env
-
-    # Activate the environment and install vllm
-    echo "Installing vllm in the new environment..."
+    # echo "Installing vllm in the new environment..."
     source vllm_client_env/bin/activate
-    pip install vllm
+
+    echo "Pulling official vLLM Docker image..."
+    docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:latest
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -257,8 +257,7 @@ run_serving_tests() {
 
       new_test_name=$test_name"_qps_"$qps
       echo " new test name $new_test_name"
-      # pass the tensor parallel size to the client so that it can be displayed
-      # on the benchmark dashboard
+
       client_command="vllm bench serve \
         --save-result \
         --result-dir $RESULTS_FOLDER \
@@ -271,7 +270,17 @@ run_serving_tests() {
       echo "Running test case $test_name with qps $qps"
       echo "Client command: $client_command"
 
-      bash -c "$client_command"
+      # Run vLLM client inside Docker container
+      docker run --rm \
+        --gpus all \
+        -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+        -w /tmp/workspace \
+        --ipc=host \
+        -e HF_TOKEN="$HF_TOKEN" \
+        --shm-size=4g \
+        --security-opt seccomp=unconfined \
+        public.ecr.aws/q9t5s3a7/vllm-release-repo:latest \
+        $client_command
 
       # record the benchmarking commands
       jq_output=$(jq -n \

From 936bd02311b4d71813516270416323d62b8b9864 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 10:42:25 -0700
Subject: [PATCH 49/57] check valid docker image

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index a3dfdd23..4d5fb38c 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -244,7 +244,6 @@ run_serving_tests() {
     source vllm_client_env/bin/activate
 
     echo "Pulling official vLLM Docker image..."
-    docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:latest
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -279,7 +278,7 @@ run_serving_tests() {
         -e HF_TOKEN="$HF_TOKEN" \
         --shm-size=4g \
         --security-opt seccomp=unconfined \
-        public.ecr.aws/q9t5s3a7/vllm-release-repo:latest \
+        vllm/vllm-openai:latest \
         $client_command
 
       # record the benchmarking commands

From 53d83bbfb87027f57729a3e6f693c212f60de6fe Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 11:40:42 -0700
Subject: [PATCH 50/57] removing not needed tests and back to original
 implementation

---
 .../run-sglang-performance-benchmarks.sh      | 23 +++++++------------
 .github/workflows/sglang-benchmark.yml        |  2 --
 .../benchmarks/cuda/serving-tests.json        | 19 ---------------
 3 files changed, 8 insertions(+), 36 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 4d5fb38c..ac381961 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -238,12 +238,13 @@ run_serving_tests() {
     fi
 
     # Create a new uv environment for vllm client (once per test case)
-    # echo "Creating new uv environment for vllm client..."
+    echo "Creating new uv environment for vllm client..."
     uv venv vllm_client_env
-    # echo "Installing vllm in the new environment..."
-    source vllm_client_env/bin/activate
 
-    echo "Pulling official vLLM Docker image..."
+    # Activate the environment and install vllm
+    echo "Installing vllm in the new environment..."
+    source vllm_client_env/bin/activate
+    pip install vllm
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -257,6 +258,8 @@ run_serving_tests() {
       new_test_name=$test_name"_qps_"$qps
       echo " new test name $new_test_name"
 
+      # pass the tensor parallel size to the client so that it can be displayed
+      # on the benchmark dashboard
       client_command="vllm bench serve \
         --save-result \
         --result-dir $RESULTS_FOLDER \
@@ -269,17 +272,7 @@ run_serving_tests() {
       echo "Running test case $test_name with qps $qps"
       echo "Client command: $client_command"
 
-      # Run vLLM client inside Docker container
-      docker run --rm \
-        --gpus all \
-        -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
-        -w /tmp/workspace \
-        --ipc=host \
-        -e HF_TOKEN="$HF_TOKEN" \
-        --shm-size=4g \
-        --security-opt seccomp=unconfined \
-        vllm/vllm-openai:latest \
-        $client_command
+      bash -c "$client_command"
 
       # record the benchmarking commands
       jq_output=$(jq -n \
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 1cb7bf45..fee1fd6c 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -172,7 +172,6 @@ jobs:
         working-directory: sglang-benchmarks
         shell: bash
         env:
-          TORCH_COMPILE_DISABLE: "1"
           TORCHDYNAMO_DISABLE: "1"
           TRITON_DISABLE_LINE_INFO: "1"
         run: |
@@ -209,7 +208,6 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          TORCH_COMPILE_DISABLE: "1"
           TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
diff --git a/sglang-benchmarks/benchmarks/cuda/serving-tests.json b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
index 3b3981dd..e87b9212 100644
--- a/sglang-benchmarks/benchmarks/cuda/serving-tests.json
+++ b/sglang-benchmarks/benchmarks/cuda/serving-tests.json
@@ -17,24 +17,5 @@
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200
         }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
     }
 ]

From 9edcfaa50d8fcf5f36b6bcab640a6c4e274ce48b Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 12:03:23 -0700
Subject: [PATCH 51/57] try after removing extra env variables

---
 .github/scripts/common_utils.py        | 0
 .github/workflows/sglang-benchmark.yml | 4 ----
 2 files changed, 4 deletions(-)
 delete mode 100644 .github/scripts/common_utils.py

diff --git a/.github/scripts/common_utils.py b/.github/scripts/common_utils.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index fee1fd6c..cad98280 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -171,9 +171,6 @@ jobs:
       - name: Install SGLang
         working-directory: sglang-benchmarks
         shell: bash
-        env:
-          TORCHDYNAMO_DISABLE: "1"
-          TRITON_DISABLE_LINE_INFO: "1"
         run: |
           set -eux
 
@@ -208,7 +205,6 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
 

From ca9c3d85052541895ee9bd90d9030cacd63e8c00 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 12:24:11 -0700
Subject: [PATCH 52/57] adding dynamo variable

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 3 +++
 .github/workflows/sglang-benchmark.yml               | 1 +
 2 files changed, 4 insertions(+)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index ac381961..130f384c 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -9,6 +9,9 @@
 set -x
 set -o pipefail
 
+# The helper functions and their implementations are referred from the implementation
+# of the run-performance-benchmarks.sh script in the official vllm repo
+# Path:- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 check_gpus() {
   if command -v nvidia-smi; then
     # check the number of GPUs and GPU type.
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index cad98280..a333d24a 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -205,6 +205,7 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
 

From a414e4bccaa6b01b572ab8778ae36b9ad6c13972 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 13:53:00 -0700
Subject: [PATCH 53/57] run sglang in a diff venv

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 8 +++++---
 .github/workflows/sglang-benchmark.yml               | 8 +++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 130f384c..e8c1478b 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -125,7 +125,7 @@ kill_processes_launched_by_current_bash() {
 }
 
 kill_gpu_processes() {
-  ps -aux
+  # ps -aux
   lsof -t -i:30000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
   pgrep python | xargs -r kill -9
@@ -221,9 +221,11 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
+    # Use SGLang environment's Python directly for complete isolation
+    sglang_python="../sglang_env/bin/python3"
+    server_command="$sglang_python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
 
-    # run the server
+    # run the server in a completely separate process with its own environment
     echo "Running test case $test_name"
     echo "Server command: $server_command"
     bash -c "$server_command" &
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index a333d24a..417dab43 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -174,6 +174,10 @@ jobs:
         run: |
           set -eux
 
+          # Create a virtual environment for SGLang
+          uv venv sglang_env
+          source sglang_env/bin/activate
+
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
@@ -181,6 +185,9 @@ jobs:
           # Verify installations
           python3 -c "import sglang; print('SGLang installed successfully')"
 
+          # Deactivate the environment
+          deactivate
+
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}
@@ -205,7 +212,6 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
 

From 0b7f1cf5ed722aab95dd5ca6f4767ee371bf82e2 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 14:11:59 -0700
Subject: [PATCH 54/57] debug issue

---
 .github/scripts/run-sglang-performance-benchmarks.sh | 2 +-
 .github/workflows/sglang-benchmark.yml               | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index e8c1478b..6cbcb8c4 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -125,7 +125,7 @@ kill_processes_launched_by_current_bash() {
 }
 
 kill_gpu_processes() {
-  # ps -aux
+  ps -aux
   lsof -t -i:30000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
   pgrep python | xargs -r kill -9
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 417dab43..462cd380 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -181,9 +181,7 @@ jobs:
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
-
-          # Verify installations
-          python3 -c "import sglang; print('SGLang installed successfully')"
+          popd
 
           # Deactivate the environment
           deactivate

From 81503c4fd31c90b020fcf9b0dce49c2fd481aca1 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Tue, 26 Aug 2025 14:29:59 -0700
Subject: [PATCH 55/57] revert the changes

---
 .github/scripts/run-sglang-performance-benchmarks.sh |  6 ++----
 .github/workflows/sglang-benchmark.yml               | 10 +++-------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 6cbcb8c4..130f384c 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -221,11 +221,9 @@ run_serving_tests() {
       continue
     fi
 
-    # Use SGLang environment's Python directly for complete isolation
-    sglang_python="../sglang_env/bin/python3"
-    server_command="$sglang_python -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
+    server_command="python3 -m sglang.launch_server --model-path $model_path --context-length $context_length --tp $tp"
 
-    # run the server in a completely separate process with its own environment
+    # run the server
     echo "Running test case $test_name"
     echo "Server command: $server_command"
     bash -c "$server_command" &
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 462cd380..a333d24a 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -174,17 +174,12 @@ jobs:
         run: |
           set -eux
 
-          # Create a virtual environment for SGLang
-          uv venv sglang_env
-          source sglang_env/bin/activate
-
           # Install SGLang from source
           pushd sglang
           pip install -e "python[all]"
-          popd
 
-          # Deactivate the environment
-          deactivate
+          # Verify installations
+          python3 -c "import sglang; print('SGLang installed successfully')"
 
       - name: Setup benchmark tests
         env:
@@ -210,6 +205,7 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
 

From c1c13bae7e5841a6e1abe7029d43945d3deef23e Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 27 Aug 2025 11:01:35 -0700
Subject: [PATCH 56/57] address review comments

---
 .../run-sglang-performance-benchmarks.sh      | 21 -------------------
 .github/workflows/sglang-benchmark.yml        |  3 +--
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/.github/scripts/run-sglang-performance-benchmarks.sh b/.github/scripts/run-sglang-performance-benchmarks.sh
index 130f384c..7900becf 100644
--- a/.github/scripts/run-sglang-performance-benchmarks.sh
+++ b/.github/scripts/run-sglang-performance-benchmarks.sh
@@ -143,25 +143,6 @@ kill_gpu_processes() {
   fi
 }
 
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
-  if command -v buildkite-agent >/dev/null 2>&1; then
-    BUILDKITE_AGENT_COMMAND="buildkite-agent"
-  elif [ -f /workspace/buildkite-agent ]; then
-    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
-  else
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-
-  # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "sglang-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
-  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
-}
-
 run_serving_tests() {
   # run serving tests using `sglang.bench_serving` command
   # $1: a json file specifying serving test cases
@@ -345,8 +326,6 @@ main() {
     else
     echo "No JSON result files were generated." >> "$RESULTS_FOLDER/benchmark_results.md"
     fi
-
-    upload_to_buildkite
 }
 
 main "$@"
diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index a333d24a..5b6ec574 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -202,14 +202,13 @@ jobs:
           find sglang-benchmarks/benchmarks/tests -type f -exec cat {} \; || echo "No test files to display"
 
       - name: Run SGLang benchmark
+        working-directory: sglang-benchmarks/benchmarks
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
           TORCHDYNAMO_DISABLE: "1"
         run: |
           set -eux
-
-          cd sglang-benchmarks/benchmarks &&
           bash ../../.github/scripts/run-sglang-performance-benchmarks.sh
 
       - name: Upload the benchmark results

From e2e6af0f4ddd96ac6391bbeaafb7908680db4586 Mon Sep 17 00:00:00 2001
From: Naman Lalit <nl2688@nyu.edu>
Date: Wed, 27 Aug 2025 11:39:57 -0700
Subject: [PATCH 57/57] add a todo for env variable

---
 .github/workflows/sglang-benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/sglang-benchmark.yml b/.github/workflows/sglang-benchmark.yml
index 5b6ec574..d3bcb7d2 100644
--- a/.github/workflows/sglang-benchmark.yml
+++ b/.github/workflows/sglang-benchmark.yml
@@ -206,7 +206,7 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
-          TORCHDYNAMO_DISABLE: "1"
+          TORCHDYNAMO_DISABLE: "1"  # TODO: Remove this variable in the future. For now, it is a workaround for CUDA errors that would otherwise break the SGLang server.
         run: |
           set -eux
           bash ../../.github/scripts/run-sglang-performance-benchmarks.sh