pytorch · huydhn · Oct 2, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -21,30 +21,18 @@ on:
       alias:
         required: true
         type: string
-      max-runners:
+      num-shards:
+        required: true
+        type: number
+        description: "Number of shards benchmark is running on"
+      shard:
         required: true
         type: number
-        description: "Maximum parallel runners to determine shards"
+        description: "Zero-based index of the shard this run executes (0 to num-shards - 1)"
 
 jobs:
-  compute-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.gen.outputs.matrix }}
-    steps:
-      - id: gen
-        run: |
-          n="${{ inputs.max-runners }}"
-          shards=$(seq 0 $((n-1)) | paste -sd, -)
-          echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> $GITHUB_OUTPUT
-
   benchmark:
-    name: benchmark-${{ inputs.runtime-version }}-py${{ inputs.python-version }}-${{ inputs.alias }}
-    needs: compute-matrix
-
-    strategy:
-      fail-fast: false
-      matrix: ${{ fromJSON(needs.compute-matrix.outputs.matrix) }}
+    name: benchmark-${{ inputs.runtime-version }}-shard${{ inputs.shard }}-py${{ inputs.python-version }}-${{ inputs.alias }}
 
     container:
       image: ${{ inputs.image }}
@@ -59,6 +47,11 @@ jobs:
       run:
         shell: bash -l {0}
 
+    outputs:
+      benchmark-metadata: ${{ steps.gather-benchmark-metadata.outputs.benchmark-metadata }}
+      runners-info: ${{ steps.gather-runners-info.outputs.runners-info }}
+      dependencies: ${{ steps.gather-dependencies.outputs.dependencies }}
+
     steps:
       - name: Check out code
         uses: actions/checkout@v4
@@ -129,8 +122,8 @@ jobs:
           source .venv/bin/activate
 
           KERNELS=("softmax" "geglu" "swiglu" "jsd" "welford" "kl_div" "int4_gemm" "layer_norm" "layer_norm-bwd" "rms_norm" "rms_norm-bwd" "cross_entropy")
-          NUMSHARDS=${{ matrix.num_shards }}
-          SHARD=${{ matrix.shard }}
+          NUMSHARDS=${{ inputs.num-shards }}
+          SHARD=${{ inputs.shard }}
 
           SHARD_KERNELS=()
           for ((i=0; i<${#KERNELS[@]}; i++)); do
@@ -203,19 +196,41 @@ jobs:
           fi
           cat "$TEST_REPORTS_DIR/helionbench.json"
 
-      - name: Authenticate with AWS
-        uses: aws-actions/configure-aws-credentials@v4
+      - name: Gather benchmark metadata
+        id: gather-benchmark-metadata
+        uses: pytorch/test-infra/.github/actions/gather-benchmark-metadata@main
         with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
-          # The max duration enforced by the server side
-          role-duration-seconds: 18000
-          aws-region: us-east-1
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          venv: .venv/bin/activate
 
-      - name: Upload the benchmark results to OSS benchmark database for the dashboard
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+      - name: Gather runners info
+        id: gather-runners-info
+        uses: pytorch/test-infra/.github/actions/gather-runners-info@main
         with:
-          benchmark-results-dir: test/test-reports
-          dry-run: false
-          schema-version: v3
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          venv: ".venv/bin/activate"
+          venv: .venv/bin/activate
+
+      - name: Gather dependencies
+        id: gather-dependencies
+        uses: pytorch/test-infra/.github/actions/gather-dependencies@main
+        with:
+          venv: .venv/bin/activate
+
+      - name: Upload the benchmark results to GitHub
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+          path: test/test-reports
+
+  upload-benchmark-results:
+    needs: benchmark
+    uses: pytorch/test-infra/.github/workflows/upload_benchmark_results.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+      benchmark-metadata: ${{ needs.benchmark.outputs.benchmark-metadata }}
+      runners-info: ${{ needs.benchmark.outputs.runners-info }}
+      dependencies: ${{ needs.benchmark.outputs.dependencies }}
+      schema-version: v3
+      dry-run: false
diff --git a/.github/workflows/benchmark_dispatch.yml b/.github/workflows/benchmark_dispatch.yml
@@ -22,9 +22,19 @@ on:
     - cron: '0 8 * * *'  # Runs at midnight PST (8 AM UTC)
 
 jobs:
-  run-h100:
+  gen-matrix-h100:
     if: ${{ github.event.inputs.run_h100 == 'true' || github.event_name == 'schedule' }}
+    uses: ./.github/workflows/compute-benchmark-matrix.yml
+    with:
+      max-runners: 12
+
+  run-h100:
+    needs: gen-matrix-h100
     uses: ./.github/workflows/benchmark.yml
+    strategy:
+      # A failing shard must not cancel the others; the default is fail-fast: true.
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.gen-matrix-h100.outputs.matrix) }}
     permissions:
       id-token: write
       contents: read
@@ -35,11 +45,22 @@ jobs:
       runtime-version: cu129
       container-options: --gpus all
       alias: h100
+      num-shards: ${{ matrix.num_shards }}
+      shard: ${{ matrix.shard }}
+
+  gen-matrix-b200:
+    uses: ./.github/workflows/compute-benchmark-matrix.yml
+    if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }}
+    with:
       max-runners: 12
 
   run-b200:
-    if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }}
+    needs: gen-matrix-b200
     uses: ./.github/workflows/benchmark.yml
+    strategy:
+      # A failing shard must not cancel the others; the default is fail-fast: true.
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.gen-matrix-b200.outputs.matrix) }}
     permissions:
       id-token: write
       contents: read
@@ -50,11 +71,22 @@ jobs:
       runtime-version: cu129
       container-options: --gpus all
       alias: b200
-      max-runners: 12
+      num-shards: ${{ matrix.num_shards }}
+      shard: ${{ matrix.shard }}
 
-  run-mi325x:
+  gen-matrix-mi325x:
+    uses: ./.github/workflows/compute-benchmark-matrix.yml
     if: ${{ github.event.inputs.run_mi325x == 'true' || github.event_name == 'schedule' }}
+    with:
+      max-runners: 6
+
+  run-mi325x:
+    needs: gen-matrix-mi325x
     uses: ./.github/workflows/benchmark.yml
+    strategy:
+      # A failing shard must not cancel the others; the default is fail-fast: true.
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.gen-matrix-mi325x.outputs.matrix) }}
     permissions:
       id-token: write
       contents: read
@@ -65,4 +97,5 @@ jobs:
       runtime-version: rocm7.0
       container-options: --device=/dev/kfd --device=/dev/dri
       alias: mi325x
-      max-runners: 6
+      num-shards: ${{ matrix.num_shards }}
+      shard: ${{ matrix.shard }}
diff --git a/.github/workflows/compute-benchmark-matrix.yml b/.github/workflows/compute-benchmark-matrix.yml
@@ -0,0 +1,35 @@
+name: Compute Benchmark Matrix
+
+on:
+  workflow_call:
+    inputs:
+      max-runners:
+        required: true
+        type: string
+        description: "Number of shards to generate; one matrix entry per shard index 0..n-1"
+    outputs:
+      matrix:
+        description: "The generated matrix for sharding"
+        value: ${{ jobs.gen.outputs.matrix }}
+
+jobs:
+  gen:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.gen.outputs.matrix }}
+    steps:
+      - id: gen
+        env:
+          # Pass the input through the environment instead of expanding it
+          # inside the script, so its value is never parsed as shell code.
+          MAX_RUNNERS: ${{ inputs.max-runners }}
+        run: |
+          n="${MAX_RUNNERS}"
+          # Zero or a non-number would yield an empty or malformed matrix; fail early.
+          if ! [[ "$n" =~ ^[1-9][0-9]*$ ]]; then
+            echo "::error::max-runners must be a positive integer, got '${n}'" >&2
+            exit 1
+          fi
+          # Emit shard indices 0..n-1 plus the total shard count as a JSON matrix.
+          shards=$(seq 0 $((n-1)) | paste -sd, -)
+          echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> "$GITHUB_OUTPUT"