From 6756120fbdaaa4f093e5ff92b553d48625a32a2d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 1 Oct 2025 15:11:29 -0700 Subject: [PATCH 1/4] Move upload benchmark results to a separate workflow Signed-off-by: Huy Do --- .github/workflows/benchmark.yml | 84 +++++++++++-------- .github/workflows/benchmark_dispatch.yml | 37 ++++++-- .../workflows/compute-benchmark-matrix.yml | 24 ++++++ 3 files changed, 107 insertions(+), 38 deletions(-) create mode 100644 .github/workflows/compute-benchmark-matrix.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 87cd26f02..ec4ca331d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -21,30 +21,18 @@ on: alias: required: true type: string - max-runners: + num-shards: + required: true + type: number + description: "Number of shards benchmark is running on" + shard: required: true type: number description: "Maximum parallel runners to determine shards" jobs: - compute-matrix: - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.gen.outputs.matrix }} - steps: - - id: gen - run: | - n="${{ inputs.max-runners }}" - shards=$(seq 0 $((n-1)) | paste -sd, -) - echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> $GITHUB_OUTPUT - benchmark: - name: benchmark-${{ inputs.runtime-version }}-py${{ inputs.python-version }}-${{ inputs.alias }} - needs: compute-matrix - - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.compute-matrix.outputs.matrix) }} + name: benchmark-${{ inputs.runtime-version }}-shard${{ inputs.shard }}-py${{ inputs.python-version }}-${{ inputs.alias }} container: image: ${{ inputs.image }} @@ -59,6 +47,11 @@ jobs: run: shell: bash -l {0} + outputs: + benchmark-metadata: ${{ steps.gather-benchmark-metadata.outputs.benchmark-metadata }} + runners-info: ${{ steps.gather-runners-info.outputs.runners-info }} + dependencies: ${{ steps.gather-dependencies.outputs.dependencies }} + steps: - name: Check out code uses: actions/checkout@v4 
@@ -129,8 +122,8 @@ jobs: source .venv/bin/activate KERNELS=("softmax" "geglu" "swiglu" "jsd" "welford" "kl_div" "int4_gemm" "layer_norm" "layer_norm-bwd" "rms_norm" "rms_norm-bwd" "cross_entropy") - NUMSHARDS=${{ matrix.num_shards }} - SHARD=${{ matrix.shard }} + NUMSHARDS=${{ inputs.num-shards }} + SHARD=${{ inputs.shard }} SHARD_KERNELS=() for ((i=0; i<${#KERNELS[@]}; i++)); do @@ -203,19 +196,44 @@ jobs: fi cat "$TEST_REPORTS_DIR/helionbench.json" - - name: Authenticate with AWS - uses: aws-actions/configure-aws-credentials@v4 + - name: Gather benchmark metadata + id: gather-benchmark-metadata + # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands + uses: pytorch/test-infra/.github/actions/gather-benchmark-metadata@upload-benchmark-results-v2 with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results - # The max duration enforced by the server side - role-duration-seconds: 18000 - aws-region: us-east-1 + github-token: ${{ secrets.GITHUB_TOKEN }} + venv: .venv/bin/activate - - name: Upload the benchmark results to OSS benchmark database for the dashboard - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + - name: Gather runners info + id: gather-runners-info + # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands + uses: pytorch/test-infra/.github/actions/gather-runners-info@upload-benchmark-results-v2 with: - benchmark-results-dir: test/test-reports - dry-run: false - schema-version: v3 - github-token: ${{ secrets.GITHUB_TOKEN }} - venv: ".venv/bin/activate" + venv: .venv/bin/activate + + - name: Gather dependencies + id: gather-dependencies + # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands + uses: pytorch/test-infra/.github/actions/gather-dependencies@upload-benchmark-results-v2 + with: + venv: .venv/bin/activate + + - name: Upload the benchmark results to GitHub + uses: actions/upload-artifact@v4 + with: + name: 
benchmark-results-${{ inputs.alias }}-${{ inputs.shard }} + path: test/test-reports + + upload-benchmark-results: + needs: benchmark + uses: + permissions: + id-token: write + contents: read + with: + benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }} + benchmark-metadata: ${{ needs.benchmark.outputs.benchmark-metadata }} + runners-info: ${{ needs.benchmark.outputs.runners-info }} + dependencies: ${{ needs.benchmark.outputs.dependencies }} + schema-version: v3 + dry-run: false diff --git a/.github/workflows/benchmark_dispatch.yml b/.github/workflows/benchmark_dispatch.yml index 7f5be9d16..266ff65a9 100644 --- a/.github/workflows/benchmark_dispatch.yml +++ b/.github/workflows/benchmark_dispatch.yml @@ -22,9 +22,17 @@ on: - cron: '0 8 * * *' # Runs at midnight PST (8 AM UTC) jobs: - run-h100: + gen-matrix-h100: if: ${{ github.event.inputs.run_h100 == 'true' || github.event_name == 'schedule' }} + uses: ./.github/workflows/compute-benchmark-matrix.yml + with: + max-runners: 12 + + run-h100: + needs: gen-matrix-h100 uses: ./.github/workflows/benchmark.yml + strategy: + matrix: ${{ fromJSON(needs.gen-matrix-h100.outputs.matrix) }} permissions: id-token: write contents: read @@ -35,11 +43,20 @@ jobs: runtime-version: cu129 container-options: --gpus all alias: h100 + num-shards: ${{ matrix.num_shards }} + shard: ${{ matrix.shard }} + + gen-matrix-b200: + uses: ./.github/workflows/compute-benchmark-matrix.yml + if: ${{ github.event.inputs.run_h100 == 'true' || github.event_name == 'schedule' }} + with: max-runners: 12 run-b200: - if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }} + needs: gen-matrix-b200 uses: ./.github/workflows/benchmark.yml + strategy: + matrix: ${{ fromJSON(needs.gen-matrix-b200.outputs.matrix) }} permissions: id-token: write contents: read @@ -50,11 +67,20 @@ jobs: runtime-version: cu129 container-options: --gpus all alias: b200 - max-runners: 12 + num-shards: ${{ matrix.num_shards }} + 
shard: ${{ matrix.shard }} - run-mi325x: + gen-matrix-mi325x: + uses: ./.github/workflows/compute-benchmark-matrix.yml if: ${{ github.event.inputs.run_mi325x == 'true' || github.event_name == 'schedule' }} + with: + max-runners: 6 + + run-mi325x: + needs: gen-matrix-mi325x uses: ./.github/workflows/benchmark.yml + strategy: + matrix: ${{ fromJSON(needs.gen-matrix-mi325x.outputs.matrix) }} permissions: id-token: write contents: read @@ -65,4 +91,5 @@ jobs: runtime-version: rocm7.0 container-options: --device=/dev/kfd --device=/dev/dri alias: mi325x - max-runners: 6 + num-shards: ${{ matrix.num_shards }} + shard: ${{ matrix.shard }} diff --git a/.github/workflows/compute-benchmark-matrix.yml b/.github/workflows/compute-benchmark-matrix.yml new file mode 100644 index 000000000..160210814 --- /dev/null +++ b/.github/workflows/compute-benchmark-matrix.yml @@ -0,0 +1,24 @@ +name: Compute Benchmark Matrix + +on: + workflow_call: + inputs: + max-runners: + required: true + type: string + outputs: + matrix: + description: "The generated matrix for sharding" + value: ${{ jobs.gen.outputs.matrix }} + +jobs: + gen: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.gen.outputs.matrix }} + steps: + - id: gen + run: | + n="${{ inputs.max-runners }}" + shards=$(seq 0 $((n-1)) | paste -sd, -) + echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> $GITHUB_OUTPUT From ced4c3a376d45bcf2b4ceb28ae860804ac909a63 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 1 Oct 2025 15:15:32 -0700 Subject: [PATCH 2/4] Use the right workflow Signed-off-by: Huy Do --- .github/workflows/benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ec4ca331d..5f836cd7e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -226,7 +226,8 @@ jobs: upload-benchmark-results: needs: benchmark - uses: + # TODO: Switch to main once 
https://github.com/pytorch/test-infra/pull/7269 lands + uses: pytorch/test-infra/.github/workflows/upload_benchmark_results.yml@upload-benchmark-results-v2 permissions: id-token: write contents: read From 9f17465f6f26ed48d5d8116c6719a59ef95a7955 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 1 Oct 2025 15:22:02 -0700 Subject: [PATCH 3/4] Typo Signed-off-by: Huy Do --- .github/workflows/benchmark_dispatch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark_dispatch.yml b/.github/workflows/benchmark_dispatch.yml index 266ff65a9..fa351adc2 100644 --- a/.github/workflows/benchmark_dispatch.yml +++ b/.github/workflows/benchmark_dispatch.yml @@ -48,7 +48,7 @@ jobs: gen-matrix-b200: uses: ./.github/workflows/compute-benchmark-matrix.yml - if: ${{ github.event.inputs.run_h100 == 'true' || github.event_name == 'schedule' }} + if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }} with: max-runners: 12 From 9c56f3c59b4ff01ef340040e591fb5a497e7d4b9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Wed, 1 Oct 2025 17:18:16 -0700 Subject: [PATCH 4/4] Ready to land Signed-off-by: Huy Do --- .github/workflows/benchmark.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 5f836cd7e..1beb3e10e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -198,23 +198,20 @@ jobs: - name: Gather benchmark metadata id: gather-benchmark-metadata - # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands - uses: pytorch/test-infra/.github/actions/gather-benchmark-metadata@upload-benchmark-results-v2 + uses: pytorch/test-infra/.github/actions/gather-benchmark-metadata@main with: github-token: ${{ secrets.GITHUB_TOKEN }} venv: .venv/bin/activate - name: Gather runners info id: gather-runners-info - # TODO: Switch to main once 
https://github.com/pytorch/test-infra/pull/7269 lands - uses: pytorch/test-infra/.github/actions/gather-runners-info@upload-benchmark-results-v2 + uses: pytorch/test-infra/.github/actions/gather-runners-info@main with: venv: .venv/bin/activate - name: Gather dependencies id: gather-dependencies - # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands - uses: pytorch/test-infra/.github/actions/gather-dependencies@upload-benchmark-results-v2 + uses: pytorch/test-infra/.github/actions/gather-dependencies@main with: venv: .venv/bin/activate @@ -226,8 +223,7 @@ jobs: upload-benchmark-results: needs: benchmark - # TODO: Switch to main once https://github.com/pytorch/test-infra/pull/7269 lands - uses: pytorch/test-infra/.github/workflows/upload_benchmark_results.yml@upload-benchmark-results-v2 + uses: pytorch/test-infra/.github/workflows/upload_benchmark_results.yml@main permissions: id-token: write contents: read