Skip to content

Commit

Permalink
[CI] Collect inductor max-autotune performance every Sunday
Browse files Browse the repository at this point in the history
ghstack-source-id: 2833907826b7d803988531ccbcab6c99563196a3
Pull Request resolved: #99387
  • Loading branch information
desertfire committed Apr 18, 2023
1 parent 7ff1f3f commit de6efcb
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 17 deletions.
37 changes: 21 additions & 16 deletions .ci/pytorch/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi

# Turn on Inductor's max-autotune mode for test configs whose name
# contains "max_autotune" (e.g. inductor_huggingface_perf_max_autotune).
case "${TEST_CONFIG}" in
  *max_autotune*) export TORCHINDUCTOR_MAX_AUTOTUNE=1 ;;
esac

test_perf_for_dashboard() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
Expand All @@ -292,30 +296,31 @@ test_perf_for_dashboard() {
# Run accuracy test for inductor with different configs
# --disable-cudagraphs is the default inductor behavior
# TODO: update here once cudagraphs is turned on as default
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
if [[ "${TEST_CONFIG}" != *max_autotune* ]]; then
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
# Only test this one config for max-autotune
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$mode" --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_accuracy.csv"
python "benchmarks/dynamo/$suite.py" \
--accuracy --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes --dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_accuracy.csv"

# Run performance test
# Skip dynamo-eager and aot-eager for performance test
# Run performance test for inductor with different configs
# TODO: add more configs here, e.g. max-autotune, etc.
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
if [[ "${TEST_CONFIG}" != *max_autotune* ]]; then
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
--dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_performance.csv"
# Only test this one config for max-autotune
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_performance.csv"
python "benchmarks/dynamo/$suite.py" \
--performance --cold-start-latency --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
--dynamic-batch-only --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_performance.csv"
done
}

Expand Down
45 changes: 45 additions & 0 deletions .github/workflows/inductor-perf-max-autotune-weekly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Weekly collection of inductor benchmark numbers with max-autotune enabled.
# The *_max_autotune test configs cause .ci/pytorch/test.sh to export
# TORCHINDUCTOR_MAX_AUTOTUNE=1 before running the dashboard benchmarks.
name: inductor-A100-max-autotune-weekly

on:
  schedule:
    # Every Sunday at 00:00 UTC; the nightly perf workflow covers Mon-Sat.
    - cron: 0 0 * * 0
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  linux-bionic-cuda11_8-py3_10-gcc7-inductor-build:
    name: cuda11.8-py3.10-gcc7-sm80
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
      cuda-arch-list: '8.0'
      # Shard counts mirror the nightly inductor perf job; all shards run on
      # A100 runners since the benchmarks target sm80.
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_max_autotune", shard: 1, num_shards: 3, runner: "linux.gcp.a100.large" },
          { config: "inductor_huggingface_perf_max_autotune", shard: 2, num_shards: 3, runner: "linux.gcp.a100.large" },
          { config: "inductor_huggingface_perf_max_autotune", shard: 3, num_shards: 3, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 1, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 2, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 3, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 4, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 5, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_timm_perf_max_autotune", shard: 6, num_shards: 6, runner: "linux.gcp.a100.large" },
          { config: "inductor_torchbench_perf_max_autotune", shard: 1, num_shards: 3, runner: "linux.gcp.a100.large" },
          { config: "inductor_torchbench_perf_max_autotune", shard: 2, num_shards: 3, runner: "linux.gcp.a100.large" },
          { config: "inductor_torchbench_perf_max_autotune", shard: 3, num_shards: 3, runner: "linux.gcp.a100.large" },
        ]}

  linux-bionic-cuda11_8-py3_10-gcc7-inductor-test:
    name: cuda11.8-py3.10-gcc7-sm80
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-bionic-cuda11_8-py3_10-gcc7-inductor-build
    with:
      build-environment: linux-bionic-cuda11.8-py3.10-gcc7-sm80
      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-inductor-build.outputs.test-matrix }}
      use-gha: anything-non-empty-to-use-gha
      # Max-autotune compilation is slow; allow up to 12 hours.
      timeout-minutes: 720
2 changes: 1 addition & 1 deletion .github/workflows/inductor-perf-test-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: inductor-A100-perf-nightly

on:
schedule:
- cron: 45 1,13 * * *
- cron: 45 1,13 * * 1-6
workflow_dispatch:

concurrency:
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/dynamo/torchbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def setup_torchbench_cwd():
"fambench_xlmr",
# https://github.com/pytorch/pytorch/issues/99201
"opacus_cifar10",
# TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
"tacotron2",
}

SKIP_FOR_CUDA = {
Expand Down

0 comments on commit de6efcb

Please sign in to comment.