Update on "[Inductor][Quant] Change the QConv output scale name"
**Summary**
Change the name of the QConv output scale argument from `inv_output_scale` to `output_scale`, now that the quant/dequant optimization has moved from the decomposition phase to the lowering phase.
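
To illustrate what the rename means for callers, here is a minimal, hypothetical shim; only the two keyword names come from this PR, and the assumption that the old keyword carried the reciprocal of the scale is inferred from its name rather than confirmed by the diff.

```python
# Hypothetical compatibility shim for the QConv keyword rename. Only the names
# `inv_output_scale` and `output_scale` are from this PR; the reciprocal
# relationship below is an assumption based on the old name.
def migrate_qconv_kwargs(kwargs: dict) -> dict:
    """Translate old-style QConv keyword arguments to the new naming."""
    out = dict(kwargs)
    if "inv_output_scale" in out and "output_scale" not in out:
        out["output_scale"] = 1.0 / out.pop("inv_output_scale")
    return out

if __name__ == "__main__":
    old = {"inv_output_scale": 20.0, "output_zero_point": 3}
    print(migrate_qconv_kwargs(old))  # {'output_zero_point': 3, 'output_scale': 0.05}
```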


cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
leslie-fang-intel committed May 9, 2024
2 parents 530c2c4 + 468e423 commit b291167
Showing 268 changed files with 9,997 additions and 4,249 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/build.sh
@@ -366,7 +366,7 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi

# Build image
DOCKER_BUILDKIT=1 docker build \
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
4 changes: 2 additions & 2 deletions .ci/docker/requirements-ci.txt
@@ -279,9 +279,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:

jinja2==3.1.3
jinja2==3.1.4
#Description: jinja2 template engine
#Pinned versions: 3.1.3
#Pinned versions: 3.1.4
#test that import:

pytest-cpp==2.3.0
41 changes: 24 additions & 17 deletions .ci/pytorch/test.sh
@@ -310,23 +310,23 @@ test_dynamo_shard() {
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
pytest test/distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration
python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

# This runs on both single-gpu and multi-gpu instances. It should be smart about skipping
# tests that aren't supported when the required number of GPUs isn't available
@@ -522,6 +522,11 @@ test_single_dynamo_benchmark() {
fi
}

test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports
python benchmarks/gpt_fast/benchmark.py
}

test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@@ -1209,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
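
For local reproduction of the new `test_inductor_micro_benchmark` hook above, a rough stand-in could look like the sketch below; it assumes a CUDA-enabled checkout and that `benchmarks/gpt_fast/benchmark.py` needs no required arguments, as in the hunk.

```python
# Rough local stand-in for test_inductor_micro_benchmark in .ci/pytorch/test.sh.
# The two paths come from the hunk above; the working directory (repo root) and
# CUDA availability are assumed.
import pathlib
import subprocess
import sys

pathlib.Path("test/test-micro-reports").mkdir(parents=True, exist_ok=True)  # TEST_REPORTS_DIR
subprocess.run([sys.executable, "benchmarks/gpt_fast/benchmark.py"], check=True)
```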
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
06ad737628abc3a1e617571dc03cbdd5b36ea96a
d23a6e1664d20707c11781299611436e1f0c104f
2 changes: 2 additions & 0 deletions .github/merge_rules.yaml
@@ -29,10 +29,12 @@
approved_by:
- BowenBao
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
mandatory_checks_name:
- EasyCLA
- Lint
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -8,6 +8,7 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
2 changes: 1 addition & 1 deletion .github/requirements-gha-cache.txt
@@ -5,7 +5,7 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.19.12
jinja2==3.1.3
jinja2==3.1.4
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
6 changes: 5 additions & 1 deletion .github/scripts/amd/patch_triton_wheel.sh
@@ -1,7 +1,11 @@
#!/bin/bash
set -x

WHEELHOUSE_DIR=/artifacts
if [ -z "$1" ]; then
echo "Need wheel location argument" && exit 1
fi

WHEELHOUSE_DIR=$1
PATCHELF_BIN=patchelf
ROCM_LIB=backends/amd/lib
ROCM_LD=backends/amd/llvm/bin
4 changes: 2 additions & 2 deletions .github/scripts/build_triton_wheel.py
@@ -157,10 +157,10 @@ def build_triton(

if build_rocm:
check_call(
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh"],
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
cwd=triton_basedir,
shell=True,
)

return Path.cwd() / whl_path.name


14 changes: 13 additions & 1 deletion .github/scripts/generate_docker_release_matrix.py
@@ -21,6 +21,8 @@

def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
ret: List[Dict[str, str]] = []
# CUDA amd64 Docker images are available as both runtime and devel, while the
# CPU arm64 image is only available as runtime.
for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items():
for image in DOCKER_IMAGE_TYPES:
ret.append(
@@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
cuda
],
"image_type": image,
"platform": "linux/arm64,linux/amd64",
"platform": "linux/amd64",
}
)
ret.append(
{
"cuda": "cpu",
"cuda_full_version": "",
"cudnn_version": "",
"image_type": "runtime",
"platform": "linux/arm64",
}
)

return {"include": ret}


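To make the resulting matrix shape easier to see, here is a self-contained sketch of the logic after this hunk; the CUDA, cuDNN, and version values are illustrative stand-ins for the real tables in `generate_binary_build_matrix`.

```python
# Sketch of generate_docker_matrix() after this change. The data tables are
# illustrative; only the structure (amd64 runtime+devel per CUDA version, plus a
# single arm64 CPU runtime entry) mirrors the hunk above.
from typing import Dict, List

CUDA_ARCHES_FULL_VERSION = {"12.1": "12.1.1"}  # assumed subset
CUDNN_VERSION = {"12.1": "8"}                  # assumed subset
DOCKER_IMAGE_TYPES = ["runtime", "devel"]

def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
    ret: List[Dict[str, str]] = []
    # CUDA amd64 images come in both runtime and devel flavors...
    for cuda, version in CUDA_ARCHES_FULL_VERSION.items():
        for image in DOCKER_IMAGE_TYPES:
            ret.append(
                {
                    "cuda": cuda,
                    "cuda_full_version": version,
                    "cudnn_version": CUDNN_VERSION[cuda],
                    "image_type": image,
                    "platform": "linux/amd64",
                }
            )
    # ...while the CPU arm64 image is published as runtime only.
    ret.append(
        {
            "cuda": "cpu",
            "cuda_full_version": "",
            "cudnn_version": "",
            "image_type": "runtime",
            "platform": "linux/arm64",
        }
    )
    return {"include": ret}

if __name__ == "__main__":
    print(generate_docker_matrix())
```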
21 changes: 16 additions & 5 deletions .github/workflows/docker-release.yml
@@ -7,6 +7,7 @@ on:
- Dockerfile
- docker.Makefile
- .github/workflows/docker-release.yml
- .github/scripts/generate_docker_release_matrix.py
push:
branches:
- nightly
@@ -129,17 +130,27 @@ jobs:
if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
run: |
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime"
CUDA_SUFFIX="-cu${CUDA_VERSION}"
if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then
PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
CUDA_SUFFIX=""
fi
PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
python -c 'import torch; print(torch.version.git_version[:7],end="")')
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
# Please note, here we need to pin a specific version of CUDA for the latest label
if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \
ghcr.io/pytorch/pytorch-nightly:latest
docker push ghcr.io/pytorch/pytorch-nightly:latest
fi
docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \
ghcr.io/pytorch/pytorch-nightly:latest
docker push ghcr.io/pytorch/pytorch-nightly:latest
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
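
The tagging branches above are easier to follow outside of shell; below is a hypothetical restatement in Python, where the parameters mirror the workflow's environment variables and everything else is illustrative.

```python
# Hypothetical restatement of the nightly tagging logic in docker-release.yml.
# Parameters mirror PYTORCH_VERSION, CUDA_VERSION_SHORT, CUDA_VERSION,
# CUDNN_VERSION, and the nightly commit; the function itself is not from the diff.
from typing import List, Tuple

def nightly_tags(
    pytorch_version: str,
    cuda_version_short: str,
    cuda_version: str,
    cudnn_version: str,
    commit: str,
) -> Tuple[str, List[str]]:
    if cuda_version_short == "cpu":
        docker_tag = f"{pytorch_version}-runtime"
        cuda_suffix = ""  # CPU images drop the -cu... suffix entirely
    else:
        docker_tag = f"{pytorch_version}-cuda{cuda_version_short}-cudnn{cudnn_version}-runtime"
        cuda_suffix = f"-cu{cuda_version}"
    pushed = [f"ghcr.io/pytorch/pytorch-nightly:{commit[:7]}{cuda_suffix}"]
    if cuda_version_short == "12.1":
        # only the pinned CUDA build is also published as :latest
        pushed.append("ghcr.io/pytorch/pytorch-nightly:latest")
    return docker_tag, pushed

if __name__ == "__main__":
    print(nightly_tags("2.4.0.dev20240509", "12.1", "12.1.1", "8", "b291167deadbeef"))
    print(nightly_tags("2.4.0.dev20240509", "cpu", "", "", "b291167deadbeef"))
```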
40 changes: 40 additions & 0 deletions .github/workflows/inductor-micro-benchmark.yml
@@ -0,0 +1,40 @@
name: inductor-micro-benchmark

on:
schedule:
- cron: 0 7 * * *
push:
tags:
- ciflow/inductor-micro-benchmark/*
workflow_dispatch:


concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all

jobs:
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-test:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
timeout-minutes: 720
6 changes: 0 additions & 6 deletions .lintrunner.toml
@@ -1052,12 +1052,6 @@ exclude_patterns = [
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_datapipe.py',
'test/test_decomp.py',
'test/test_deploy.py',
'test/test_determination.py',
'test/test_dlpack.py',
'test/test_dynamic_shapes.py',
'test/test_expanded_weights.py',
'test/test_fake_tensor.py',
'test/test_flop_counter.py',
'test/test_function_schema.py',
14 changes: 6 additions & 8 deletions Dockerfile
@@ -1,12 +1,10 @@
# syntax = docker/dockerfile:experimental
#
# NOTE: To build this you will need a docker version > 18.06 with
# experimental enabled and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
# syntax=docker/dockerfile:1

# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel

ARG BASE_IMAGE=ubuntu:22.04
ARG PYTHON_VERSION=3.11

1 change: 1 addition & 0 deletions aten/src/ATen/core/interned_strings.h
@@ -227,6 +227,7 @@ namespace c10 {
_(aten, is_autocast_enabled) \
_(aten, is_autocast_cpu_enabled) \
_(aten, is_autocast_xla_enabled) \
_(aten, get_autocast_dtype) \
FORALL_ATEN_BASE_SYMBOLS(_) \
_(onnx, Add) \
_(onnx, Concat) \
8 changes: 4 additions & 4 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -236,7 +236,7 @@ namespace at::cuda::blas {
CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches); \
} while (0)

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
// only for rocm 5.7 where we first supported hipblaslt, it was difficult
@@ -375,7 +375,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<

template <typename Dtype>
inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
cudaDataType_t abcType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
@@ -1235,7 +1235,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

template <typename Dtype>
void gemm_and_bias(
@@ -1745,7 +1745,7 @@ void int8_gemm(
TORCH_CHECK(false, "int8_gemm is only supported for ROCm 6.0 and above");
#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
}
#endif // (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)

// ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not.
#if defined(USE_ROCM) && ROCM_VERSION < 50600
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDABlas.h
@@ -82,7 +82,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
enum GEMMAndBiasActivationEpilogue {
None,
RELU,
4 changes: 2 additions & 2 deletions aten/src/ATen/cuda/CUDAContextLight.h
@@ -9,7 +9,7 @@

// cublasLt was introduced in CUDA 10.1, but we enable it only for 11.1, which also
// added bf16 support
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#include <cublasLt.h>
#endif

@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
/* Handles */
TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
#endif

2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -191,7 +191,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
return handle;
}

#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
cublasLtHandle_t getCurrentCUDABlasLtHandle() {
#ifdef USE_ROCM
c10::DeviceIndex device = 0;
