Update on "[Inductor] [Quant] Enable lowering of quant per tensor and…

… refactor quant pattern" **Summary** Per the discussion in #123444, the `decomposed quant/dequant` patterns changed after #123445. To avoid being affected by that change, we can move the optimization of `decomposed quant/dequant` from the inductor decomposition phase into the lowering phase. In this way, we can: - Avoid the pattern matcher failure introduced in #123445. - Make the quantization pattern clearer in the pattern matcher phase, since the `quant/dequant` nodes have not been decomposed yet. **Changes in this PR** - Move the optimization of `decomposed quant/dequant` from the inductor decomposition phase into the lowering phase. - Make the corresponding changes in the quantization pattern matcher so that backward compatibility is not broken. **Test Plan** ``` python -u -m pytest -s -v test/inductor/test_mkldnn_pattern_matcher.py -k test_q ``` cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 voznesenskym penguinwu EikanWang Guobing-Chen zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang [ghstack-poisoned]
pytorch · May 6, 2024 · 8b0ea3a · 8b0ea3a
2 parents 88a1243 + 5b08a4f
commit 8b0ea3a
Show file tree

Hide file tree

Showing 803 changed files with 23,576 additions and 55,756 deletions.
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -204,7 +204,7 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    ROCM_VERSION=5.7
+    ROCM_VERSION=6.0
     NINJA_VERSION=1.9.0
     CONDA_CMAKE=yes
     TRITON=yes
@@ -215,7 +215,7 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    ROCM_VERSION=6.0
+    ROCM_VERSION=6.1
     NINJA_VERSION=1.9.0
     CONDA_CMAKE=yes
     TRITON=yes
@@ -306,6 +306,12 @@ case "$image" in
     DB=yes
     VISION=yes
     CONDA_CMAKE=yes
+    # snadampal: skipping sccache due to the following issue
+    # https://github.com/pytorch/pytorch/issues/121559
+    SKIP_SCCACHE_INSTALL=yes
+    # snadampal: skipping llvm src build install because the current version
+    # from pytorch/llvm:9.0.1 is x86 specific
+    SKIP_LLVM_SRC_BUILD_INSTALL=yes
     ;;
   *)
     # Catch-all for builds that are not hardcoded.
@@ -399,6 +405,8 @@ DOCKER_BUILDKIT=1 docker build \
        --build-arg "EXECUTORCH=${EXECUTORCH}" \
        --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
        --build-arg "ACL=${ACL:-}" \
+       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
+       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
        -f $(dirname ${DOCKERFILE})/Dockerfile \
        -t "$tmp_tag" \
        "$@" \

diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
@@ -113,7 +113,6 @@ install_centos() {
     glibc-devel \
     glibc-headers \
     glog-devel \
-    hiredis-devel \
     libstdc++-devel \
     libsndfile-devel \
     make \

diff --git a/.ci/docker/common/install_db.sh b/.ci/docker/common/install_db.sh
@@ -4,11 +4,6 @@ set -ex
 
 install_ubuntu() {
   apt-get update
-  apt-get install -y --no-install-recommends \
-          libhiredis-dev \
-          libleveldb-dev \
-          liblmdb-dev \
-          libsnappy-dev
 
   # Cleanup
   apt-get autoclean && apt-get clean
@@ -20,12 +15,6 @@ install_centos() {
   # See http://fedoraproject.org/wiki/EPEL
   yum --enablerepo=extras install -y epel-release
 
-  yum install -y \
-      hiredis-devel \
-      leveldb-devel \
-      lmdb-devel \
-      snappy-devel
-
   # Cleanup
   yum clean all
   rm -rf /var/cache/yum

diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh
@@ -61,6 +61,10 @@ install_ubuntu() {
                    rocprofiler-dev \
                    roctracer-dev
 
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
+    fi
+
     # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
     # search for all unversioned packages
     # if search fails it will abort this script; use true to avoid case where search fails

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
@@ -263,10 +263,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #Pinned versions:
 #test that import:
 
-#wheel not found on aarch64, and source build requires rust
-lintrunner==0.10.7 ; platform_machine == "x86_64"
+#lintrunner is supported on aarch64-linux only from 0.12.4 version
+lintrunner==0.12.5
 #Description: all about linters!
-#Pinned versions: 0.10.7
+#Pinned versions: 0.12.5
 #test that import:
 
 rockset==1.0.3

diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -169,9 +169,11 @@ RUN rm install_acl.sh
 ENV INSTALLED_ACL ${ACL}
 
 # Install ccache/sccache (do this last, so we get priority in PATH)
+ARG SKIP_SCCACHE_INSTALL
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
-RUN bash ./install_cache.sh && rm install_cache.sh
+RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
+RUN rm install_cache.sh
 
 # Add jni.h for java host build
 COPY ./common/install_jni.sh install_jni.sh
@@ -188,7 +190,9 @@ ARG BUILD_ENVIRONMENT
 ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
 
 # Install LLVM dev version (Defined in the pytorch/builder github repository)
+ARG SKIP_LLVM_SRC_BUILD_INSTALL
 COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
+RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi
 
 # AWS specific CUDA build guidance
 ENV TORCH_CUDA_ARCH_LIST Maxwell

diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
@@ -81,7 +81,22 @@ if ! which conda; then
     export USE_MKLDNN=0
   fi
 else
-  export CMAKE_PREFIX_PATH=/opt/conda
+  # CMAKE_PREFIX_PATH is chosen in the following order of precedence (highest first):
+  # 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions.
+  # 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined.
+  #    This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX.
+  # 3. $(conda info --base). The fallback value of pytorch official build
+  #    instructions actually refers to this.
+  #    Commonly this is /opt/conda/
+  if [[ -v CONDA_PREFIX ]]; then
+    export CMAKE_PREFIX_PATH=${CONDA_PREFIX}
+  elif [[ -v ANACONDA_PYTHON_VERSION ]]; then
+    export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}"
+  else
+    # already checked by `! which conda`
+    CMAKE_PREFIX_PATH="$(conda info --base)"
+    export CMAKE_PREFIX_PATH
+  fi
 
   # Workaround required for MKL library linkage
   # https://github.com/pytorch/pytorch/issues/119557
@@ -376,4 +391,8 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
   python tools/stats/export_test_times.py
 fi
 
-print_sccache_stats
+# snadampal: skipping it till sccache support added for aarch64
+# https://github.com/pytorch/pytorch/issues/121559
+if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
+  print_sccache_stats
+fi
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
@@ -181,6 +181,11 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
   export PATH="$HOME/.local/bin:$PATH"
 fi
 
+if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
+  # TODO: revisit this once the CI is stabilized on aarch64 linux
+  export VALGRIND=OFF
+fi
+
 install_tlparse
 
 # DANGER WILL ROBINSON.  The LD_PRELOAD here could cause you problems
@@ -317,6 +322,7 @@ test_inductor_distributed() {
   pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
   pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp
   pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume
+  pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation
   pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
   pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
   pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
@@ -1152,11 +1158,33 @@ test_executorch() {
   assert_git_not_dirty
 }
 
+test_linux_aarch64(){
+  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
+       test_transformers test_multiprocessing test_numpy_interop --verbose
+
+  # Dynamo tests
+  python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
+       dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
+       dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose
+
+  # Inductor tests
+  python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
+       inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
+       inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
+       inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
+       inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
+       inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
+       inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
+       inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
+}
+
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   (cd test && python -c "import torch; print(torch.__config__.show())")
   (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
 fi
-if [[ "${TEST_CONFIG}" == *backward* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
+  test_linux_aarch64
+elif [[ "${TEST_CONFIG}" == *backward* ]]; then
   test_forward_backward_compatibility
   # Do NOT add tests after bc check tests, see its comment.
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then

diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@@ -17,22 +17,22 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
 set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers
 
 call %INSTALLER_DIR%\install_magma.bat
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 
 call %INSTALLER_DIR%\install_sccache.bat
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 
 :: Miniconda has been installed as part of the Windows AMI with all the dependencies.
 :: We just need to activate it here
 call %INSTALLER_DIR%\activate_miniconda3.bat
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 
 call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 
 :: Override VS env here
 pushd .
@@ -41,8 +41,8 @@ if "%VC_VERSION%" == "" (
 ) else (
     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION%
 )
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 @echo on
 popd
 
@@ -52,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%
 
 if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
     echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
-    exit /b 1
+    goto fail
 )
 rem version transformer, for example 10.1 to 10_1.
 if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
     echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
-    exit /b 1
+    goto fail
 )
 set VERSION_SUFFIX=%CUDA_VERSION:.=_%
 set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
@@ -101,8 +101,8 @@ if "%USE_CUDA%"=="1" (
   :: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
   :: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
   curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
-  if errorlevel 1 exit /b
-  if not errorlevel 0 exit /b
+  if errorlevel 1 goto fail
+  if not errorlevel 0 goto fail
   echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
   cat %TMP_DIR%/bin/nvcc.bat
   set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
@@ -114,8 +114,8 @@ if "%USE_CUDA%"=="1" (
 set
 
 python setup.py bdist_wheel
-if errorlevel 1 exit /b
-if not errorlevel 0 exit /b
+if errorlevel 1 goto fail
+if not errorlevel 0 goto fail
 sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
@@ -135,3 +135,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
 
 sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json
 sccache --stop-server
+
+exit /b 0
+
+:fail
+exit /b 1
diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-2c4665ffbb64f03f5d18016d3398af4ac4da5f03
+06ad737628abc3a1e617571dc03cbdd5b36ea96a
diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-58a412cb271a3f98ae2e01fd1d24bdbb66645d4e
+73b915b55d96553a0e370b2bab01f47b8c2a9e7c
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
@@ -8,6 +8,7 @@ ciflow_push_tags:
 - ciflow/binaries_wheel
 - ciflow/inductor
 - ciflow/inductor-perf-compare
+- ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
 - ciflow/periodic

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
@@ -13,16 +13,16 @@
 import os
 from typing import Dict, List, Optional, Tuple
 
-CUDA_ARCHES = ["11.8", "12.1"]
+CUDA_ARCHES = ["11.8", "12.1", "12.4"]
 
 
-CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
+CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
 
 
-CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
+CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}
 
 
-ROCM_ARCHES = ["5.7", "6.0"]
+ROCM_ARCHES = ["6.0", "6.1"]
 
 
 CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@@ -58,6 +58,20 @@
         "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
         "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
     ),
+    "12.4": (
+        "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
+    ),
 }
 
 
@@ -324,7 +338,7 @@ def generate_wheels_matrix(
             )
 
             # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
-            if arch_version in ["12.1", "11.8"] and os == "linux":
+            if arch_version in ["12.4", "12.1", "11.8"] and os == "linux":
                 ret.append(
                     {
                         "python_version": python_version,
@@ -367,5 +381,6 @@ def generate_wheels_matrix(
     return ret
 
 
+validate_nccl_dep_consistency("12.4")
 validate_nccl_dep_consistency("12.1")
 validate_nccl_dep_consistency("11.8")
diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2
@@ -46,7 +46,7 @@ env:
   PYTORCH_FINAL_PACKAGE_DIR: /artifacts
   PYTORCH_ROOT: /pytorch
   SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 1
+  SKIP_ALL_TESTS: 0
 !{{ common.concurrency(build_environment) }}
 
 jobs:

diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2
@@ -48,7 +48,7 @@ env:
   BUILD_ENVIRONMENT: !{{ build_environment }}
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   PR_NUMBER: ${{ github.event.pull_request.number }}
-  SKIP_ALL_TESTS: 1
+  SKIP_ALL_TESTS: 0
 {%- if cross_compile_arm64 %}
   CROSS_COMPILE_ARM64: 1
 {% endif %}

diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
@@ -37,7 +37,7 @@ jobs:
         device: ["cuda", "rocm"]
         include:
           - device: "rocm"
-            rocm_version: "6.0"
+            rocm_version: "6.1"
           - device: "cuda"
             rocm_version: ""
     timeout-minutes: 40