Update on "[inductor] Clear cache on ctx manager exit"
FIXES #126128.

Right now, we only clear the cache on ctx manager enter, so stale state lingers after exit unless fresh_inductor_cache is called again, which is usually fine in tests.

Case in point: the compiled autograd tests, when going from TestCompiledAutograd -> TestAutogradWithCompiledAutograd.
TestCompiledAutograd uses the ctx manager, but TestAutogradWithCompiledAutograd does not, so it inherits the stale cache state.
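
For reference, a minimal sketch of the behavior this change targets. This is not the actual PyTorch `fresh_inductor_cache` implementation; `clear_inductor_caches` and the env-var handling below are illustrative assumptions.

```python
import contextlib
import os
import shutil
import tempfile


def clear_inductor_caches():
    """Illustrative stand-in for resetting inductor's in-memory caches."""
    pass


@contextlib.contextmanager
def fresh_inductor_cache():
    """Run the wrapped block against a fresh, temporary inductor cache dir."""
    prev = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
    tmp_dir = tempfile.mkdtemp(prefix="inductor_cache_")
    os.environ["TORCHINDUCTOR_CACHE_DIR"] = tmp_dir
    clear_inductor_caches()  # before this change, caches were cleared only here (on enter)
    try:
        yield
    finally:
        # The fix: also clear on exit, so code running after the block (e.g. tests
        # that never enter this context manager) does not reuse entries that point
        # at the now-removed temporary directory.
        clear_inductor_caches()
        if prev is None:
            os.environ.pop("TORCHINDUCTOR_CACHE_DIR", None)
        else:
            os.environ["TORCHINDUCTOR_CACHE_DIR"] = prev
        shutil.rmtree(tmp_dir, ignore_errors=True)
```

With the exit-time clear, a test that uses the context manager no longer leaves stale cache state behind for later tests that run without it.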

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
xmfan committed May 16, 2024
2 parents 8c807d4 + bd71d28 commit 692290a
Showing 1,943 changed files with 16,231 additions and 130,432 deletions.
51 changes: 47 additions & 4 deletions .ci/docker/build.sh
@@ -84,13 +84,27 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5

_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b

# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@@ -105,6 +119,21 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@@ -134,6 +163,20 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@@ -226,7 +269,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
BASEKIT_VERSION=2024.0.0-49522
XPU_VERSION=0.5
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@@ -403,7 +446,7 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
5 changes: 4 additions & 1 deletion .ci/docker/common/install_cudnn.sh
@@ -4,7 +4,10 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
if [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.7.29_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
11 changes: 8 additions & 3 deletions .ci/docker/common/install_cusparselt.sh
@@ -5,9 +5,14 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt

if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
2 changes: 1 addition & 1 deletion .ci/docker/common/install_triton.sh
@@ -15,7 +15,7 @@ conda_reinstall() {
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${BASEKIT_VERSION}" ]; then
elif [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else
25 changes: 12 additions & 13 deletions .ci/docker/common/install_xpu.sh
@@ -3,10 +3,7 @@ set -xe


# Intel® software for general purpose GPU capabilities.
# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html

# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

# Users should update to the latest version as it becomes available

@@ -17,14 +14,16 @@ function install_ubuntu() {
# Set up the repository. To do this, download the key to the system keyring
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg

# Add the signed entry to APT sources and configure the APT client to use the Intel repository
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list

# Update the packages list and repository index
apt-get update
@@ -40,11 +39,11 @@ function install_ubuntu() {
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel® oneAPI Base Toolkit
if [ -n "$BASEKIT_VERSION" ]; then
apt-get install intel-basekit=$BASEKIT_VERSION -y
# Install Intel Support Packages
if [ -n "$XPU_VERSION" ]; then
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
else
apt-get install intel-basekit -y
apt-get install -y intel-for-pytorch-gpu-dev
fi

# Cleanup
1 change: 1 addition & 0 deletions .ci/docker/ubuntu-cuda/Dockerfile
@@ -152,6 +152,7 @@ RUN rm install_cusparselt.sh
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.4 ]; then rm /usr/local/cuda-12.1/cuda-12.4; fi

USER jenkins
CMD ["bash"]
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-xpu/Dockerfile
@@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

# Install XPU Dependencies
ARG BASEKIT_VERSION
ARG XPU_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh

2 changes: 1 addition & 1 deletion .ci/pytorch/docs-test.sh
@@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch docs"

cd docs
make doctest
TERM=vt100 make doctest
17 changes: 13 additions & 4 deletions .ci/pytorch/test.sh
@@ -588,6 +588,15 @@ test_inductor_torchbench_smoketest_perf() {
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
done

# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
done
}

test_inductor_torchbench_cpu_smoketest_perf(){
@@ -1269,6 +1278,10 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard "${SHARD_NUMBER}"
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
@@ -1298,10 +1311,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
install_torchvision
test_python
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/audio.txt
@@ -1 +1 @@
ea437b31ce316ea3d66fe73768c0dcb94edb79ad
1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0
1 change: 0 additions & 1 deletion .github/pytorch-probot.yml
@@ -1,6 +1,5 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
TD_rollout_issue: 123120
ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_conda
2 changes: 2 additions & 0 deletions .github/workflows/docker-builds.yml
@@ -38,6 +38,8 @@ jobs:
matrix:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
2 changes: 1 addition & 1 deletion .github/workflows/inductor.yml
@@ -125,7 +125,7 @@ jobs:
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
2 changes: 2 additions & 0 deletions .github/workflows/lint.yml
@@ -43,6 +43,8 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
pip install onnx==1.16.0
pip install numpy==1.26.4
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT"
.github/scripts/lintrunner.sh
3 changes: 2 additions & 1 deletion .github/workflows/trunk.yml
@@ -192,7 +192,8 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_8-test:
3 changes: 1 addition & 2 deletions .lintrunner.toml
@@ -1052,13 +1052,13 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_datapipe.py',
'test/test_fake_tensor.py',
'test/test_flop_counter.py',
'test/test_function_schema.py',
'test/test_functional_autograd_benchmark.py',
'test/test_functional_optim.py',
'test/test_functionalization_of_rng_ops.py',
'test/test_datapipe.py',
'test/test_futures.py',
'test/test_fx.py',
'test/test_fx_experimental.py',
@@ -1143,7 +1143,6 @@ exclude_patterns = [
'test/test_transformers.py',
'test/test_type_promotion.py',
'test/test_unary_ufuncs.py',
'test/test_utils.py',
'test/test_vulkan.py',
'test/test_xnnpack_integration.py',
'test/torch_np/numpy_test/**/*.py',
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -277,8 +277,8 @@ if(NOT DEFINED USE_VULKAN)
endif()

option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
# option USE_XNNPACK: try to enable xnnpack by default.
13 changes: 13 additions & 0 deletions README.md
@@ -24,6 +24,9 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
- [NVIDIA Jetson Platforms](#nvidia-jetson-platforms)
- [From Source](#from-source)
- [Prerequisites](#prerequisites)
- [NVIDIA CUDA Support](#nvidia-cuda-support)
- [AMD ROCm Support](#amd-rocm-support)
- [Intel GPU Support](#intel-gpu-support)
- [Install Dependencies](#install-dependencies)
- [Get the PyTorch Source](#get-the-pytorch-source)
- [Install PyTorch](#install-pytorch)
Expand Down Expand Up @@ -162,6 +165,7 @@ If you are installing from source, you will need:

We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.

##### NVIDIA CUDA Support
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
@@ -174,13 +178,22 @@ Other potentially useful environment variables may be found in `setup.py`.

If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/)

##### AMD ROCm Support
If you want to compile with ROCm support, install
- [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation
- ROCm is currently supported only for Linux systems.

If you want to disable ROCm support, export the environment variable `USE_ROCM=0`.
Other potentially useful environment variables may be found in `setup.py`.

##### Intel GPU Support
If you want to compile with Intel GPU support, follow these
- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
- Intel GPU is currently supported only for Linux systems.

If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.

#### Install Dependencies

**Common**