Update on "[dtensor][7/n] remove reduction rule"

[ghstack-poisoned]
pytorch · Sep 15, 2023 · c51b2ff · c51b2ff
2 parents cf2a5cf + b875b80
commit c51b2ff
Show file tree

Hide file tree

Showing 480 changed files with 18,816 additions and 16,272 deletions.
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -129,35 +129,6 @@ case "$image" in
     CONDA_CMAKE=yes
     TRITON=yes
     ;;
-  pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7)
-    CUDA_VERSION=11.8.0
-    CUDNN_VERSION=8
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-    pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks)
-    CUDA_VERSION=11.8.0
-    CUDNN_VERSION=8
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
   pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
     CUDA_VERSION=12.1.1
     CUDNN_VERSION=8
@@ -244,17 +215,6 @@ case "$image" in
     CONDA_CMAKE=yes
     TRITON=yes
     ;;
-  pytorch-linux-focal-py3.8-gcc7)
-    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    CONDA_CMAKE=yes
-    TRITON=yes
-    DOCS=yes
-    ;;
     pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=11
@@ -354,14 +314,11 @@ if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
 fi
 
 # Build image
-# TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm
-# it's no longer needed.
 docker build \
        --no-cache \
        --progress=plain \
        --build-arg "BUILD_ENVIRONMENT=${image}" \
        --build-arg "PROTOBUF=${PROTOBUF:-}" \
-       --build-arg "THRIFT=${THRIFT:-}" \
        --build-arg "LLVMDEV=${LLVMDEV:-}" \
        --build-arg "DB=${DB:-}" \
        --build-arg "VISION=${VISION:-}" \

diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh
@@ -29,7 +29,7 @@ pip_install \
   transformers==4.32.1
 
 pip_install coloredlogs packaging
-retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001
+retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230912006
 
 pip_install onnx==1.14.1
 pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps

diff --git a/.ci/docker/common/install_thrift.sh b/.ci/docker/common/install_thrift.sh
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
@@ -275,3 +275,8 @@ z3-solver==4.12.2.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
+
+tensorboard==2.13.0
+#Description: Also included in .ci/docker/requirements-docs.txt
+#Pinned versions:
+#test that import: test_tensorboard
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -17,13 +17,6 @@ ARG LLVMDEV
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh
 
-# (optional) Install thrift.
-ARG THRIFT
-COPY ./common/install_thrift.sh install_thrift.sh
-RUN if [ -n "${THRIFT}" ]; then bash ./install_thrift.sh; fi
-RUN rm install_thrift.sh
-ENV INSTALLED_THRIFT ${THRIFT}
-
 # Install user
 COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh

diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
@@ -159,6 +159,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
   exit 1
 fi
 
+# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
+# memory to build and will OOM
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
+  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
+  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
+  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
+fi
+
 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
   export CC=clang
   export CXX=clang++

diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh
@@ -35,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi
 
 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0
+python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0
 
 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver

diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
@@ -155,8 +155,8 @@ EOL
 
 # nproc doesn't exist on darwin
 if [[ "$(uname)" != Darwin ]]; then
-  # Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
-  MEMORY_LIMIT_MAX_JOBS=18
+  # This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
+  MEMORY_LIMIT_MAX_JOBS=12
   NUM_CPUS=$(( $(nproc) - 2 ))
 
   # Defaults here for **binary** linux builds so they can be changed in one place

diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt
@@ -1 +1 @@
-9371b9e13c826f3930e54346b4d619cb59182f68
+0a64c552d35dfb9252215a7e3c794a16fff6cb0b
diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
@@ -10,6 +10,7 @@
   - docs/source/_static/img/onnx/**
   - scripts/onnx/**
   - test/onnx/**
+  - test/onnx_caffe2/**
   - tools/onnx/**
   - torch/_dynamo/backends/onnxrt.py
   - torch/_C/__init__.pyi.in

diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt
@@ -25,3 +25,4 @@ sympy==1.11.1
 pytest-cpp==2.3.0
 rockset==1.0.3
 z3-solver==4.12.2.0
+tensorboard==2.13.0
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
@@ -410,16 +410,17 @@ def process_jobs(
             if target_job in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
                 target_cfg = m.group("cfg")
 
-                return _filter_jobs(
+                # NB: There can be multiple unstable configurations, e.g. inductor, inductor_huggingface
+                test_matrix = _filter_jobs(
                     test_matrix=test_matrix,
                     issue_type=issue_type,
                     target_cfg=target_cfg,
                 )
-
-        warnings.warn(
-            f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
-            + f"but the name {target_job_cfg} is invalid"
-        )
+        else:
+            warnings.warn(
+                f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+                + f"but the name {target_job_cfg} is invalid"
+            )
 
     # Found no matching target, return the same input test matrix
     return test_matrix

diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py
@@ -102,6 +102,30 @@
         "manywheel-py3_8-cuda11_8-build",
         "",
     ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor)": [
+        "pytorchbot",
+        "107079",
+        "https://github.com/pytorch/pytorch/issues/107079",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor)",
+    ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface)": [
+        "pytorchbot",
+        "109153",
+        "https://github.com/pytorch/pytorch/issues/109153",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor_huggingface)",
+    ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface_dynamic)": [
+        "pytorchbot",
+        "109154",
+        "https://github.com/pytorch/pytorch/issues/109154",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor_huggingface_dynamic)",
+    ],
 }
 
 MOCKED_PR_INFO = {
@@ -569,6 +593,37 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
                 "expected": '{"include": [{"config": "default", "unstable": "unstable"}]}',
                 "description": "Both binary build and test jobs are unstable",
             },
+            {
+                "workflow": "inductor",
+                "job_name": "cuda12.1-py3.10-gcc9-sm86 / build",
+                "test_matrix": """
+                    { include: [
+                        { config: "inductor" },
+                        { config: "inductor_huggingface", shard: 1 },
+                        { config: "inductor_huggingface", shard: 2 },
+                        { config: "inductor_timm", shard: 1 },
+                        { config: "inductor_timm", shard: 2 },
+                        { config: "inductor_torchbench" },
+                        { config: "inductor_huggingface_dynamic" },
+                        { config: "inductor_torchbench_dynamic" },
+                        { config: "inductor_distributed" },
+                    ]}
+                """,
+                "expected": """
+                    { "include": [
+                        { "config": "inductor", "unstable": "unstable" },
+                        { "config": "inductor_huggingface", "shard": 1, "unstable": "unstable" },
+                        { "config": "inductor_huggingface", "shard": 2, "unstable": "unstable" },
+                        { "config": "inductor_timm", "shard": 1 },
+                        { "config": "inductor_timm", "shard": 2 },
+                        { "config": "inductor_torchbench" },
+                        { "config": "inductor_huggingface_dynamic", "unstable": "unstable" },
+                        { "config": "inductor_torchbench_dynamic" },
+                        { "config": "inductor_distributed" }
+                    ]}
+                """,
+                "description": "Marking multiple unstable configurations",
+            },
         ]
 
         for case in testcases:
@@ -577,7 +632,7 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
             test_matrix = yaml.safe_load(case["test_matrix"])
 
             filtered_test_matrix = mark_unstable_jobs(workflow, job_name, test_matrix)
-            self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
+            self.assertEqual(json.loads(case["expected"]), filtered_test_matrix)
 
     @mock.patch("subprocess.check_output")
     def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:

diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
@@ -131,7 +131,7 @@ jobs:
     needs: build-wheel
     container:
       image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     steps:
       - uses: actions/checkout@v3
 
@@ -244,7 +244,7 @@ jobs:
     needs: build-conda
     container:
       image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     steps:
       - uses: actions/checkout@v3
 
@@ -283,7 +283,7 @@ jobs:
         run: |
           set -ex
 
-          if [[ "${UPLOAD_CHANNEL}" = "nightly" ]]; then
+          if [[ "${UPLOAD_CHANNEL:-nightly}" == "nightly" ]]; then
             export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
           else
             export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"

diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -195,37 +195,21 @@ include_patterns = [
 exclude_patterns = [
     '**/fb/**',
     'torch/_inductor/index_propagation.py',
-    'torch/_inductor/coordinate_descent_tuner.py',
     'torch/_inductor/debug.py',
-    'torch/_inductor/hooks.py',
-    'torch/_inductor/bounds.py',
-    'torch/_inductor/config.py',
     'torch/_inductor/ir.py',
-    'torch/_inductor/codecache.py',
-    'torch/_inductor/test_operators.py',
-    'torch/_inductor/inductor_prims.py',
     'torch/_inductor/scheduler.py',
-    'torch/_inductor/exc.py',
     'torch/_inductor/sizevars.py',
-    'torch/_inductor/triton_helpers.py',
     'torch/_inductor/freezing.py',
     'torch/_inductor/pattern_matcher.py',
     'torch/_inductor/fx_utils.py',
-    'torch/_inductor/virtualized.py',
-    'torch/_inductor/cuda_properties.py',
     'torch/_inductor/codegen/triton_foreach.py',
-    'torch/_inductor/codegen/__init__.py',
     'torch/_inductor/codegen/cpp.py',
     'torch/_inductor/codegen/triton.py',
     'torch/_inductor/fx_passes/split_cat.py',
-    'torch/_inductor/fx_passes/binary_folding.py',
-    'torch/_inductor/fx_passes/replace_random.py',
     'torch/_inductor/fx_passes/joint_graph.py',
     'torch/_inductor/fx_passes/pad_mm.py',
-    'torch/_inductor/fx_passes/__init__.py',
     'torch/_inductor/fx_passes/group_batch_fusion.py',
     'torch/_inductor/fx_passes/pre_grad.py',
-    'torch/_inductor/fx_passes/freezing_patterns.py',
 ]
 command = [
     'python3',

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -58,6 +58,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
 endif()
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_LINK_WHAT_YOU_USE TRUE)
 
 # One variable that determines whether the current cmake process is being run
 # with the main Caffe2 library. This is useful for building modules - if
@@ -730,7 +731,7 @@ include(cmake/Dependencies.cmake)
 cmake_dependent_option(
   USE_FLASH_ATTENTION
   "Whether to build the flash_attention kernel for scaled dot product attention" ON
-  "USE_CUDA AND NOT ROCM AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
+  "USE_CUDA AND NOT ROCM AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
 
 # Flash Attention2 will error while building for sm52 while Mem Eff Attention won't
 cmake_dependent_option(