Update on "[inductor] switch assume_aligned_inputs to False"
In #123319, we guard some behavior behind the `assume_aligned_inputs` config option. With this set to `False`, the behavior added in #123319 becomes the default. See the referenced PR for more details about what is affected.

Side effects:
* It's possible that this will hurt performance in some scenarios. For example, if an unaligned input is used in a matmul, it might be better to clone it first so the matmul runs on an aligned buffer.
* This will occasionally cause recompiles. Specifically, the check we perform (`(storage_offset * get_dtype_size(dtype)) % ALIGNMENT == 0`; see the sketch below) can be guarded on if the storage_offset becomes dynamic, which happens under automatic_dynamic_shapes after a shape or stride changes. Previously this increased graph breaks in cpu inductor torchbench tests, but that is fixed by guarding the alignment checks more carefully so that we don't run them and generate guards unless actually needed.
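
To make the recompile condition concrete, here is a minimal sketch of the alignment check (illustration only: `ALIGNMENT` is an assumed value, the helper below is hypothetical, and `tensor.element_size()` stands in for `get_dtype_size(dtype)`; the real constant and logic live in inductor):

```python
import torch

# Hypothetical illustration of the check quoted above; not the actual
# inductor code. ALIGNMENT is assumed to be 16 bytes for this sketch.
ALIGNMENT = 16

def is_aligned(t: torch.Tensor) -> bool:
    # storage_offset() is measured in elements; element_size() converts
    # it to bytes before checking divisibility by the alignment.
    return (t.storage_offset() * t.element_size()) % ALIGNMENT == 0

base = torch.randn(17)          # float32, storage_offset 0
print(is_aligned(base))         # True: offset is 0 bytes
print(is_aligned(base[1:]))     # False: offset is 4 bytes, not a multiple of 16
```

A view such as `base[1:]` is the kind of input that can take the clone path or, once its storage_offset becomes dynamic, produce the guard described above.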

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
davidberard98 committed Apr 28, 2024
2 parents 49070d1 + 4313c7d commit 8fd7120
Showing 326 changed files with 6,784 additions and 2,506 deletions.
5 changes: 2 additions & 3 deletions .ci/docker/requirements-ci.txt
@@ -228,12 +228,11 @@ scikit-image==0.20.0 ; python_version >= "3.10"
#Pinned versions: 0.20.3
#test that import:

scipy==1.8.1 ; python_version <= "3.10"
scipy==1.10.1 ; python_version == "3.11"
scipy==1.10.1 ; python_version <= "3.11"
scipy==1.12.0 ; python_version == "3.12"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.6.3
#Pinned versions: 1.10.1
#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py
#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py
#test_linalg.py, test_binary_ufuncs.py
1 change: 1 addition & 0 deletions .flake8
@@ -54,6 +54,7 @@ per-file-ignores =
torch/ao/quantization/fx/_decomposed.py: TOR901
torch/distributed/_functional_collectives.py: TOR901
torch/distributed/_spmd/data_parallel.py: TOR901
torch/distributed/_tensor/_collective_utils.py: TOR901
optional-ascii-coding = True
exclude =
./.git,
26 changes: 26 additions & 0 deletions .github/actions/setup-linux/action.yml
@@ -66,6 +66,7 @@ runs:
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Kill any existing containers, clean up images
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
@@ -104,3 +105,28 @@ runs:
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
run: |
set +x
max_attempts=30
delay=10
attempt=1
for attempt in $(seq 1 $max_attempts); do
echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
if docker info > /dev/null 2>&1; then
echo "Docker is running. Proceeding with the next steps"
exit 0
else
echo "Docker is not running yet."
echo "Retrying in $delay seconds..."
sleep $delay
fi
done
echo "Reached maximum attempts to connect to Docker. Exiting."
exit 1
13 changes: 13 additions & 0 deletions .github/label_to_label.yml
@@ -0,0 +1,13 @@
# Use this to auto apply labels based on other labels. Applies to both PRs and
# issues. Currently only supports any and all
- any:
- "module: custom operators"
- "module: aotdispatch"
then:
- "module: pt2-dispatcher"
- any:
- "module: dynamo"
- "module: pt2-dispatcher"
- "module: inductor"
then:
- "oncall: pt2"
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -24,3 +24,4 @@ retryable_workflows:
- linux-binary
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
2 changes: 1 addition & 1 deletion .github/scripts/cherry_pick.py
@@ -29,7 +29,7 @@ def parse_args() -> Any:
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the worlds a stage"
"--github-actor", type=str, required=True, help="all the world's a stage"
)
parser.add_argument(
"--classification",
3 changes: 2 additions & 1 deletion .github/scripts/get_workflow_job_id.py
@@ -4,6 +4,7 @@

import argparse
import json
import operator
import os
import re
import sys
@@ -126,7 +127,7 @@ def find_job_id_name(args: Any) -> Tuple[str, str]:

# Sort the jobs list by start time, in descending order. We want to get the most
# recently scheduled job on the runner.
jobs.sort(key=lambda job: job["started_at"], reverse=True)
jobs.sort(key=operator.itemgetter("started_at"), reverse=True)

for job in jobs:
if job["runner_name"] == args.runner_name:
2 changes: 1 addition & 1 deletion .github/workflows/_win-test.yml
@@ -92,7 +92,7 @@ jobs:
retry_wait_seconds: 30
command: |
set -eu
python3 -m pip install rockset==1.0.3
python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0'
- name: Start monitoring script
id: monitor-script
20 changes: 10 additions & 10 deletions .github/workflows/inductor-perf-test-nightly.yml
@@ -8,11 +8,16 @@ on:
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training_and_inference:
description: Run training and inference?
training:
description: Run training (on by default)?
required: false
type: string
default: training-true-inference-false
type: boolean
default: true
inference:
description: Run inference (off by default)?
required: false
type: boolean
default: false
default:
description: Run inductor_default?
required: false
@@ -28,11 +33,6 @@
required: false
type: boolean
default: true
cppwrapper:
description: Run inductor_cpp_wrapper for inference?
required: false
type: boolean
default: false
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
@@ -129,7 +129,7 @@ jobs:
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
dashboard-tag: ${{ inputs.training_and_inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-false-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
1 change: 1 addition & 0 deletions .github/workflows/target_determination.yml
@@ -53,6 +53,7 @@ jobs:
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
GITHUB_REF: ${{ github.ref }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
6 changes: 6 additions & 0 deletions aten/src/ATen/Context.h
@@ -69,6 +69,8 @@ class TORCH_API Context {
return at::detail::getMPSHooks();
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks();
} else if (device_type == at::kMTIA) {
return at::detail::getMTIAHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
@@ -156,6 +158,9 @@
void lazyInitXPU() {
c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
}
void lazyInitMTIA() {
c10::call_once(th_mtia_init, [&] { detail::getMTIAHooks().initMTIA(); });
}
void lazyInitPrivateUse1() {
c10::call_once(thp_init, [&] {
if (isPrivateUse1HooksRegistered()) {
@@ -349,6 +354,7 @@ class TORCH_API Context {
c10::once_flag thc_init;
c10::once_flag thh_init;
c10::once_flag thx_init;
c10::once_flag th_mtia_init;
c10::once_flag thp_init;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
8 changes: 8 additions & 0 deletions aten/src/ATen/DeviceAccelerator.cpp
@@ -10,14 +10,22 @@ C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define CHECK_NO_PU1 \
TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");

#define CHECK_NO_MTIA \
TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices");

if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time
// as we use this for testing.
// Whenever a PrivateUse1 device is registered, use it first.
return kPrivateUse1;
} else if (at::hasCUDA()) {
CHECK_NO_PU1
CHECK_NO_MTIA
return kCUDA;
} else if (at::hasMTIA()) {
CHECK_NO_CUDA
CHECK_NO_PU1
return kMTIA;
} else {
TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
return std::nullopt;
8 changes: 4 additions & 4 deletions aten/src/ATen/NestedTensorImpl.cpp
@@ -81,7 +81,7 @@ inline std::vector<int64_t> construct_opt_sizes(const at::Tensor& sizes) {
std::vector<int64_t> result(1, sizes.sizes()[0]);
if (sizes.dim() > 0) {
size_t nested_dim = result.size();
int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
result.resize(nested_dim + sizes.sizes()[1]);
int64_t sizes_size_0 = sizes.sizes()[0];
int64_t sizes_size_1 = sizes.sizes()[1];
@@ -114,7 +114,7 @@ at::Tensor construct_nested_strides(const at::Tensor& sizes) {
return sizes;
}
at::Tensor strides = sizes.new_empty(sizes.sizes());
const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
int64_t* strides_ptr = strides.data_ptr<int64_t>();
for (int64_t i = 0; i < sizes.size(0); i++) {
strides_ptr[orig_dim - 1] = 1;
@@ -152,7 +152,7 @@ at::Tensor construct_offsets(const at::Tensor& sizes) {
std::iota(offsets_ptr, offsets_ptr + ntensors, 0);
return offsets;
}
const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
offsets_ptr[0] = 0;
for (const auto i : c10::irange(ntensors - 1)) {
const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies());
@@ -344,7 +344,7 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) {
static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
static_cast<uint64_t>(std::numeric_limits<size_t>::max()));

const int64_t* sizes_ptr = tensor.data_ptr<int64_t>();
const int64_t* sizes_ptr = tensor.const_data_ptr<int64_t>();
const auto nt_dim = tensor.size(1);
uint64_t num_elements{0};

11 changes: 6 additions & 5 deletions aten/src/ATen/NestedTensorImpl.h
@@ -228,7 +228,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
}
const Tensor &sizemat = nt->get_nested_sizes(),
&stridemat = nt->get_nested_strides();
int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr<int64_t>();
const int64_t* offsets_ptr =
nt->get_storage_offsets().const_data_ptr<int64_t>();
int64_t orig_dim = sizemat.size(1);
// nesting scalars
if (orig_dim == 0) {
@@ -243,8 +244,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
// nesting tensors
else {
// if any underlying tensor is non-contiguous
const int64_t *sizemat_ptr = sizemat.data_ptr<int64_t>(),
*stridemat_ptr = stridemat.data_ptr<int64_t>();
const int64_t *sizemat_ptr = sizemat.const_data_ptr<int64_t>(),
*stridemat_ptr = stridemat.const_data_ptr<int64_t>();
for (int64_t i = 0; i < ntensors; i++) {
if (stridemat_ptr[orig_dim - 1] != 1) {
return false;
@@ -263,8 +264,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
if (offsets_ptr[0] != 0) {
return false;
}
sizemat_ptr = sizemat.data_ptr<int64_t>();
stridemat_ptr = stridemat.data_ptr<int64_t>();
sizemat_ptr = sizemat.const_data_ptr<int64_t>();
stridemat_ptr = stridemat.const_data_ptr<int64_t>();
for (int64_t i = 1; i < ntensors; i++) {
if (offsets_ptr[i] !=
offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) {
2 changes: 1 addition & 1 deletion aten/src/ATen/autocast_mode.h
@@ -728,7 +728,7 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.

// KERNEL_PRIVATEUSEONE/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE
// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastPrivateUse1
#define KERNEL_PRIVATEUSEONE(OP, ...) \
#define KERNEL_PRIVATEUSEONE(...) \
KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__)

#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \
8 changes: 4 additions & 4 deletions aten/src/ATen/core/Formatting.cpp
@@ -72,7 +72,7 @@ static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor&
return std::make_tuple(1., 0);
}
bool intMode = true;
auto self_p = self.data_ptr<double>();
auto self_p = self.const_data_ptr<double>();
for (const auto i : c10::irange(size)) {
auto z = self_p[i];
if(std::isfinite(z)) {
@@ -189,7 +189,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
}
for (const auto l : c10::irange(self.size(0))) {
Tensor row = self.select(0,l);
double *row_ptr = row.data_ptr<double>();
const double *row_ptr = row.const_data_ptr<double>();
for (const auto c : c10::irange(firstColumn, lastColumn+1)) {
stream << std::setw(sz) << row_ptr[c]/scale;
if(c == lastColumn) {
@@ -279,15 +279,15 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
tensor = tensor_.to(kCPU, kDouble).contiguous();
}
if(tensor.ndimension() == 0) {
stream << defaultfloat << tensor.data_ptr<double>()[0] << '\n';
stream << defaultfloat << tensor.const_data_ptr<double>()[0] << '\n';
stream << "[ " << tensor_.toString() << "{}";
} else if(tensor.ndimension() == 1) {
if (tensor.numel() > 0) {
auto [scale, sz] = __printFormat(stream, tensor);
if(scale != 1) {
printScale(stream, scale);
}
double* tensor_p = tensor.data_ptr<double>();
const double* tensor_p = tensor.const_data_ptr<double>();
for (const auto i : c10::irange(tensor.size(0))) {
stream << std::setw(sz) << tensor_p[i]/scale << '\n';
}
48 changes: 30 additions & 18 deletions aten/src/ATen/cpu/vec/vec256/vec256_convert.h
@@ -126,32 +126,44 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
}
};

template <typename dst_t, typename src_t>
struct VecConvert<
dst_t,
1,
src_t,
1,
typename std::enable_if_t<
(is_reduced_floating_point_v<dst_t> && is_8bit_integer_v<src_t>) ||
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
}
};

template <typename dst_t>
struct VecConvert<
dst_t,
1,
float,
1,
typename std::enable_if_t<
std::is_same_v<dst_t, unsigned char> || std::is_same_v<dst_t, signed char>,
void>> {
static inline VectorizedN<dst_t, 1> apply(
const VectorizedN<float, 1>& src) {
dst_t,
1,
float,
1,
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
return convert_float_to_int8<dst_t>(src[0]);
}
};

template <typename src_t>
struct VecConvert<
float,
1,
src_t,
1,
typename std::enable_if_t<
std::is_same_v<src_t, unsigned char> || std::is_same_v<src_t, signed char>,
void>> {
static inline VectorizedN<float, 1> apply(
const VectorizedN<src_t, 1>& src) {
float,
1,
src_t,
1,
typename std::enable_if_t<is_8bit_integer_v<src_t>,
void>> {
static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
return convert_int8_to_float<src_t>(src[0]);
}
};
