Update

[ghstack-poisoned]
pytorch · Apr 29, 2024 · bb9302f · bb9302f
2 parents ec6dad7 + 8f27ff2
commit bb9302f
Show file tree

Hide file tree

Showing 397 changed files with 7,167 additions and 40,154 deletions.
diff --git a/.flake8 b/.flake8
@@ -54,6 +54,7 @@ per-file-ignores =
     torch/ao/quantization/fx/_decomposed.py: TOR901
     torch/distributed/_functional_collectives.py: TOR901
     torch/distributed/_spmd/data_parallel.py: TOR901
+    torch/distributed/_tensor/_collective_utils.py: TOR901
 optional-ascii-coding = True
 exclude =
     ./.git,

diff --git a/.github/label_to_label.yml b/.github/label_to_label.yml
@@ -0,0 +1,13 @@
+# Use this to automatically apply labels based on other labels. Applies to both
+# PRs and issues. Currently only the `any` and `all` conditions are supported.
+- any:
+  - "module: custom operators"
+  - "module: aotdispatch"
+  then:
+  - "module: pt2-dispatcher"
+- any:
+  - "module: dynamo"
+  - "module: pt2-dispatcher"
+  - "module: inductor"
+  then:
+  - "oncall: pt2"
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
@@ -24,3 +24,4 @@ retryable_workflows:
 - linux-binary
 - windows-binary
 labeler_config: labeler.yml
+label_to_label_config: label_to_label.yml
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
@@ -92,7 +92,7 @@ jobs:
           retry_wait_seconds: 30
           command: |
             set -eu
-            python3 -m pip install rockset==1.0.3
+            python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0'
 
       - name: Start monitoring script
         id: monitor-script

diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -1051,8 +1051,6 @@ exclude_patterns = [
     'test/quantization/fx/test_numeric_suite_fx.py',
     'test/quantization/fx/test_quantize_fx.py',
     'test/quantization/fx/test_subgraph_rewriter.py',
-    'test/test_custom_op_testing.py',
-    'test/test_dataloader.py',
     'test/test_datapipe.py',
     'test/test_decomp.py',
     'test/test_deploy.py',
@@ -1065,7 +1063,6 @@ exclude_patterns = [
     'test/test_function_schema.py',
     'test/test_functional_autograd_benchmark.py',
     'test/test_functional_optim.py',
-    'test/test_functionalization.py',
     'test/test_functionalization_of_rng_ops.py',
     'test/test_futures.py',
     'test/test_fx.py',

diff --git a/BUILD.bazel b/BUILD.bazel
@@ -446,30 +446,13 @@ cu_library(
 # caffe2
 CAFFE2_COPTS = COMMON_COPTS + [
     "-Dcaffe2_EXPORTS",
-    "-DCAFFE2_USE_GLOO",
     "-DCAFFE2_USE_CUDNN",
     "-DCAFFE2_BUILD_MAIN_LIB",
     "-fvisibility-inlines-hidden",
     "-fno-math-errno",
     "-fno-trapping-math",
 ]
 
-filegroup(
-    name = "caffe2_contrib_srcs",
-    srcs = [
-        "caffe2/contrib/aten/aten_op.cc",
-        "caffe2/contrib/gloo/allgather_ops.cc",
-        "caffe2/contrib/gloo/allreduce_ops.cc",
-        "caffe2/contrib/gloo/barrier_ops.cc",
-        "caffe2/contrib/gloo/broadcast_ops.cc",
-        "caffe2/contrib/gloo/common.cc",
-        "caffe2/contrib/gloo/common_world_ops.cc",
-        "caffe2/contrib/gloo/context.cc",
-        "caffe2/contrib/gloo/reduce_scatter_ops.cc",
-        "caffe2/contrib/gloo/store_handler.cc",
-    ],
-)
-
 filegroup(
     name = "caffe2_core_srcs",
     srcs = [
@@ -1024,10 +1007,6 @@ filegroup(
 filegroup(
     name = "caffe2_cuda_cpp_srcs",
     srcs = [
-        "caffe2/contrib/aten/aten_op_gpu.cc",
-        "caffe2/contrib/gloo/allreduce_ops_gpu.cc",
-        "caffe2/contrib/gloo/broadcast_ops_gpu.cc",
-        "caffe2/contrib/gloo/common_world_ops_gpu.cc",
         "caffe2/core/blob_serialization_gpu.cc",
         "caffe2/core/common_cudnn.cc",
         "caffe2/core/common_gpu.cc",
@@ -1271,35 +1250,10 @@ cc_library(
     ],
 )
 
-py_binary(
-    name = "gen_op",
-    srcs = ["caffe2/contrib/aten/gen_op.py"],
-    deps = ["//torchgen"],
-)
-
-genrule(
-    name = "generated_caffe2_aten_op_headers",
-    srcs = [
-        "caffe2/contrib/aten/aten_op_template.h",
-        "aten/src/ATen/Declarations.yaml",
-    ],
-    outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"],
-    cmd = """
-    $(location :gen_op) \
-        --output_prefix gen_ \
-        --install_dir $(@D) \
-        --aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \
-        --template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \
-        --yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""",
-    tools = [":gen_op"],
-)
-
 cc_library(
     name = "caffe2_headers",
     hdrs = glob(
         [
-            "caffe2/contrib/aten/*.h",
-            "caffe2/contrib/gloo/*.h",
             "caffe2/core/*.h",
             "caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
             "caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
@@ -1338,10 +1292,9 @@ cc_library(
     ) + if_cuda(glob([
         "caffe2/**/*.cuh",
         "caffe2/image/*.h",
-    ])) + [":generated_caffe2_aten_op_headers"],
+    ])),
     copts = CAFFE2_COPTS,
     includes = [
-        "caffe2/contrib/aten",
         "caffe2/core/nomnigraph/include",
     ],
     visibility = ["//visibility:public"],
@@ -1385,7 +1338,6 @@ cc_library(
         "caffe2/db/create_db_op.cc",
         "caffe2/db/protodb.cc",
         "caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
-        ":caffe2_contrib_srcs",
         ":caffe2_core_srcs",
         ":caffe2_distributed_srcs",
         ":caffe2_ideep_srcs",
@@ -1419,7 +1371,6 @@ cc_library(
         "@fbgemm//:fbgemm_src_headers",
         "@fmt",
         "@foxi",
-        "@gloo",
         "@onnx",
     ] + if_cuda(
         [
@@ -1467,7 +1418,6 @@ cu_library(
         "@cuda//:curand",
         "@cudnn",
         "@eigen",
-        "@gloo",
         "@tensorpipe//:tensorpipe_cuda",
     ],
     alwayslink = True,

diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp
@@ -81,7 +81,7 @@ inline std::vector<int64_t> construct_opt_sizes(const at::Tensor& sizes) {
   std::vector<int64_t> result(1, sizes.sizes()[0]);
   if (sizes.dim() > 0) {
     size_t nested_dim = result.size();
-    int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
+    const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
     result.resize(nested_dim + sizes.sizes()[1]);
     int64_t sizes_size_0 = sizes.sizes()[0];
     int64_t sizes_size_1 = sizes.sizes()[1];
@@ -114,7 +114,7 @@ at::Tensor construct_nested_strides(const at::Tensor& sizes) {
     return sizes;
   }
   at::Tensor strides = sizes.new_empty(sizes.sizes());
-  const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
+  const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
   int64_t* strides_ptr = strides.data_ptr<int64_t>();
   for (int64_t i = 0; i < sizes.size(0); i++) {
     strides_ptr[orig_dim - 1] = 1;
@@ -152,7 +152,7 @@ at::Tensor construct_offsets(const at::Tensor& sizes) {
     std::iota(offsets_ptr, offsets_ptr + ntensors, 0);
     return offsets;
   }
-  const int64_t* sizes_ptr = sizes.data_ptr<int64_t>();
+  const int64_t* sizes_ptr = sizes.const_data_ptr<int64_t>();
   offsets_ptr[0] = 0;
   for (const auto i : c10::irange(ntensors - 1)) {
     const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies());
@@ -344,7 +344,7 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) {
       static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
       static_cast<uint64_t>(std::numeric_limits<size_t>::max()));
 
-  const int64_t* sizes_ptr = tensor.data_ptr<int64_t>();
+  const int64_t* sizes_ptr = tensor.const_data_ptr<int64_t>();
   const auto nt_dim = tensor.size(1);
   uint64_t num_elements{0};
 

diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h
@@ -228,7 +228,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
   }
   const Tensor &sizemat = nt->get_nested_sizes(),
                &stridemat = nt->get_nested_strides();
-  int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr<int64_t>();
+  const int64_t* offsets_ptr =
+      nt->get_storage_offsets().const_data_ptr<int64_t>();
   int64_t orig_dim = sizemat.size(1);
   // nesting scalars
   if (orig_dim == 0) {
@@ -243,8 +244,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
   // nesting tensors
   else {
     // if any underlying tensor is non-contiguous
-    const int64_t *sizemat_ptr = sizemat.data_ptr<int64_t>(),
-                  *stridemat_ptr = stridemat.data_ptr<int64_t>();
+    const int64_t *sizemat_ptr = sizemat.const_data_ptr<int64_t>(),
+                  *stridemat_ptr = stridemat.const_data_ptr<int64_t>();
     for (int64_t i = 0; i < ntensors; i++) {
       if (stridemat_ptr[orig_dim - 1] != 1) {
         return false;
@@ -263,8 +264,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
     if (offsets_ptr[0] != 0) {
       return false;
     }
-    sizemat_ptr = sizemat.data_ptr<int64_t>();
-    stridemat_ptr = stridemat.data_ptr<int64_t>();
+    sizemat_ptr = sizemat.const_data_ptr<int64_t>();
+    stridemat_ptr = stridemat.const_data_ptr<int64_t>();
     for (int64_t i = 1; i < ntensors; i++) {
       if (offsets_ptr[i] !=
           offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) {

diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
@@ -728,7 +728,7 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
 
 // KERNEL_PRIVATEUSEONE/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE
 // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastPrivateUse1
-#define KERNEL_PRIVATEUSEONE(OP, ...) \
+#define KERNEL_PRIVATEUSEONE(...) \
   KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__)
 
 #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \

diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp
@@ -72,7 +72,7 @@ static std::tuple<double, int> __printFormat(std::ostream& stream, const Tensor&
     return std::make_tuple(1., 0);
   }
   bool intMode = true;
-  auto self_p = self.data_ptr<double>();
+  auto self_p = self.const_data_ptr<double>();
   for (const auto i : c10::irange(size)) {
     auto z = self_p[i];
     if(std::isfinite(z)) {
@@ -189,7 +189,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line
     }
     for (const auto l : c10::irange(self.size(0))) {
       Tensor row = self.select(0,l);
-      double *row_ptr = row.data_ptr<double>();
+      const double *row_ptr = row.const_data_ptr<double>();
       for (const auto c : c10::irange(firstColumn, lastColumn+1)) {
         stream << std::setw(sz) << row_ptr[c]/scale;
         if(c == lastColumn) {
@@ -279,15 +279,15 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi
       tensor = tensor_.to(kCPU, kDouble).contiguous();
     }
     if(tensor.ndimension() == 0) {
-      stream << defaultfloat << tensor.data_ptr<double>()[0] << '\n';
+      stream << defaultfloat << tensor.const_data_ptr<double>()[0] << '\n';
       stream << "[ " << tensor_.toString() << "{}";
     } else if(tensor.ndimension() == 1) {
       if (tensor.numel() > 0) {
         auto [scale, sz] = __printFormat(stream, tensor);
         if(scale != 1) {
           printScale(stream, scale);
         }
-        double* tensor_p = tensor.data_ptr<double>();
+        const double* tensor_p = tensor.const_data_ptr<double>();
         for (const auto i : c10::irange(tensor.size(0))) {
           stream << std::setw(sz) << tensor_p[i]/scale << '\n';
         }

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h
@@ -126,32 +126,44 @@ struct VecConvert<int32_t, 1, uint8_t, 1> {
   }
 };
 
+template <typename dst_t, typename src_t>
+struct VecConvert<
+    dst_t,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<
+        (is_reduced_floating_point_v<dst_t> && is_8bit_integer_v<src_t>) ||
+            (is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
+        void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
+    VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
+    return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
+  }
+};
+
 template <typename dst_t>
 struct VecConvert<
-  dst_t,
-  1,
-  float,
-  1,
-  typename std::enable_if_t<
-    std::is_same_v<dst_t, unsigned char> || std::is_same_v<dst_t, signed char>,
-    void>> {
-  static inline VectorizedN<dst_t, 1> apply(
-      const VectorizedN<float, 1>& src) {
+    dst_t,
+    1,
+    float,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<dst_t>,
+        void>> {
+  static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 1>& src) {
     return convert_float_to_int8<dst_t>(src[0]);
   }
 };
 
 template <typename src_t>
 struct VecConvert<
-  float,
-  1,
-  src_t,
-  1,
-  typename std::enable_if_t<
-    std::is_same_v<src_t, unsigned char> || std::is_same_v<src_t, signed char>,
-    void>> {
-  static inline VectorizedN<float, 1> apply(
-      const VectorizedN<src_t, 1>& src) {
+    float,
+    1,
+    src_t,
+    1,
+    typename std::enable_if_t<is_8bit_integer_v<src_t>,
+        void>> {
+  static inline VectorizedN<float, 1> apply(const VectorizedN<src_t, 1>& src) {
     return convert_int8_to_float<src_t>(src[0]);
   }
 };