Update on "Remove hacky_wrapper from VariableType and TraceType"
Previously, VariableType and TraceType kernels were still written in the legacy way: they took a single TensorOptions argument instead of scattered dtype, layout, device, and pin_memory arguments, and they relied on hacky_wrapper to be callable.

With this PR, variable and tracing kernels are written in the new way, and hacky_wrapper is no longer needed for them.

This only affects ops with `use_c10_dispatcher: full`.
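
For illustration, a rough sketch of the two calling conventions; the op name and exact parameter list are hypothetical, not taken from this PR:

```cpp
#include <ATen/ATen.h>

// Legacy-style kernel: takes a gathered TensorOptions argument and needed
// hacky_wrapper to adapt it to the dispatcher's scattered calling convention.
at::Tensor my_op_legacy(const at::Tensor& self, const at::TensorOptions& options);

// New-style kernel: TensorOptions is scattered into its components, so the
// kernel can be registered and called directly, without hacky_wrapper.
at::Tensor my_op(
    const at::Tensor& self,
    c10::optional<at::ScalarType> dtype,
    c10::optional<at::Layout> layout,
    c10::optional<at::Device> device,
    c10::optional<bool> pin_memory);
```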

Differential Revision: [D23466042](https://our.internmc.facebook.com/intern/diff/D23466042/)

[ghstack-poisoned]
smessmer committed Sep 24, 2020
2 parents 66186ce + dc67b47 commit b68b323
Showing 802 changed files with 9,957 additions and 4,743 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
@@ -924,7 +924,7 @@ jobs:
smoke_mac_test:
<<: *binary_linux_test_upload_params
macos:
xcode: "9.4.1"
xcode: "11.2.1"
steps:
- checkout
- run:
@@ -949,7 +949,7 @@ jobs:
binary_mac_build:
<<: *binary_mac_params
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout
@@ -1253,7 +1253,7 @@ jobs:
environment:
BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
- checkout
- run_brew_for_macos_build
@@ -1287,7 +1287,7 @@ jobs:
environment:
BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
- checkout
- attach_workspace:
4 changes: 2 additions & 2 deletions .circleci/verbatim-sources/job-specs/binary-job-specs.yml
@@ -135,7 +135,7 @@
smoke_mac_test:
<<: *binary_linux_test_upload_params
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
- checkout
- run:
@@ -160,7 +160,7 @@
binary_mac_build:
<<: *binary_mac_params
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
# See Note [Workspace for CircleCI scripts] in job-specs-setup.yml
- checkout
4 changes: 2 additions & 2 deletions .circleci/verbatim-sources/job-specs/job-specs-custom.yml
@@ -109,7 +109,7 @@
environment:
BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
- checkout
- run_brew_for_macos_build
@@ -143,7 +143,7 @@
environment:
BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
macos:
-xcode: "9.4.1"
+xcode: "11.2.1"
steps:
- checkout
- attach_workspace:
2 changes: 1 addition & 1 deletion .jenkins/caffe2/test.sh
@@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
# Fix the pip error: Couldn't find a version that satisfies the requirement
pip install --upgrade pip
-pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122
+pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182
fi
"$ROOT_DIR/scripts/onnx/test.sh"
fi
5 changes: 3 additions & 2 deletions .jenkins/pytorch/print_sccache_log.py
@@ -6,6 +6,7 @@
lines = f.readlines()

for line in lines:
-# Ignore errors from CPU instruction set testing
-if 'src.c' not in line:
+# Ignore errors from CPU instruction set or symbol existing testing
+keywords = ['src.c', 'CheckSymbolExists.c']
+if all([keyword not in line for keyword in keywords]):
print(line)
4 changes: 2 additions & 2 deletions Dockerfile
@@ -44,13 +44,13 @@ WORKDIR /opt/pytorch
COPY --from=conda /opt/conda /opt/conda
COPY --from=submodule-update /opt/pytorch /opt/pytorch
RUN --mount=type=cache,target=/opt/ccache \
-TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
python setup.py install

FROM conda as conda-installs
ARG INSTALL_CHANNEL=pytorch-nightly
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=10.1 && \
+RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=11.0.221 && \
/opt/conda/bin/conda clean -ya

FROM ${BASE_IMAGE} as official
3 changes: 2 additions & 1 deletion aten/src/ATen/CMakeLists.txt
@@ -78,6 +78,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu")
exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp})
file(GLOB native_cuda_cpp "native/cuda/*.cpp")
file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh")
file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh")
file(GLOB native_cudnn_cpp "native/cudnn/*.cpp")
file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu")
file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp")
@@ -372,7 +373,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"

set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
-list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${cudnn_h} ${hip_h} ${miopen_h})
+list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h})
endif()

# https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake
14 changes: 11 additions & 3 deletions aten/src/ATen/WrapDimUtils.h
@@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector<std::vector<
return maybe_wrap_dim(dim, tensor_sizes[0].size());
}

-// wrap each of dims basing on dim_post_expr
-static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_expr) {
+// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions
+static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) {
if (dim_post_expr <= 0) {
dim_post_expr = 1; // this will make range [-1, 0]
}
int64_t min = -dim_post_expr;
int64_t max = dim_post_expr - 1;
-for (auto& dim : dims) {
+for (int64_t i = 0; i < ndims; ++i) {
+  auto &dim = dims[i];
if (dim < min || dim > max) {
TORCH_CHECK_INDEX(false,
"Dimension out of range (expected to be in range of [",
@@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_
}
}

// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions
// E.g. could also be std::array or c10::SmallVector
template <typename Container>
inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) {
return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr);
}

// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
// to be "skipped" (both for wrap dimension behavior and dimension size checking).
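
For context, a minimal usage sketch of the templated `maybe_wrap_dims` overload added above (not part of the diff; the dimension values are made up):

```cpp
#include <ATen/WrapDimUtils.h>
#include <vector>

void wrap_dims_example() {
  // Wrap negative dims for a hypothetical 4-dimensional tensor.
  std::vector<int64_t> dims = {-1, 0, -3};
  at::maybe_wrap_dims(dims, /*dim_post_expr=*/4);
  // dims is now {3, 0, 1}; an out-of-range dim would trigger TORCH_CHECK_INDEX.
}
```
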
1 change: 1 addition & 0 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -611,6 +611,7 @@ _(aten, sigmoid) \
_(aten, sign) \
_(aten, signbit) \
_(aten, silu) \
_(aten, sgn) \
_(aten, sin) \
_(aten, sinh) \
_(aten, size) \
7 changes: 6 additions & 1 deletion aten/src/ATen/core/jit_type.h
@@ -263,7 +263,12 @@ struct SingleElementType : public Type {
}

protected:
-SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {}
+SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {
+  if (!this->elem) {
+    throw std::runtime_error(c10::str(
+        "Can not create ", typeKindToString(Kind), " with None type"));
+  }
+}

private:
TypePtr elem;
3 changes: 3 additions & 0 deletions aten/src/ATen/core/type.cpp
@@ -716,6 +716,9 @@ TupleType::TupleType(
schema_(std::move(schema)) {
has_free_variables_ =
std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) {
if (!v) {
throw std::runtime_error("Can not create tuple with None type");
}
return v->hasFreeVariables();
});
if (schema_) {
7 changes: 7 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_base.h
@@ -239,6 +239,13 @@ struct Vec256 {
// Specifically map() does not perform the type conversion needed by abs.
return map([](T x) { return static_cast<T>(std::abs(x)); });
}

template <typename other_t_sgn = T,
typename std::enable_if<c10::is_complex<other_t_sgn>::value, int>::type = 0>
Vec256<T> sgn() const {
return map(at::native::sgn_impl);
}

template <typename other_t_angle = T,
typename std::enable_if<!c10::is_complex<other_t_angle>::value, int>::type = 0>
Vec256<T> angle() const {
10 changes: 10 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_complex_double.h
@@ -134,6 +134,16 @@ template <> class Vec256<c10::complex<double>> {
auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle
return _mm256_and_pd(angle, real_mask); // angle 0
}
Vec256<c10::complex<double>> sgn() const {
auto abs = abs_();
auto zero = _mm256_setzero_pd();
auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
auto abs_val = Vec256(abs);

auto div = values / abs_val.values; // x / abs(x)

return blendv(div, zero, mask);
}
__m256d real_() const {
const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
10 changes: 10 additions & 0 deletions aten/src/ATen/cpu/vec256/vec256_complex_float.h
@@ -171,6 +171,16 @@ template <> class Vec256<c10::complex<float>> {
auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle
return _mm256_and_ps(angle, real_mask); // angle 0
}
Vec256<c10::complex<float>> sgn() const {
auto abs = abs_();
auto zero = _mm256_setzero_ps();
auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
auto abs_val = Vec256(abs);

auto div = values / abs_val.values; // x / abs(x)

return _mm256_blendv_ps(div, zero, mask);
}
__m256 real_() const {
const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
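
For reference, a scalar sketch of the behaviour the vectorized `sgn` kernels above implement: `x / |x|` for non-zero complex inputs and `0` at zero (an illustration, not code from this commit):

```cpp
#include <complex>

// Hypothetical scalar reference for complex sgn: 0 when x == 0, x / |x| otherwise.
template <typename T>
std::complex<T> sgn_reference(std::complex<T> x) {
  T a = std::abs(x);
  return a == T(0) ? std::complex<T>(0) : x / a;
}
```
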
10 changes: 6 additions & 4 deletions aten/src/ATen/native/BinaryOps.cpp
@@ -175,7 +175,7 @@ Tensor& divide_(Tensor& self, Scalar other) {

// true_divide, an alias for div
Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) {
-return native::div_out(result, self, divisor);
+return at::div_out(result, self, divisor);
}

Tensor true_divide(const Tensor& self, const Tensor& divisor) {
@@ -390,14 +390,16 @@ Tensor rsub(const Tensor& self, const Tensor& other, Scalar alpha) {
}

Tensor& atan2_out(Tensor& result, const Tensor& self, const Tensor& other) {
-auto iter = TensorIterator::binary_op(result, self, other);
+auto iter = TensorIterator::binary_float_op(result, self, other);
atan2_stub(iter.device_type(), iter);
return result;
}

Tensor atan2(const Tensor& self, const Tensor& other) {
-Tensor result = at::empty({0}, self.options());
-return native::atan2_out(result, self, other);
+Tensor result;
+auto iter = TensorIterator::binary_float_op(result, self, other);
+atan2_stub(iter.device_type(), iter);
+return iter.output();
}

Tensor& atan2_(Tensor& self, const Tensor& other) {
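
The switch from `binary_op` to `binary_float_op` concerns type promotion: with a float-op iterator, integral inputs are expected to be promoted so that the result is floating point. A usage sketch under that assumption (not code from this commit):

```cpp
#include <ATen/ATen.h>

void atan2_promotion_example() {
  auto a = at::arange(1, 5);   // integral (int64) tensor
  auto b = at::arange(5, 9);   // integral (int64) tensor
  // With the float-op iterator, atan2 on integral inputs should produce a
  // floating-point result rather than an integral one.
  auto r = at::atan2(a, b);
}
```
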
12 changes: 12 additions & 0 deletions aten/src/ATen/native/README.md
@@ -277,6 +277,18 @@ them the same thing!)
If two backends have the same dispatch function, you can write `CPU, CUDA: func`
to reuse the same function name in both cases.

Available backend options can be found at
https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py#L970.
In addition to the backends above, we also support the keyword `Math`, an alias
that maps to all backend and autograd backend keys. In other words, a function registered
to the `Math` key should be a plain mathematical composition of other `at::` functions and
work for any backend.

If you add a `dispatch` section to an API that didn't have one before, you **have to** move
the old implementation to the `Math` field so that it's still available for other backends to use.

This work is currently WIP; you can find the design proposal in
https://github.com/pytorch/pytorch/issues/44680.
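
For illustration (hypothetical, not part of this diff): a kernel suitable for the `Math` key is just a composition of other `at::` functions, so a single definition can serve every backend.

```cpp
#include <ATen/ATen.h>

// Hypothetical composite op: because it only calls other at:: functions,
// one registration under the Math key could cover all backends.
at::Tensor my_composite_op(const at::Tensor& self, const at::Tensor& other) {
  return at::mul(self, other) + at::sin(self);
}
```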

### `device_guard`

```
