Update base for Update on "Redo Vulkan command and descriptor pools."
Differential Revision: [D23820829](https://our.internmc.facebook.com/intern/diff/D23820829)

[ghstack-poisoned]
Ashkan Aliabadi committed Sep 22, 2020
2 parents 8b1910a + aae0777 commit bce9377
Showing 132 changed files with 3,970 additions and 1,770 deletions.
.circleci/cimodel/data/simple/nightly_ios.py (1 addition, 1 deletion)
@@ -60,7 +60,7 @@ def gen_tree(self):
 
 
 WORKFLOW_DATA = BUILD_CONFIGS + [
-    IOSNightlyJob("binary", is_upload=True),
+    # IOSNightlyJob("binary", is_upload=True),
 ]
 
 
.circleci/config.yml (9 deletions)
@@ -7021,15 +7021,6 @@ workflows:
           ios_arch: arm64
           ios_platform: OS
           name: pytorch_ios_11_2_1_nightly_arm64_build
-      - binary_ios_upload:
-          build_environment: libtorch-ios-11.2.1-nightly-binary-build-upload
-          context: org-member
-          filters:
-            branches:
-              only: nightly
-          requires:
-            - pytorch_ios_11_2_1_nightly_x86_64_build
-            - pytorch_ios_11_2_1_nightly_arm64_build
       - pytorch_linux_build:
           build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32
           docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c
.gitignore (1 addition)
@@ -261,6 +261,7 @@ TAGS
 
 # clangd background index
 .clangd/
+.cache/
 
 # bazel symlinks
 bazel-*
.jenkins/caffe2/test.sh (1 addition, 1 deletion)
@@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
     # default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
     # Fix the pip error: Couldn't find a version that satisfies the requirement
     pip install --upgrade pip
-    pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122
+    pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182
   fi
   "$ROOT_DIR/scripts/onnx/test.sh"
 fi
.jenkins/pytorch/test.sh (17 additions)
@@ -352,6 +352,22 @@ test_cpp_extensions() {
   assert_git_not_dirty
 }
 
+test_vec256() {
+  # This is to test vec256 instructions DEFAULT/AVX/AVX2 (platform dependent, some platforms might not support AVX/AVX2)
+  if [[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
+    echo "Testing vec256 instructions"
+    mkdir -p test/test-reports/vec256
+    pushd build/bin
+    vec256_tests=$(find . -maxdepth 1 -executable -name 'vec256_test*')
+    for vec256_exec in $vec256_tests
+    do
+      $vec256_exec --gtest_output=xml:test/test-reports/vec256/$vec256_exec.xml
+    done
+    popd
+    assert_git_not_dirty
+  fi
+}
+
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
   (cd test && python -c "import torch; print(torch.__config__.show())")
   (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@@ -395,6 +411,7 @@ else
   test_python_all_except_nn_and_cpp_extensions
   test_cpp_extensions
   test_aten
+  test_vec256
   test_libtorch
   test_custom_script_ops
   test_custom_backend
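The vec256_test* binaries collected by the new test_vec256 step are ordinary googletest executables built alongside libtorch. As a rough illustration of the pattern such a test follows, here is a hypothetical sketch (not code from this commit; it assumes the Vec256<float>::size()/loadu()/store() API from ATen/cpu/vec256/vec256.h) that checks a vectorized operation lane by lane against a scalar reference:

// Hypothetical vec256-style test (not part of this commit): compare a
// Vec256<float> addition against a plain scalar loop.
// Assumes Vec256<float>::size()/loadu()/store() from ATen/cpu/vec256/vec256.h.
#include <ATen/cpu/vec256/vec256.h>
#include <gtest/gtest.h>

TEST(Vec256Sketch, AddMatchesScalar) {
  using Vec = at::vec256::Vec256<float>;
  constexpr int N = Vec::size();
  float a[N], b[N], out[N];
  for (int i = 0; i < N; ++i) {
    a[i] = static_cast<float>(i);
    b[i] = 0.5f * static_cast<float>(i);
  }
  // Vectorized path: load both operands, add, store the result.
  (Vec::loadu(a) + Vec::loadu(b)).store(out);
  // Scalar reference: every lane must agree with plain float addition.
  for (int i = 0; i < N; ++i) {
    EXPECT_FLOAT_EQ(out[i], a[i] + b[i]);
  }
}

Passing --gtest_output=xml:... exactly as the harness above does makes each binary drop a JUnit-style report into test/test-reports/vec256/.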
aten/src/ATen/LegacyTHFunctionsCPU.cpp (119 deletions)
@@ -743,125 +743,6 @@ std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim)
     }
     return std::tuple<Tensor, Tensor>(values, indices);
 }
-std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-
-    switch (dispatch_scalar_type) {
-        case ScalarType::Byte: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THByteTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Char: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THCharTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Double: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Float: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Int: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THIntTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Long: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THLongTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Short: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THShortTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Half: {
-            auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            auto indices_ = checked_dense_tensor_unwrap(indices, "indices", 0, "_th_sort_out", false, DeviceType::CPU, ScalarType::Long);
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort_out", false, DeviceType::CPU, dispatch_scalar_type);
-            THHalfTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        default:
-            AT_ERROR("_th_sort_out not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor &, Tensor &>(values, indices);
-}
-std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending) {
-    // DeviceGuard omitted
-    auto dispatch_scalar_type = infer_scalar_type(self);
-    auto values_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
-    auto values = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(values_));
-    auto indices_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(ScalarType::Long)).release();
-    auto indices = Tensor(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(indices_));
-    switch (dispatch_scalar_type) {
-        case ScalarType::Byte: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THByteTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Char: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THCharTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Double: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THDoubleTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Float: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THFloatTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Int: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THIntTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Long: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THLongTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Short: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THShortTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        case ScalarType::Half: {
-            auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_sort", false, DeviceType::CPU, dispatch_scalar_type);
-            THHalfTensor_sort(values_, indices_, self_, dim, descending);
-            break;
-        }
-        default:
-            AT_ERROR("_th_sort not supported on CPUType for ", dispatch_scalar_type);
-    }
-    return std::tuple<Tensor, Tensor>(values, indices);
-}
 Tensor _th_var(const Tensor & self, bool unbiased) {
     // DeviceGuard omitted
     auto dispatch_scalar_type = infer_scalar_type(self);
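With these bindings deleted, CPU sort no longer flows through legacy TH; the native ATen operator provides the same (values, indices) contract. A minimal sketch of the replacement call, using the public at::sort API (illustrative, not code from this commit):

// Sorting through the native ATen operator that supersedes the removed
// _th_sort/_th_sort_out bindings. Illustrative only.
#include <ATen/ATen.h>
#include <tuple>

void sort_example() {
  at::Tensor self = at::randn({4, 8});
  at::Tensor values, indices;
  // Same contract as the deleted binding: values holds the sorted entries,
  // indices their original positions along the sorted dimension.
  std::tie(values, indices) = at::sort(self, /*dim=*/1, /*descending=*/true);
}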
aten/src/ATen/LegacyTHFunctionsCPU.h (2 deletions)
@@ -31,8 +31,6 @@ Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate);
 Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value);
 std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim);
 std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
-std::tuple<Tensor &,Tensor &> _th_sort_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool descending);
-std::tuple<Tensor,Tensor> _th_sort(const Tensor & self, int64_t dim, bool descending);
 Tensor _th_var(const Tensor & self, bool unbiased);
 Tensor _th_std(const Tensor & self, bool unbiased);
 Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
aten/src/ATen/core/aten_interned_strings.h (1 addition)
@@ -611,6 +611,7 @@ _(aten, sigmoid) \
 _(aten, sign) \
 _(aten, signbit) \
 _(aten, silu) \
+_(aten, sgn) \
 _(aten, sin) \
 _(aten, sinh) \
 _(aten, size) \
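For readers unfamiliar with the trailing-backslash shape of this file: aten_interned_strings.h is an X-macro list, and each include site defines _ to expand every (namespace, name) pair however it needs. A generic sketch of the idiom (hypothetical expansion, not PyTorch's actual Symbol machinery):

// Hypothetical X-macro expansion of an interned-strings list: each entry
// becomes a string constant. PyTorch's real consumer builds Symbol tables.
#include <cstdio>

#define FORALL_SYMBOLS(_) \
  _(aten, silu)           \
  _(aten, sgn)            \
  _(aten, sin)

#define DEFINE_NAME(ns, s) constexpr const char* k_##ns##_##s = #ns "::" #s;
FORALL_SYMBOLS(DEFINE_NAME)
#undef DEFINE_NAME

int main() {
  std::printf("%s\n", k_aten_sgn);  // prints "aten::sgn"
}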
aten/src/ATen/core/boxing/KernelFunction.cpp (7 additions)
@@ -19,6 +19,13 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) {
     "let us know in the bug tracker.");
 }
 
+void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) {
+  TORCH_INTERNAL_ASSERT(0,
+    op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. "
+    "If it's intended to override Math kernel behavior, please open an issue to request a dedicated "
+    "Autograd dispatch key for the backend.");
+}
+
 void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) {
   // DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING
   // See Note [named_not_supported_kernel]
aten/src/ATen/core/boxing/KernelFunction.h (13 additions)
@@ -17,6 +17,18 @@ struct OperatorKernel;
 // boxing/unboxing codepath.
 CAFFE2_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
 
+// Note [Ambiguity in AutogradOther kernel]
+// This kernel implements reporting an error message when there're kernels registered
+// to both Math and a backend of AutogradOther, we don't know which kernel to pick:
+//  - if we pick Math kernel for AutogradOther, the kernel registered to backend will be
+//    silently ignored and never called.
+//  - if we skip using Math kernel for AutogradOther (it might pick Autograd kernel if available),
+//    it'll break all backends mapped to AutogradOther without a direct registration to backend.
+// See c10/core/DispatchKeySet.cpp for a list of backends mapped to AutogradOther.
+// Thus if backend extender indeed want to override Math kernel behavior, they should request
+// a dedicated Autograd key for their backend to resolve the ambiguity.
+CAFFE2_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, Stack*);
+
 // Note [named_not_supported_kernel]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // This kernel implements reporting an error message saying that named tensor is
@@ -181,6 +193,7 @@ class CAFFE2_API KernelFunction final {
   static KernelFunction makeFromUnboxedOnlyRuntimeFunction(FuncType* func);
 
   static KernelFunction makeFallthrough();
+  static KernelFunction makeAmbiguousAutogradOther();
   static KernelFunction makeNamedNotSupported();
 
   /**
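The choice the Note describes collapses to a few lines of logic. The sketch below is conceptual, with hypothetical types rather than the dispatcher's real table-computation code, and shows why the error kernel installed by KernelFunction::makeAmbiguousAutogradOther() is the only safe option once both registrations are present:

// Conceptual sketch (hypothetical types) of the decision behind
// Note [Ambiguity in AutogradOther kernel].
struct OpTableEntry {
  bool has_math_kernel;                   // registered to the Math catch-all alias
  bool has_autogradother_backend_kernel;  // registered to a backend mapped to AutogradOther
};

enum class AutogradOtherChoice { UseMath, InstallErrorKernel, Unset };

AutogradOtherChoice pick_autogradother_kernel(const OpTableEntry& e) {
  if (e.has_math_kernel && e.has_autogradother_backend_kernel) {
    // Picking Math silently shadows the backend kernel; skipping Math breaks
    // backends that have no direct registration. Neither is safe, so report
    // an error at call time via makeAmbiguousAutogradOther().
    return AutogradOtherChoice::InstallErrorKernel;
  }
  if (e.has_math_kernel) {
    return AutogradOtherChoice::UseMath;
  }
  return AutogradOtherChoice::Unset;
}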
aten/src/ATen/core/boxing/KernelFunction_impl.h (8 additions)
@@ -83,6 +83,14 @@ inline KernelFunction KernelFunction::makeFallthrough() {
   );
 }
 
+inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() {
+  return KernelFunction(
+    nullptr, // no functor_ object
+    &ambiguous_autogradother_kernel,
+    nullptr // no unboxed function pointer
+  );
+}
+
 inline KernelFunction KernelFunction::makeNamedNotSupported() {
   return KernelFunction(
     nullptr, // no functor_ object
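Structurally, makeAmbiguousAutogradOther() mirrors the makeFallthrough() and makeNamedNotSupported() factories around it: the functor and the unboxed function pointer are left null and only the boxed entry point is populated, so any call to such a kernel ends in ambiguous_autogradother_kernel and raises the diagnostic added in KernelFunction.cpp.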
