avoid some shared_ptr copying on "[PyTorch] Fix getCustomClassType() perf"

1) getCustomClassType() was copying the entire hash table every time it was called.
2) We don't need to do a hash lookup at all.

Differential Revision: [D25385543](https://our.internmc.facebook.com/intern/diff/D25385543/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D25385543/)!

[ghstack-poisoned]
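The gist of both fixes, as a minimal standalone sketch (names and structure simplified for illustration, not the verbatim PyTorch source): return the registry map by reference so no copy is made, and cache the per-type lookup in a function-local static so the hash lookup happens at most once per template instantiation.

```cpp
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

struct ClassType { std::string qualified_name; };
using ClassTypePtr = std::shared_ptr<ClassType>;

// Fix 1: return the registry by reference. Returning it by value would
// copy the entire hash table on every call.
std::unordered_map<std::type_index, ClassTypePtr>& getCustomClassTypeMap() {
  static std::unordered_map<std::type_index, ClassTypePtr> registry;
  return registry;
}

// Fix 2: classes are never unregistered, so cache the lookup result in a
// function-local static; the hash lookup runs at most once per T (and
// throws if T was never registered), and callers get a const reference
// instead of a refcount-bumping shared_ptr copy.
template <typename T>
const ClassTypePtr& getCustomClassType() {
  static const ClassTypePtr cached =
      getCustomClassTypeMap().at(std::type_index(typeid(T)));
  return cached;
}
```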
swolchok committed Dec 9, 2020
2 parents da1995f + 274ce26 commit 0deedff
Showing 84 changed files with 1,934 additions and 1,425 deletions.
2 changes: 2 additions & 0 deletions .jenkins/pytorch/codegen-test.sh
@@ -38,6 +38,8 @@ mkdir -p "$OUT"/pyi/torch/_C
 mkdir -p "$OUT"/pyi/torch/nn
 python -m tools.pyi.gen_pyi \
   --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \
+  --native-functions-path aten/src/ATen/native/native_functions.yaml \
+  --deprecated-functions-path tools/autograd/deprecated.yaml \
   --out "$OUT"/pyi

 # autograd codegen (called by torch codegen but can run independently)
2 changes: 1 addition & 1 deletion README.md
@@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex
 On Linux
 ```bash
 # Add LAPACK support for the GPU if needed
-conda install -c pytorch magma-cuda102  # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version
+conda install -c pytorch magma-cuda110  # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
 ```

 On MacOS
20 changes: 10 additions & 10 deletions android/test_app/app/build.gradle
@@ -60,20 +60,20 @@ android {
   //}
   flavorDimensions "model", "build", "activity"
   productFlavors {
-    mbq {
+    mnet {
       dimension "model"
-      applicationIdSuffix ".mbq"
-      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"")
-      addManifestPlaceholders([APP_NAME: "MBQ"])
-      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"")
+      applicationIdSuffix ".mnet"
+      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"")
+      addManifestPlaceholders([APP_NAME: "MNET"])
+      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"")
     }
-    mbvulkan {
+    mnetVulkan {
       dimension "model"
-      applicationIdSuffix ".mbvulkan"
-      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"")
+      applicationIdSuffix ".mnet_vulkan"
+      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"")
       buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true')
-      addManifestPlaceholders([APP_NAME: "MBQ"])
-      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"")
+      addManifestPlaceholders([APP_NAME: "MNET_VULKAN"])
+      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"")
     }
     resnet18 {
       dimension "model"
4 changes: 2 additions & 2 deletions aten/src/ATen/BatchingRegistrations.cpp
@@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule(
       size.size(), ") must match dimensionality of strides (",
       stride.size(), ")");
   auto storage_size = native::storage_size_for(size, stride);
-  for (int64_t idx = 0; idx < physical_strides.size(); ++idx) {
-    physical_strides[idx] *= storage_size;
+  for (auto& physical_stride : physical_strides) {
+    physical_stride *= storage_size;
   }

   // physical_strides = [B1 * B2 * S, B2 * S, S] + strides
4 changes: 2 additions & 2 deletions aten/src/ATen/NamedTensorUtils.cpp
@@ -264,11 +264,11 @@ static std::vector<Dimname> compute_dot_product_outnames(
   }
   std::vector<Dimname> outnames(num_outnames, Dimname::wildcard());
   int64_t index = 0;
-  for (int64_t j = 0; j < tensor_names.size(); ++j) {
+  for (size_t j = 0; j < tensor_names.size(); ++j) {
     if (j == tensor_dotted_dim) continue;
     outnames[index++] = tensor_names[j];
   }
-  for (int64_t j = 0; j < other_names.size(); ++j) {
+  for (size_t j = 0; j < other_names.size(); ++j) {
     if (j == other_dotted_dim) continue;
     outnames[index++] = other_names[j];
   }
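Several hunks in this commit make the same mechanical change from `int64_t` loop indices to `size_t`. A minimal illustration of why (hypothetical snippet, not from the diff): comparing a signed index against an unsigned `.size()` result triggers `-Wsign-compare`, and the fix is to use the container's own size type.

```cpp
#include <cstddef>
#include <vector>

int count_positive(const std::vector<int>& v) {
  int count = 0;
  // Before: `for (int64_t j = 0; j < v.size(); ++j)` compares a signed
  // index with the unsigned result of size(), triggering -Wsign-compare.
  for (std::size_t j = 0; j < v.size(); ++j) {
    if (v[j] > 0) ++count;
  }
  return count;
}
```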
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorIterator.cpp
@@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso
 }

 void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
-  for (int i = 0; i < config.tensors_.size(); i++) {
-    operands_.emplace_back(std::move(config.tensors_[i]));
+  for (auto& tensor: config.tensors_) {
+    operands_.emplace_back(std::move(tensor));
   }
   num_outputs_ = config.num_outputs_;
 }
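The populate_operands change swaps an indexed loop for a range-for that moves each element out of the source vector. A standalone sketch of the same pattern (illustrative names only): this is safe as long as the moved-from container is not read again afterwards.

```cpp
#include <string>
#include <utility>
#include <vector>

std::vector<std::string> drain(std::vector<std::string>& src) {
  std::vector<std::string> dst;
  dst.reserve(src.size());
  // Take a mutable reference so std::move can steal each element's buffer;
  // src's elements are left in a valid-but-unspecified state.
  for (auto& s : src) {
    dst.emplace_back(std::move(s));
  }
  return dst;
}
```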
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorNames.cpp
@@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
 }

 TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) {
-  int64_t size_diff = std::labs(names_.size() - other.names_.size());
+  size_t size_diff = std::labs(names_.size() - other.names_.size());

   if (names_.size() > other.names_.size()) {
-    for (int64_t idx = size_diff; idx < names_.size(); ++idx) {
+    for (size_t idx = size_diff; idx < names_.size(); ++idx) {
       names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name);
     }
   } else {
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.cpp
@@ -22,7 +22,7 @@ namespace ivalue {

 // This is in ivalue.cpp because we need to access Type::annotation_str, which
 // is declared in jit_type.h
-void checkCustomClassType(TypePtr expected_type, TypePtr actual_type) {
+void checkCustomClassType(const Type* expected_type, const Type* actual_type) {
   // NB: doing pointer comparison here
   // If in the future there ever arises a need to call operator== on custom class
   // Type's, this needs to be changed!
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.h
@@ -959,7 +959,7 @@ c10::ClassTypePtr getCustomClassTypeImpl() {
 }

 template <typename T>
-c10::ClassTypePtr getCustomClassType() {
+const c10::ClassTypePtr& getCustomClassType() {
   // Classes are never unregistered from getCustomClassTypeMap and the
   // hash lookup can be a hot path, so just cache.
   // For the same reason, it's fine If this ends up getting duplicated across
10 changes: 5 additions & 5 deletions aten/src/ATen/core/ivalue_inl.h
@@ -172,7 +172,7 @@ inline at::Generator IValue::toGenerator() const& {
 namespace ivalue {

 void CAFFE2_API
-checkCustomClassType(TypePtr expected_type, TypePtr actual_type);
+checkCustomClassType(const Type* expected_type, const Type* actual_type);

 template <typename T>
 using Shared = c10::intrusive_ptr<T>;

@@ -756,8 +756,8 @@ c10::intrusive_ptr<T> IValue::toCustomClass() && {
       obj->slots().size() == 1,
       "Tried to cast IValue to custom class but it did "
       "not contain a custom class!");
-  auto expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>();
-  ivalue::checkCustomClassType(expected_type, type());
+  const Type* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
   auto userObj =
       c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
   return userObj;

@@ -774,8 +774,8 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
       obj->slots().size() == 1,
       "Tried to cast IValue to custom class but it did "
       "not contain a custom class!");
-  auto expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>();
-  ivalue::checkCustomClassType(expected_type, type());
+  const Type* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
   auto userObj =
       c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
   return userObj;
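Why the call sites above now pass `const Type*` rather than `TypePtr`: checkCustomClassType() only compares pointers, so it never needs ownership, and borrowing a raw pointer avoids atomic refcount traffic on every cast. A minimal sketch of the idea (types simplified, not the PyTorch definitions):

```cpp
#include <memory>

struct Type { virtual ~Type() = default; };
using TypePtr = std::shared_ptr<Type>;

// Passing TypePtr by value here would copy a shared_ptr per argument:
// two atomic refcount increments (and later decrements) per call.
// Borrowing a raw pointer costs nothing, and is safe because both
// types are kept alive by their owners for the duration of the call.
bool sameType(const Type* expected, const Type* actual) {
  return expected == actual;  // NB: pointer comparison, as in the diff
}
```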
8 changes: 4 additions & 4 deletions aten/src/ATen/cuda/CUDASolver.cpp
@@ -46,14 +46,14 @@ void getrf<c10::complex<double>>(
   TORCH_CUSOLVER_CHECK(cusolverDnZgetrf_bufferSize(
       handle, m, n, reinterpret_cast<cuDoubleComplex*>(dA), ldda, &lwork));
   auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  void* buffer = allocator.allocate(sizeof(cuDoubleComplex) * lwork).get();
+  auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex) * lwork);
   TORCH_CUSOLVER_CHECK(cusolverDnZgetrf(
       handle,
       m,
       n,
       reinterpret_cast<cuDoubleComplex*>(dA),
       ldda,
-      static_cast<cuDoubleComplex*>(buffer),
+      static_cast<cuDoubleComplex*>(dataPtr.get()),
       ipiv,
       info));
 }

@@ -71,14 +71,14 @@ void getrf<c10::complex<float>>(
   TORCH_CUSOLVER_CHECK(cusolverDnCgetrf_bufferSize(
       handle, m, n, reinterpret_cast<cuComplex*>(dA), ldda, &lwork));
   auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  void* buffer = allocator.allocate(sizeof(cuComplex) * lwork).get();
+  auto dataPtr = allocator.allocate(sizeof(cuComplex) * lwork);
   TORCH_CUSOLVER_CHECK(cusolverDnCgetrf(
       handle,
       m,
       n,
       reinterpret_cast<cuComplex*>(dA),
       ldda,
-      static_cast<cuComplex*>(buffer),
+      static_cast<cuComplex*>(dataPtr.get()),
       ipiv,
       info));
 }
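This hunk appears to fix a lifetime bug, not just style: allocator.allocate() returns an owning handle, and calling .get() on the temporary frees the buffer at the end of the full expression. A standalone sketch of the pattern, with std::unique_ptr standing in for the allocator's DataPtr:

```cpp
#include <cstddef>
#include <memory>

// Stand-in for allocator.allocate(): returns an owning handle.
std::unique_ptr<char[]> allocate(std::size_t n) {
  return std::make_unique<char[]>(n);
}

void demo() {
  // BUG (old code): .get() on the temporary handle; the buffer is freed
  // at the end of this statement, so `buffer` would dangle:
  //   char* buffer = allocate(64).get();

  // FIX (new code): bind the owning handle to a local so the buffer
  // stays alive for the whole call that uses it.
  auto dataPtr = allocate(64);
  char* buffer = dataPtr.get();
  (void)buffer;  // e.g. pass to cusolverDn*getrf(...)
}
```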
8 changes: 2 additions & 6 deletions aten/src/ATen/native/Convolution.cpp
@@ -177,14 +177,10 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at
   int64_t outsize = 1;
   if (transposed) {
     std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
-    for (int64_t i = 1; i < o.size(); i++) {
-      outsize *= o[i];
-    }
+    outsize = prod_intlist(o.begin() + 1, o.end());
   } else {
     std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
-    for (int64_t i = 1; i < o.size(); i++) {
-      outsize *= o[i];
-    }
+    outsize = prod_intlist(o.begin() + 1, o.end());
   }
   return outsize > int_max;
 }
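prod_intlist is presumably a small helper along these lines (a sketch under that assumption, not the actual ATen definition), so the two hand-rolled loops collapse into a single call over `[begin + 1, end)`, i.e. the product of all dimensions except the leading batch dimension:

```cpp
#include <cstdint>
#include <functional>
#include <numeric>

// Multiply everything in [begin, end), starting from 1.
template <typename Iter>
int64_t prod_intlist(Iter begin, Iter end) {
  return std::accumulate(begin, end, static_cast<int64_t>(1),
                         std::multiplies<int64_t>());
}
```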
5 changes: 4 additions & 1 deletion aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -188,6 +188,9 @@ FOREACH_UNARY_OP(sinh);
 FOREACH_UNARY_OP(round);
 FOREACH_UNARY_OP(lgamma);
 FOREACH_UNARY_OP(frac);
+FOREACH_UNARY_OP(trunc);
+FOREACH_UNARY_OP(reciprocal);
+FOREACH_UNARY_OP(sigmoid);

 FOREACH_POINTWISE_OP_SCALAR(addcdiv);
 FOREACH_POINTWISE_OP_SCALAR(addcmul);

@@ -201,7 +204,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList
                                                              \
   std::vector<Tensor> result;                                \
   result.reserve(tensors1.size());                           \
-  for (int i = 0; i < tensors1.size(); i++) {                \
+  for (size_t i = 0; i < tensors1.size(); i++) {             \
     result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \
   }                                                          \
                                                              \
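For context, the slow path that FOREACH_UNARY_OP generates presumably mirrors the binary version visible above: one at:: call per tensor. A sketch of a plausible expansion for `trunc` (names inferred, not the actual macro output; requires the ATen headers):

```cpp
#include <ATen/ATen.h>

// Plausible shape of FOREACH_UNARY_OP(trunc)'s slow path: apply the
// regular per-tensor op in a loop (input validation omitted here).
std::vector<at::Tensor> foreach_tensor_trunc_slow(at::TensorList tensors) {
  std::vector<at::Tensor> result;
  result.reserve(tensors.size());
  for (size_t i = 0; i < tensors.size(); i++) {
    result.emplace_back(at::trunc(tensors[i]));
  }
  return result;
}
```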
