[PyTorch] Devirtualize TensorImpl::storage() (#51050)
Summary:
Pull Request resolved: #51050

Some TensorImpl subclasses need storage() calls to throw. We use free space in
TensorImpl to add a flag that such subclasses can set to get that behavior
without making storage() virtual, so storage() should still be inlineable.
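
Conceptually, the change replaces a virtual storage() with a non-virtual one guarded by a bool. A minimal standalone sketch of that pattern (hypothetical names, plain C++ exceptions instead of TORCH_CHECK; the real code in c10/core/TensorImpl.h below also uses C10_UNLIKELY and TENSORIMPL_MAYBE_VIRTUAL):

```cpp
#include <stdexcept>
#include <string>

struct Storage {};  // stand-in for c10::Storage

class TensorImplSketch {
 public:
  // Non-virtual and inlineable: the hot path is one flag test plus a
  // direct member load.
  const Storage& storage() const {
    if (storage_access_should_throw_) {
      throw_storage_access_error();  // cold path
    }
    return storage_;
  }

  virtual ~TensorImplSketch() = default;

 protected:
  // Storage-less subclasses call this from their constructors.
  void set_storage_access_should_throw() {
    storage_access_should_throw_ = true;
  }

  // Only the error-message hook stays virtual; it never runs on the hot path.
  virtual const char* tensorimpl_type_name() const {
    return "TensorImplSketch";
  }

 private:
  // In the real change this is declared [[noreturn]] and defined out of line
  // in TensorImpl.cpp so the inlined storage() stays small.
  [[noreturn]] void throw_storage_access_error() const {
    throw std::runtime_error(std::string("Cannot access storage of ") +
                             tensorimpl_type_name());
  }

  Storage storage_;
  bool storage_access_should_throw_ = false;
};
```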
ghstack-source-id: 121819684

Test Plan:
Compared `perf stat` results for 1M iterations of the AdIndexer benchmark before and after this change.

Before:
```
         74,483.15 msec task-clock                #    0.999 CPUs utilized            ( +-  0.14% )
            16,637      context-switches          #    0.223 K/sec                    ( +- 11.97% )
                 3      cpu-migrations            #    0.000 K/sec                    ( +-  7.20% )
           107,085      page-faults               #    0.001 M/sec                    ( +-  2.39% )
   147,356,440,831      cycles                    #    1.978 GHz                      ( +-  0.14% )  (50.06%)
   278,678,430,378      instructions              #    1.89  insn per cycle           ( +-  0.01% )  (50.05%)
    43,540,698,177      branches                  #  584.571 M/sec                    ( +-  0.01% )  (50.05%)
       141,028,843      branch-misses             #    0.32% of all branches          ( +-  1.00% )  (50.05%)

```

After:
```
         74,178.77 msec task-clock                #    0.999 CPUs utilized            ( +-  0.31% )
            17,125      context-switches          #    0.231 K/sec                    ( +-  3.41% )
                 3      cpu-migrations            #    0.000 K/sec
           109,535      page-faults               #    0.001 M/sec                    ( +-  1.04% )
   146,803,364,372      cycles                    #    1.979 GHz                      ( +-  0.30% )  (50.03%)
   277,726,600,254      instructions              #    1.89  insn per cycle           ( +-  0.02% )  (50.03%)
    43,299,659,815      branches                  #  583.720 M/sec                    ( +-  0.03% )  (50.03%)
       130,504,094      branch-misses             #    0.30% of all branches          ( +-  1.14% )  (50.03%)

```

Looks like an approximately 0.3% instruction-count win ((278,678,430,378 - 277,726,600,254) / 278,678,430,378 ≈ 0.34%); cycles improve by a similar amount, but that difference is within noise.

Reviewed By: ezyang

Differential Revision: D26013815

fbshipit-source-id: 07939957929070e18b9981d492d8279c9bb33c55
swolchok authored and facebook-github-bot committed Feb 17, 2021
1 parent 4305609 commit 059ee85
Showing 14 changed files with 78 additions and 23 deletions.
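
The subclass changes below all follow the same pattern: drop the throwing storage() override, call set_storage_access_should_throw() in the constructor, and override the new protected tensorimpl_type_name() hook so the error message names the concrete type. A condensed sketch of that pattern using a hypothetical NoStorageTensorImpl (constructor arguments abbreviated; not a literal excerpt from this commit):

```cpp
#include <c10/core/TensorImpl.h>

// Hypothetical subclass illustrating the opt-in used here by BatchedTensorImpl,
// OpaqueTensorImpl, SparseTensorImpl, UndefinedTensorImpl, etc.
struct NoStorageTensorImpl : public c10::TensorImpl {
  explicit NoStorageTensorImpl(c10::DispatchKeySet key_set,
                               const caffe2::TypeMeta data_type,
                               c10::optional<c10::Device> device)
      : TensorImpl(key_set, data_type, device) {
    // Replaces the old throwing `const Storage& storage() const override`.
    set_storage_access_should_throw();
  }

 private:
  // Consumed by TensorImpl::throw_storage_access_error() when building the
  // "Cannot access storage of ..." message.
  const char* tensorimpl_type_name() const override {
    return "NoStorageTensorImpl";
  }
};
```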
9 changes: 5 additions & 4 deletions aten/src/ATen/BatchedTensorImpl.cpp
@@ -15,6 +15,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims)
, bdims_(std::move(bdims))
{
TORCH_INTERNAL_ASSERT(value_.defined());
set_storage_access_should_throw();
checkInvariants();

const auto public_dims = value_.dim() - bdims_.size();
@@ -80,10 +81,6 @@ bool BatchedTensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
return is_contiguous_;
}

const Storage& BatchedTensorImpl::storage() const {
TORCH_CHECK(false, "Due to limitations, we cannot access the storage() of a tensor from inside of vmap.");
}

// The following are some internal inherited methods that we do not support.
// They should never get called.
void BatchedTensorImpl::set_size(int64_t dim, int64_t new_size) {
@@ -102,6 +99,10 @@ bool BatchedTensorImpl::has_storage() const {
}
#endif

const char* BatchedTensorImpl::tensorimpl_type_name() const {
return "BatchedTensorImpl";
}

Tensor makeBatched(const Tensor& tensor, BatchDims bdims) {
TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor));
auto tensor_dim = tensor.dim();
2 changes: 1 addition & 1 deletion aten/src/ATen/BatchedTensorImpl.h
@@ -80,11 +80,11 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;

private:
// see NOTE: [BatchedTensorImpl levels invariant]
void checkInvariants() const;
const char* tensorimpl_type_name() const override;

Tensor value_;

9 changes: 5 additions & 4 deletions aten/src/ATen/OpaqueTensorImpl.h
@@ -28,6 +28,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
bool is_non_overlapping_and_dense = true)
: TensorImpl(key_set, data_type, device),
opaque_handle_(std::move(opaque_handle)) {
set_storage_access_should_throw();
sizes_and_strides_.set_sizes(sizes);
refresh_numel();
is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
@@ -71,10 +72,6 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
}
#endif

const Storage& storage() const override {
AT_ERROR("opaque tensors do not have storage");
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@@ -180,6 +177,10 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
}

private:
const char* tensorimpl_type_name() const override {
return "OpaqueTensorImpl";
}

OpaqueHandle opaque_handle_;
};

7 changes: 5 additions & 2 deletions aten/src/ATen/SparseTensorImpl.cpp
@@ -50,6 +50,7 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ
AT_ASSERT(values_.device() == device());

is_non_overlapping_and_dense_ = false;
set_storage_access_should_throw();
}

IntArrayRef SparseTensorImpl::strides() const {
@@ -76,9 +77,11 @@ bool SparseTensorImpl::has_storage() const {
return false;
}
#endif
const Storage& SparseTensorImpl::storage() const {
AT_ERROR("sparse tensors do not have storage");

const char* SparseTensorImpl::tensorimpl_type_name() const {
return "SparseTensorImpl";
}

void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, const Tensor& values) {
TORCH_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe ", err_msg_tensor_metadata_change_not_allowed);

3 changes: 2 additions & 1 deletion aten/src/ATen/SparseTensorImpl.h
@@ -50,7 +50,6 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;

// WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with
// respect to indices and values
@@ -261,6 +260,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
dest_sparse_impl->values_ = src_sparse_impl->values();
dest_sparse_impl->coalesced_ = src_sparse_impl->coalesced();
}

const char* tensorimpl_type_name() const override;
};

} // namespace at
4 changes: 4 additions & 0 deletions aten/src/ATen/native/metal/MetalTensorImpl.h
@@ -48,6 +48,10 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl<OpaqueHandle> {
}

private:
const char* tensorimpl_type_name() const override {
return "MetalTensorImpl";
}

SmallVector<int64_t, 5> strides_;
};
} // namespace at
4 changes: 4 additions & 0 deletions aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h
@@ -41,6 +41,10 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl<OpaqueHandle> {
}

private:
const char* tensorimpl_type_name() const override {
return "VulkanOpaqueTensorImpl";
}

SmallVector<int64_t, 5> strides_;
};

4 changes: 4 additions & 0 deletions aten/src/ATen/quantized/QTensorImpl.cpp
@@ -10,4 +10,8 @@ QTensorImpl::QTensorImpl(
: TensorImpl(std::move(storage), key_set, data_type),
quantizer_(quantizer) {}

const char* QTensorImpl::tensorimpl_type_name() const {
return "QTensorImpl";
}

} // namespace at
2 changes: 2 additions & 0 deletions aten/src/ATen/quantized/QTensorImpl.h
@@ -93,6 +93,8 @@ struct TORCH_API QTensorImpl : public c10::TensorImpl {
private:
QuantizerPtr quantizer_;

const char* tensorimpl_type_name() const override;

/**
* Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset)
* from one TensorImpl to another TensorImpl.
8 changes: 8 additions & 0 deletions aten/src/ATen/test/basic.cpp
@@ -399,6 +399,14 @@ TEST(BasicTest, FactoryMethodsTest) {
ASSERT_FALSE(tensor1.requires_grad());
ASSERT_FALSE(tensor1.is_pinned());

// Sparse tensor CPU test to avoid requiring CUDA to catch simple bugs.
tensor1 = at::empty({4}, at::TensorOptions().dtype(at::kHalf).layout(at::kSparse));
ASSERT_EQ(tensor1.dtype(), at::kHalf);
ASSERT_EQ(tensor1.layout(), at::kSparse);
ASSERT_EQ(tensor1.device(), at::kCPU);
ASSERT_FALSE(tensor1.requires_grad());
ASSERT_ANY_THROW(tensor1.is_pinned());

if (torch::cuda::is_available()) {
// Test setting pin memory
tensor1 = at::empty({4}, at::TensorOptions().pinned_memory(true));
8 changes: 4 additions & 4 deletions c10/core/TensorImpl.cpp
@@ -242,6 +242,10 @@ bool TensorImpl::has_storage() const {
}
#endif

void TensorImpl::throw_storage_access_error() const {
TORCH_CHECK(false, "Cannot access storage of ", tensorimpl_type_name());
}

bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
#ifdef DEBUG
AT_ASSERT(compute_contiguous() == is_contiguous_);
@@ -255,10 +259,6 @@ bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
return is_contiguous_;
}

const Storage& TensorImpl::storage() const {
return storage_;
}

static void deletePlacementDeleteContext(void* ptr) {
delete static_cast<PlacementDeleteContext*>(ptr);
}
29 changes: 27 additions & 2 deletions c10/core/TensorImpl.h
@@ -451,7 +451,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Avoid using this method if possible; try to use only Tensor APIs to perform
* operations.
*/
virtual const Storage& storage() const;
TENSORIMPL_MAYBE_VIRTUAL const Storage& storage() const {
if (C10_UNLIKELY(storage_access_should_throw_)) {
throw_storage_access_error();
}
return storage_;
}

/**
* The number of elements in a tensor.
@@ -779,6 +784,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
return storage_offset_;
}

protected:
/**
* Returns the human-readable name of the actual type of this object (e.g.,
* TensorImpl, BatchedTensorImpl, etc.). Used for error messages.
*/
virtual const char* tensorimpl_type_name() const {
return "TensorImpl";
}

private:
[[noreturn]] void throw_storage_access_error() const;

public:
/**
* True if a tensor has no elements (e.g., numel() == 0).
*/
@@ -1683,6 +1701,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// See NOTE [ Metadata Change for a Detached Tensor ] for details.
static const char * const err_msg_tensor_metadata_change_not_allowed;

void set_storage_access_should_throw() {
storage_access_should_throw_ = true;
}

Storage storage_;

private:
@@ -1761,6 +1783,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// Tensor is contiguous
bool is_contiguous_ = true;

// Tensor is a subclass that does not permit storage access.
bool storage_access_should_throw_ = false;

// default member initializers for bit-fields only available with -std=c++2a or -std=gnu++2a
inline void init_bitfields() {
is_channels_last_ = false;
@@ -1877,7 +1902,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// SizesAndStrides strides (pre-allocated 4)
// storage offset
// numel
// data type, device_opt, is_contiguous, bitfields
// data type, device, is_contiguous, storage_access_should_throw_, bitfields
// DispatchKeySet
//
static_assert(sizeof(void*) != sizeof(int64_t) || // if 64-bit...
10 changes: 6 additions & 4 deletions c10/core/UndefinedTensorImpl.cpp
@@ -6,6 +6,7 @@ namespace c10 {
// should this use the globalContext? Can it get a context passed in somehow?
UndefinedTensorImpl::UndefinedTensorImpl()
: TensorImpl(DispatchKey::Undefined, caffe2::TypeMeta(), c10::nullopt) {
set_storage_access_should_throw();
}

int64_t UndefinedTensorImpl::size(int64_t d) const {
@@ -23,17 +24,18 @@ bool UndefinedTensorImpl::has_storage() const {
}
#endif

const Storage& UndefinedTensorImpl::storage() const {
AT_ERROR("storage() called on undefined Tensor");
}

void UndefinedTensorImpl::set_storage_offset(int64_t) {
AT_ERROR("set_storage_offset() called on an undefined Tensor");
}

IntArrayRef UndefinedTensorImpl::strides() const {
AT_ERROR("strides() called on undefined Tensor");
}

const char* UndefinedTensorImpl::tensorimpl_type_name() const {
return "UndefinedTensorImpl";
}

UndefinedTensorImpl UndefinedTensorImpl::_singleton;

}
2 changes: 1 addition & 1 deletion c10/core/UndefinedTensorImpl.h
@@ -23,11 +23,11 @@ struct C10_API UndefinedTensorImpl final : public TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;
void set_storage_offset(int64_t offset) override;
private:
UndefinedTensorImpl();
static UndefinedTensorImpl _singleton;
const char* tensorimpl_type_name() const override;
};

} // namespace c10
