[PyTorch] Devirtualize TensorImpl::storage() (#51050)
Summary:
Pull Request resolved: #51050

Some TensorImpl subclasses need storage() calls to throw. We use free space in
TensorImpl to add a flag that such subclasses can set to get that behavior
without making storage() virtual, so storage() should still be inlineable.
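
Conceptually, the change replaces a virtual storage() with a non-virtual one guarded by a bool. A minimal standalone sketch of that pattern (hypothetical names, plain C++ exceptions instead of TORCH_CHECK; the real code in c10/core/TensorImpl.h below also uses C10_UNLIKELY and TENSORIMPL_MAYBE_VIRTUAL):

```cpp
#include <stdexcept>
#include <string>

struct Storage {};  // stand-in for c10::Storage

class TensorImplSketch {
 public:
  // Non-virtual and inlineable: the hot path is one flag test plus a
  // direct member load.
  const Storage& storage() const {
    if (storage_access_should_throw_) {
      throw_storage_access_error();  // cold path
    }
    return storage_;
  }

  virtual ~TensorImplSketch() = default;

 protected:
  // Storage-less subclasses call this from their constructors.
  void set_storage_access_should_throw() {
    storage_access_should_throw_ = true;
  }

  // Only the error-message hook stays virtual; it never runs on the hot path.
  virtual const char* tensorimpl_type_name() const {
    return "TensorImplSketch";
  }

 private:
  // In the real change this is declared [[noreturn]] and defined out of line
  // in TensorImpl.cpp so the inlined storage() stays small.
  [[noreturn]] void throw_storage_access_error() const {
    throw std::runtime_error(std::string("Cannot access storage of ") +
                             tensorimpl_type_name());
  }

  Storage storage_;
  bool storage_access_should_throw_ = false;
};
```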
ghstack-source-id: 121819684

Test Plan:
Compared `perf stat` results for 1M iterations of the AdIndexer benchmark before and after this change.

Before:
```
         74,483.15 msec task-clock                #    0.999 CPUs utilized            ( +-  0.14% )
            16,637      context-switches          #    0.223 K/sec                    ( +- 11.97% )
                 3      cpu-migrations            #    0.000 K/sec                    ( +-  7.20% )
           107,085      page-faults               #    0.001 M/sec                    ( +-  2.39% )
   147,356,440,831      cycles                    #    1.978 GHz                      ( +-  0.14% )  (50.06%)
   278,678,430,378      instructions              #    1.89  insn per cycle           ( +-  0.01% )  (50.05%)
    43,540,698,177      branches                  #  584.571 M/sec                    ( +-  0.01% )  (50.05%)
       141,028,843      branch-misses             #    0.32% of all branches          ( +-  1.00% )  (50.05%)

```

After:
```
         74,178.77 msec task-clock                #    0.999 CPUs utilized            ( +-  0.31% )
            17,125      context-switches          #    0.231 K/sec                    ( +-  3.41% )
                 3      cpu-migrations            #    0.000 K/sec
           109,535      page-faults               #    0.001 M/sec                    ( +-  1.04% )
   146,803,364,372      cycles                    #    1.979 GHz                      ( +-  0.30% )  (50.03%)
   277,726,600,254      instructions              #    1.89  insn per cycle           ( +-  0.02% )  (50.03%)
    43,299,659,815      branches                  #  583.720 M/sec                    ( +-  0.03% )  (50.03%)
       130,504,094      branch-misses             #    0.30% of all branches          ( +-  1.14% )  (50.03%)

```

Looks like an approximately 0.3% instruction-count win ((278,678,430,378 - 277,726,600,254) / 278,678,430,378 ≈ 0.34%); cycles improve by a similar amount, but that difference is within noise.

Reviewed By: ezyang

Differential Revision: D26013815

fbshipit-source-id: 07939957929070e18b9981d492d8279c9bb33c55
swolchok authored and facebook-github-bot committed Feb 17, 2021
1 parent 4305609 commit 059ee85
Showing 14 changed files with 78 additions and 23 deletions.
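
The subclass changes below all follow the same pattern: drop the throwing storage() override, call set_storage_access_should_throw() in the constructor, and override the new protected tensorimpl_type_name() hook so the error message names the concrete type. A condensed sketch of that pattern using a hypothetical NoStorageTensorImpl (constructor arguments abbreviated; not a literal excerpt from this commit):

```cpp
#include <c10/core/TensorImpl.h>

// Hypothetical subclass illustrating the opt-in used here by BatchedTensorImpl,
// OpaqueTensorImpl, SparseTensorImpl, UndefinedTensorImpl, etc.
struct NoStorageTensorImpl : public c10::TensorImpl {
  explicit NoStorageTensorImpl(c10::DispatchKeySet key_set,
                               const caffe2::TypeMeta data_type,
                               c10::optional<c10::Device> device)
      : TensorImpl(key_set, data_type, device) {
    // Replaces the old throwing `const Storage& storage() const override`.
    set_storage_access_should_throw();
  }

 private:
  // Consumed by TensorImpl::throw_storage_access_error() when building the
  // "Cannot access storage of ..." message.
  const char* tensorimpl_type_name() const override {
    return "NoStorageTensorImpl";
  }
};
```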
9 changes: 5 additions & 4 deletions aten/src/ATen/BatchedTensorImpl.cpp
@@ -15,6 +15,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims)
, bdims_(std::move(bdims))
{
TORCH_INTERNAL_ASSERT(value_.defined());
set_storage_access_should_throw();
checkInvariants();

const auto public_dims = value_.dim() - bdims_.size();
@@ -80,10 +81,6 @@ bool BatchedTensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
return is_contiguous_;
}

const Storage& BatchedTensorImpl::storage() const {
TORCH_CHECK(false, "Due to limitations, we cannot access the storage() of a tensor from inside of vmap.");
}

// The following are some internal inherited methods that we do not support.
// They should never get called.
void BatchedTensorImpl::set_size(int64_t dim, int64_t new_size) {
@@ -102,6 +99,10 @@ bool BatchedTensorImpl::has_storage() const {
}
#endif

const char* BatchedTensorImpl::tensorimpl_type_name() const {
return "BatchedTensorImpl";
}

Tensor makeBatched(const Tensor& tensor, BatchDims bdims) {
TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor));
auto tensor_dim = tensor.dim();
2 changes: 1 addition & 1 deletion aten/src/ATen/BatchedTensorImpl.h
@@ -80,11 +80,11 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;

private:
// see NOTE: [BatchedTensorImpl levels invariant]
void checkInvariants() const;
const char* tensorimpl_type_name() const override;

Tensor value_;

9 changes: 5 additions & 4 deletions aten/src/ATen/OpaqueTensorImpl.h
@@ -28,6 +28,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
bool is_non_overlapping_and_dense = true)
: TensorImpl(key_set, data_type, device),
opaque_handle_(std::move(opaque_handle)) {
set_storage_access_should_throw();
sizes_and_strides_.set_sizes(sizes);
refresh_numel();
is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
@@ -71,10 +72,6 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
}
#endif

const Storage& storage() const override {
AT_ERROR("opaque tensors do not have storage");
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
@@ -180,6 +177,10 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
}

private:
const char* tensorimpl_type_name() const override {
return "OpaqueTensorImpl";
}

OpaqueHandle opaque_handle_;
};

7 changes: 5 additions & 2 deletions aten/src/ATen/SparseTensorImpl.cpp
@@ -50,6 +50,7 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ
AT_ASSERT(values_.device() == device());

is_non_overlapping_and_dense_ = false;
set_storage_access_should_throw();
}

IntArrayRef SparseTensorImpl::strides() const {
@@ -76,9 +77,11 @@ bool SparseTensorImpl::has_storage() const {
return false;
}
#endif
const Storage& SparseTensorImpl::storage() const {
AT_ERROR("sparse tensors do not have storage");

const char* SparseTensorImpl::tensorimpl_type_name() const {
return "SparseTensorImpl";
}

void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, const Tensor& values) {
TORCH_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe ", err_msg_tensor_metadata_change_not_allowed);

3 changes: 2 additions & 1 deletion aten/src/ATen/SparseTensorImpl.h
@@ -50,7 +50,6 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;

// WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with
// respect to indices and values
@@ -261,6 +260,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
dest_sparse_impl->values_ = src_sparse_impl->values();
dest_sparse_impl->coalesced_ = src_sparse_impl->coalesced();
}

const char* tensorimpl_type_name() const override;
};

} // namespace at
4 changes: 4 additions & 0 deletions aten/src/ATen/native/metal/MetalTensorImpl.h
@@ -48,6 +48,10 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl<OpaqueHandle> {
}

private:
const char* tensorimpl_type_name() const override {
return "MetalTensorImpl";
}

SmallVector<int64_t, 5> strides_;
};
} // namespace at
4 changes: 4 additions & 0 deletions aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h
@@ -41,6 +41,10 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl<OpaqueHandle> {
}

private:
const char* tensorimpl_type_name() const override {
return "VulkanOpaqueTensorImpl";
}

SmallVector<int64_t, 5> strides_;
};

4 changes: 4 additions & 0 deletions aten/src/ATen/quantized/QTensorImpl.cpp
@@ -10,4 +10,8 @@ QTensorImpl::QTensorImpl(
: TensorImpl(std::move(storage), key_set, data_type),
quantizer_(quantizer) {}

const char* QTensorImpl::tensorimpl_type_name() const {
return "QTensorImpl";
}

} // namespace at
2 changes: 2 additions & 0 deletions aten/src/ATen/quantized/QTensorImpl.h
@@ -93,6 +93,8 @@ struct TORCH_API QTensorImpl : public c10::TensorImpl {
private:
QuantizerPtr quantizer_;

const char* tensorimpl_type_name() const override;

/**
* Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset)
* from one TensorImpl to another TensorImpl.
8 changes: 8 additions & 0 deletions aten/src/ATen/test/basic.cpp
@@ -399,6 +399,14 @@ TEST(BasicTest, FactoryMethodsTest) {
ASSERT_FALSE(tensor1.requires_grad());
ASSERT_FALSE(tensor1.is_pinned());

// Sparse tensor CPU test to avoid requiring CUDA to catch simple bugs.
tensor1 = at::empty({4}, at::TensorOptions().dtype(at::kHalf).layout(at::kSparse));
ASSERT_EQ(tensor1.dtype(), at::kHalf);
ASSERT_EQ(tensor1.layout(), at::kSparse);
ASSERT_EQ(tensor1.device(), at::kCPU);
ASSERT_FALSE(tensor1.requires_grad());
ASSERT_ANY_THROW(tensor1.is_pinned());

if (torch::cuda::is_available()) {
// Test setting pin memory
tensor1 = at::empty({4}, at::TensorOptions().pinned_memory(true));
8 changes: 4 additions & 4 deletions c10/core/TensorImpl.cpp
@@ -242,6 +242,10 @@ bool TensorImpl::has_storage() const {
}
#endif

void TensorImpl::throw_storage_access_error() const {
TORCH_CHECK(false, "Cannot access storage of ", tensorimpl_type_name());
}

bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
#ifdef DEBUG
AT_ASSERT(compute_contiguous() == is_contiguous_);
@@ -255,10 +259,6 @@ bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
return is_contiguous_;
}

const Storage& TensorImpl::storage() const {
return storage_;
}

static void deletePlacementDeleteContext(void* ptr) {
delete static_cast<PlacementDeleteContext*>(ptr);
}
29 changes: 27 additions & 2 deletions c10/core/TensorImpl.h
@@ -451,7 +451,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* Avoid using this method if possible; try to use only Tensor APIs to perform
* operations.
*/
virtual const Storage& storage() const;
TENSORIMPL_MAYBE_VIRTUAL const Storage& storage() const {
if (C10_UNLIKELY(storage_access_should_throw_)) {
throw_storage_access_error();
}
return storage_;
}

/**
* The number of elements in a tensor.
@@ -779,6 +784,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
return storage_offset_;
}

protected:
/**
* Returns the human-readable name of the actual type of this object (e.g.,
* TensorImpl, BatchedTensorImpl, etc.). Used for error messages.
*/
virtual const char* tensorimpl_type_name() const {
return "TensorImpl";
}

private:
[[noreturn]] void throw_storage_access_error() const;

public:
/**
* True if a tensor has no elements (e.g., numel() == 0).
*/
@@ -1683,6 +1701,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// See NOTE [ Metadata Change for a Detached Tensor ] for details.
static const char * const err_msg_tensor_metadata_change_not_allowed;

void set_storage_access_should_throw() {
storage_access_should_throw_ = true;
}

Storage storage_;

private:
@@ -1761,6 +1783,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// Tensor is contiguous
bool is_contiguous_ = true;

// Tensor is a subclass that does not permit storage access.
bool storage_access_should_throw_ = false;

// default member initializers for bit-fields only available with -std=c++2a or -std=gnu++2a
inline void init_bitfields() {
is_channels_last_ = false;
@@ -1877,7 +1902,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
// SizesAndStrides strides (pre-allocated 4)
// storage offset
// numel
// data type, device_opt, is_contiguous, bitfields
// data type, device, is_contiguous, storage_access_should_throw_, bitfields
// DispatchKeySet
//
static_assert(sizeof(void*) != sizeof(int64_t) || // if 64-bit...
10 changes: 6 additions & 4 deletions c10/core/UndefinedTensorImpl.cpp
@@ -6,6 +6,7 @@ namespace c10 {
// should this use the globalContext? Can it get a context passed in somehow?
UndefinedTensorImpl::UndefinedTensorImpl()
: TensorImpl(DispatchKey::Undefined, caffe2::TypeMeta(), c10::nullopt) {
set_storage_access_should_throw();
}

int64_t UndefinedTensorImpl::size(int64_t d) const {
@@ -23,17 +24,18 @@ bool UndefinedTensorImpl::has_storage() const {
}
#endif

const Storage& UndefinedTensorImpl::storage() const {
AT_ERROR("storage() called on undefined Tensor");
}

void UndefinedTensorImpl::set_storage_offset(int64_t) {
AT_ERROR("set_storage_offset() called on an undefined Tensor");
}

IntArrayRef UndefinedTensorImpl::strides() const {
AT_ERROR("strides() called on undefined Tensor");
}

const char* UndefinedTensorImpl::tensorimpl_type_name() const {
return "UndefinedTensorImpl";
}

UndefinedTensorImpl UndefinedTensorImpl::_singleton;

}
2 changes: 1 addition & 1 deletion c10/core/UndefinedTensorImpl.h
@@ -23,11 +23,11 @@ struct C10_API UndefinedTensorImpl final : public TensorImpl {
#ifdef DEBUG
bool has_storage() const override;
#endif
const Storage& storage() const override;
void set_storage_offset(int64_t offset) override;
private:
UndefinedTensorImpl();
static UndefinedTensorImpl _singleton;
const char* tensorimpl_type_name() const override;
};

} // namespace c10
