avoid some shared_ptr copying on "[PyTorch] Fix getCustomClassType() perf"

1) getCustomClassType() was copying the entire hash table every time it was called.
2) We don't need to do a hash lookup at all.

Differential Revision: [D25385543](https://our.internmc.facebook.com/intern/diff/D25385543/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D25385543/)!

[ghstack-poisoned]
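The gist of both fixes, as a minimal standalone sketch (names and structure simplified for illustration, not the verbatim PyTorch source): return the registry map by reference so no copy is made, and cache the per-type lookup in a function-local static so the hash lookup happens at most once per template instantiation.

```cpp
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

struct ClassType { std::string qualified_name; };
using ClassTypePtr = std::shared_ptr<ClassType>;

// Fix 1: return the registry by reference. Returning it by value would
// copy the entire hash table on every call.
std::unordered_map<std::type_index, ClassTypePtr>& getCustomClassTypeMap() {
  static std::unordered_map<std::type_index, ClassTypePtr> registry;
  return registry;
}

// Fix 2: classes are never unregistered, so cache the lookup result in a
// function-local static; the hash lookup runs at most once per T (and
// throws if T was never registered), and callers get a const reference
// instead of a refcount-bumping shared_ptr copy.
template <typename T>
const ClassTypePtr& getCustomClassType() {
  static const ClassTypePtr cached =
      getCustomClassTypeMap().at(std::type_index(typeid(T)));
  return cached;
}
```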
swolchok committed Dec 9, 2020
2 parents da1995f + 274ce26 commit 0deedff
Showing 84 changed files with 1,934 additions and 1,425 deletions.
2 changes: 2 additions & 0 deletions .jenkins/pytorch/codegen-test.sh
@@ -38,6 +38,8 @@ mkdir -p "$OUT"/pyi/torch/_C
 mkdir -p "$OUT"/pyi/torch/nn
 python -m tools.pyi.gen_pyi \
   --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \
+  --native-functions-path aten/src/ATen/native/native_functions.yaml \
+  --deprecated-functions-path tools/autograd/deprecated.yaml \
   --out "$OUT"/pyi

 # autograd codegen (called by torch codegen but can run independently)
2 changes: 1 addition & 1 deletion README.md
@@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex
 On Linux
 ```bash
 # Add LAPACK support for the GPU if needed
-conda install -c pytorch magma-cuda102  # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version
+conda install -c pytorch magma-cuda110  # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
 ```

 On MacOS
20 changes: 10 additions & 10 deletions android/test_app/app/build.gradle
@@ -60,20 +60,20 @@ android {
   //}
   flavorDimensions "model", "build", "activity"
   productFlavors {
-    mbq {
+    mnet {
       dimension "model"
-      applicationIdSuffix ".mbq"
-      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"")
-      addManifestPlaceholders([APP_NAME: "MBQ"])
-      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"")
+      applicationIdSuffix ".mnet"
+      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"")
+      addManifestPlaceholders([APP_NAME: "MNET"])
+      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"")
     }
-    mbvulkan {
+    mnetVulkan {
       dimension "model"
-      applicationIdSuffix ".mbvulkan"
-      buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"")
+      applicationIdSuffix ".mnet_vulkan"
+      buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"")
       buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true')
-      addManifestPlaceholders([APP_NAME: "MBQ"])
-      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"")
+      addManifestPlaceholders([APP_NAME: "MNET_VULKAN"])
+      buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"")
     }
     resnet18 {
       dimension "model"
4 changes: 2 additions & 2 deletions aten/src/ATen/BatchingRegistrations.cpp
@@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule(
       size.size(), ") must match dimensionality of strides (",
       stride.size(), ")");
   auto storage_size = native::storage_size_for(size, stride);
-  for (int64_t idx = 0; idx < physical_strides.size(); ++idx) {
-    physical_strides[idx] *= storage_size;
+  for (auto& physical_stride : physical_strides) {
+    physical_stride *= storage_size;
   }

   // physical_strides = [B1 * B2 * S, B2 * S, S] + strides
4 changes: 2 additions & 2 deletions aten/src/ATen/NamedTensorUtils.cpp
@@ -264,11 +264,11 @@ static std::vector<Dimname> compute_dot_product_outnames(
   }
   std::vector<Dimname> outnames(num_outnames, Dimname::wildcard());
   int64_t index = 0;
-  for (int64_t j = 0; j < tensor_names.size(); ++j) {
+  for (size_t j = 0; j < tensor_names.size(); ++j) {
     if (j == tensor_dotted_dim) continue;
     outnames[index++] = tensor_names[j];
   }
-  for (int64_t j = 0; j < other_names.size(); ++j) {
+  for (size_t j = 0; j < other_names.size(); ++j) {
     if (j == other_dotted_dim) continue;
     outnames[index++] = other_names[j];
   }
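Several hunks in this commit make the same mechanical change from `int64_t` loop indices to `size_t`. A minimal illustration of why (hypothetical snippet, not from the diff): comparing a signed index against an unsigned `.size()` result triggers `-Wsign-compare`, and the fix is to use the container's own size type.

```cpp
#include <cstddef>
#include <vector>

int count_positive(const std::vector<int>& v) {
  int count = 0;
  // Before: `for (int64_t j = 0; j < v.size(); ++j)` compares a signed
  // index with the unsigned result of size(), triggering -Wsign-compare.
  for (std::size_t j = 0; j < v.size(); ++j) {
    if (v[j] > 0) ++count;
  }
  return count;
}
```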
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorIterator.cpp
@@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso
 }

 void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) {
-  for (int i = 0; i < config.tensors_.size(); i++) {
-    operands_.emplace_back(std::move(config.tensors_[i]));
+  for (auto& tensor: config.tensors_) {
+    operands_.emplace_back(std::move(tensor));
   }
   num_outputs_ = config.num_outputs_;
 }
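The populate_operands change swaps an indexed loop for a range-for that moves each element out of the source vector. A standalone sketch of the same pattern (illustrative names only): this is safe as long as the moved-from container is not read again afterwards.

```cpp
#include <string>
#include <utility>
#include <vector>

std::vector<std::string> drain(std::vector<std::string>& src) {
  std::vector<std::string> dst;
  dst.reserve(src.size());
  // Take a mutable reference so std::move can steal each element's buffer;
  // src's elements are left in a valid-but-unspecified state.
  for (auto& s : src) {
    dst.emplace_back(std::move(s));
  }
  return dst;
}
```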
4 changes: 2 additions & 2 deletions aten/src/ATen/TensorNames.cpp
@@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef<Dimname> names, int64_t start, int64_t end) {
 }

 TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) {
-  int64_t size_diff = std::labs(names_.size() - other.names_.size());
+  size_t size_diff = std::labs(names_.size() - other.names_.size());

   if (names_.size() > other.names_.size()) {
-    for (int64_t idx = size_diff; idx < names_.size(); ++idx) {
+    for (size_t idx = size_diff; idx < names_.size(); ++idx) {
       names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name);
     }
   } else {
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.cpp
@@ -22,7 +22,7 @@ namespace ivalue {

 // This is in ivalue.cpp because we need to access Type::annotation_str, which
 // is declared in jit_type.h
-void checkCustomClassType(TypePtr expected_type, TypePtr actual_type) {
+void checkCustomClassType(const Type* expected_type, const Type* actual_type) {
   // NB: doing pointer comparison here
   // If in the future there ever arises a need to call operator== on custom class
   // Type's, this needs to be changed!
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.h
@@ -959,7 +959,7 @@ c10::ClassTypePtr getCustomClassTypeImpl() {
 }

 template <typename T>
-c10::ClassTypePtr getCustomClassType() {
+const c10::ClassTypePtr& getCustomClassType() {
   // Classes are never unregistered from getCustomClassTypeMap and the
   // hash lookup can be a hot path, so just cache.
   // For the same reason, it's fine If this ends up getting duplicated across
10 changes: 5 additions & 5 deletions aten/src/ATen/core/ivalue_inl.h
@@ -172,7 +172,7 @@ inline at::Generator IValue::toGenerator() const& {
 namespace ivalue {

 void CAFFE2_API
-checkCustomClassType(TypePtr expected_type, TypePtr actual_type);
+checkCustomClassType(const Type* expected_type, const Type* actual_type);

 template <typename T>
 using Shared = c10::intrusive_ptr<T>;

@@ -756,8 +756,8 @@ c10::intrusive_ptr<T> IValue::toCustomClass() && {
       obj->slots().size() == 1,
       "Tried to cast IValue to custom class but it did "
       "not contain a custom class!");
-  auto expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>();
-  ivalue::checkCustomClassType(expected_type, type());
+  const Type* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
   auto userObj =
       c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
   return userObj;

@@ -774,8 +774,8 @@ c10::intrusive_ptr<T> IValue::toCustomClass() const& {
       obj->slots().size() == 1,
       "Tried to cast IValue to custom class but it did "
       "not contain a custom class!");
-  auto expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>();
-  ivalue::checkCustomClassType(expected_type, type());
+  const Type* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
   auto userObj =
       c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
   return userObj;
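Why the call sites above now pass `const Type*` rather than `TypePtr`: checkCustomClassType() only compares pointers, so it never needs ownership, and borrowing a raw pointer avoids atomic refcount traffic on every cast. A minimal sketch of the idea (types simplified, not the PyTorch definitions):

```cpp
#include <memory>

struct Type { virtual ~Type() = default; };
using TypePtr = std::shared_ptr<Type>;

// Passing TypePtr by value here would copy a shared_ptr per argument:
// two atomic refcount increments (and later decrements) per call.
// Borrowing a raw pointer costs nothing, and is safe because both
// types are kept alive by their owners for the duration of the call.
bool sameType(const Type* expected, const Type* actual) {
  return expected == actual;  // NB: pointer comparison, as in the diff
}
```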
8 changes: 4 additions & 4 deletions aten/src/ATen/cuda/CUDASolver.cpp
@@ -46,14 +46,14 @@ void getrf<c10::complex<double>>(
   TORCH_CUSOLVER_CHECK(cusolverDnZgetrf_bufferSize(
       handle, m, n, reinterpret_cast<cuDoubleComplex*>(dA), ldda, &lwork));
   auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  void* buffer = allocator.allocate(sizeof(cuDoubleComplex) * lwork).get();
+  auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex) * lwork);
   TORCH_CUSOLVER_CHECK(cusolverDnZgetrf(
       handle,
       m,
       n,
       reinterpret_cast<cuDoubleComplex*>(dA),
       ldda,
-      static_cast<cuDoubleComplex*>(buffer),
+      static_cast<cuDoubleComplex*>(dataPtr.get()),
       ipiv,
       info));
 }

@@ -71,14 +71,14 @@ void getrf<c10::complex<float>>(
   TORCH_CUSOLVER_CHECK(cusolverDnCgetrf_bufferSize(
       handle, m, n, reinterpret_cast<cuComplex*>(dA), ldda, &lwork));
   auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  void* buffer = allocator.allocate(sizeof(cuComplex) * lwork).get();
+  auto dataPtr = allocator.allocate(sizeof(cuComplex) * lwork);
   TORCH_CUSOLVER_CHECK(cusolverDnCgetrf(
       handle,
       m,
       n,
       reinterpret_cast<cuComplex*>(dA),
       ldda,
-      static_cast<cuComplex*>(buffer),
+      static_cast<cuComplex*>(dataPtr.get()),
       ipiv,
       info));
 }
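This hunk appears to fix a lifetime bug, not just style: allocator.allocate() returns an owning handle, and calling .get() on the temporary frees the buffer at the end of the full expression. A standalone sketch of the pattern, with std::unique_ptr standing in for the allocator's DataPtr:

```cpp
#include <cstddef>
#include <memory>

// Stand-in for allocator.allocate(): returns an owning handle.
std::unique_ptr<char[]> allocate(std::size_t n) {
  return std::make_unique<char[]>(n);
}

void demo() {
  // BUG (old code): .get() on the temporary handle; the buffer is freed
  // at the end of this statement, so `buffer` would dangle:
  //   char* buffer = allocate(64).get();

  // FIX (new code): bind the owning handle to a local so the buffer
  // stays alive for the whole call that uses it.
  auto dataPtr = allocate(64);
  char* buffer = dataPtr.get();
  (void)buffer;  // e.g. pass to cusolverDn*getrf(...)
}
```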
8 changes: 2 additions & 6 deletions aten/src/ATen/native/Convolution.cpp
@@ -177,14 +177,10 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at
   int64_t outsize = 1;
   if (transposed) {
     std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
-    for (int64_t i = 1; i < o.size(); i++) {
-      outsize *= o[i];
-    }
+    outsize = prod_intlist(o.begin() + 1, o.end());
   } else {
     std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
-    for (int64_t i = 1; i < o.size(); i++) {
-      outsize *= o[i];
-    }
+    outsize = prod_intlist(o.begin() + 1, o.end());
   }
   return outsize > int_max;
 }
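prod_intlist is presumably a small helper along these lines (a sketch under that assumption, not the actual ATen definition), so the two hand-rolled loops collapse into a single call over `[begin + 1, end)`, i.e. the product of all dimensions except the leading batch dimension:

```cpp
#include <cstdint>
#include <functional>
#include <numeric>

// Multiply everything in [begin, end), starting from 1.
template <typename Iter>
int64_t prod_intlist(Iter begin, Iter end) {
  return std::accumulate(begin, end, static_cast<int64_t>(1),
                         std::multiplies<int64_t>());
}
```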
5 changes: 4 additions & 1 deletion aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -188,6 +188,9 @@ FOREACH_UNARY_OP(sinh);
 FOREACH_UNARY_OP(round);
 FOREACH_UNARY_OP(lgamma);
 FOREACH_UNARY_OP(frac);
+FOREACH_UNARY_OP(trunc);
+FOREACH_UNARY_OP(reciprocal);
+FOREACH_UNARY_OP(sigmoid);

 FOREACH_POINTWISE_OP_SCALAR(addcdiv);
 FOREACH_POINTWISE_OP_SCALAR(addcmul);

@@ -201,7 +204,7 @@ std::vector<Tensor> foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList
                                                              \
   std::vector<Tensor> result;                                \
   result.reserve(tensors1.size());                           \
-  for (int i = 0; i < tensors1.size(); i++) {                \
+  for (size_t i = 0; i < tensors1.size(); i++) {             \
     result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \
   }                                                          \
                                                              \
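For context, the slow path that FOREACH_UNARY_OP generates presumably mirrors the binary version visible above: one at:: call per tensor. A sketch of a plausible expansion for `trunc` (names inferred, not the actual macro output; requires the ATen headers):

```cpp
#include <ATen/ATen.h>

// Plausible shape of FOREACH_UNARY_OP(trunc)'s slow path: apply the
// regular per-tensor op in a loop (input validation omitted here).
std::vector<at::Tensor> foreach_tensor_trunc_slow(at::TensorList tensors) {
  std::vector<at::Tensor> result;
  result.reserve(tensors.size());
  for (size_t i = 0; i < tensors.size(); i++) {
    result.emplace_back(at::trunc(tensors[i]));
  }
  return result;
}
```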
