[pytorch/ops] Concat fast path w/ zero tensor (#46805)
Summary:
Pull Request resolved: #46805

The current implementation falls back to the slow path if there is a zero-size tensor in the input list, which is inefficient. Use the fast path for torch.cat even when some of the inputs are empty tensors. This wastes one thread block per empty tensor, but it is still much better than the slow path.
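As an illustration (not part of the commit), here is a minimal sketch of the case this change targets, assuming a CUDA device is available; the tensor names are hypothetical:

  import torch

  # A 1-D empty tensor mixed into a concat of 2-D tensors (legacy behavior,
  # see the comment added in Shape.cu). Previously its presence forced the
  # slow path; with this change the batched fast-path kernel handles it,
  # spending one thread block on the empty input.
  a = torch.randn(4, 3, device="cuda")
  b = torch.empty(0, device="cuda")   # 1-D empty tensor
  c = torch.randn(2, 3, device="cuda")
  out = torch.cat([a, b, c], dim=0)   # shape (6, 3); b contributes nothing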

Test Plan: CI + sandcastle

Differential Revision: D24524441

fbshipit-source-id: 522dea42628207bd77a8dfba39476b1dc3c1de45
xw285cornell authored and facebook-github-bot committed Oct 27, 2020
1 parent af27da9 commit 6656908
Showing 1 changed file with 17 additions and 12 deletions.
aten/src/ATen/native/cuda/Shape.cu
@@ -237,7 +237,12 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension,
          batchCounter < CAT_ARRAY_BATCH_SIZE &&
            (i+batchCounter) < inputs.size();
          ++batchCounter) {
-      int64_t dimSize = at::native::size(inputs[i+batchCounter], dimension);
+      int64_t dimSize = 0;
+      // There is a legacy case where a 1-D empty tensor can be concat with
+      // high-dimensional tensor
+      if (inputs[i+batchCounter].numel() > 0) {
+        dimSize = at::native::size(inputs[i+batchCounter], dimension);
+      }
 
       stackInputs[batchCounter].input =
         inputs[i+batchCounter].data_ptr<scalar_t>();
@@ -338,7 +343,12 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension,
          batchCounter < CAT_ARRAY_BATCH_SIZE &&
            (i+batchCounter) < inputs.size();
          ++batchCounter) {
-      int64_t dimSize = at::native::size(inputs[i+batchCounter], dimension);
+      int64_t dimSize = 0;
+      // There is a legacy case where a 1-D empty tensor can be concat with
+      // high-dimensional tensor
+      if (inputs[i+batchCounter].numel() > 0) {
+        dimSize = at::native::size(inputs[i+batchCounter], dimension);
+      }
       catMetaData.input[batchCounter] = inputs[i+batchCounter].data_ptr<scalar_t>();
       catMetaData.offset[batchCounter] = offset;
       catMetaData.dimSize[batchCounter] = dimSize;
@@ -431,7 +441,6 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
   auto should_skip = [](const Tensor &t) {
     return t.dim() == 1 && at::native::size(t, 0) == 0;
   };
-  bool hasSkippedInput = false;
 
   const Tensor *notSkippedTensor = NULL;  // non-owning reference
   int nDims = 0;
@@ -452,10 +461,8 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
   }
   at::assert_no_internal_overlap(out);
 
-  for (int i = 0; i < inputs.size(); i++)
-  {
+  for (int i = 0; i < inputs.size(); i++) {
     if (should_skip(inputs[i])) {
-      hasSkippedInput = true;
       continue;
     }
     nDims = inputs[i].dim();
@@ -501,11 +508,10 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
   // We parallelize the copy if all 6 conditions pass:
   //
   // 1. There is more than one input tensor
-  // 2. No empty inputs
-  // 3. The out tensor is 32-bit indexable
-  // 4. The number of dimensions is <= 4
-  // 5. All input tensors are contiguous (output tensor may be non-contig)
-  // 6. All input tensors can use 32-bit indexing
+  // 2. The out tensor is 32-bit indexable
+  // 3. The number of dimensions is <= 4
+  // 4. All input tensors are contiguous (output tensor may be non-contig)
+  // 5. All input tensors can use 32-bit indexing
 
   const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(),
     [] (const Tensor& t) {
@@ -522,7 +528,6 @@ Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
     });
   allSameType = allSameType && (out.scalar_type() == firstType);
   if (inputs.size() > 1 &&
-      !hasSkippedInput &&
       out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
       at::cuda::detail::canUse32BitIndexMath(out) &&
       allContiguous &&
