From 7088078079855690d972f1ee7bae2055692af6bb Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Mon, 9 Sep 2024 13:26:20 -0700
Subject: [PATCH] [ExecuTorch] Add trailing dims memoization to improve
 performance of permute_copy_out

Otherwise we recompute trailing dims every time we call coordinateToIndex.

Differential Revision: [D62154216](https://our.internmc.facebook.com/intern/diff/D62154216/)

[ghstack-poisoned]
---
 kernels/portable/cpu/op_permute_copy.cpp  | 11 +++++---
 runtime/core/exec_aten/util/tensor_util.h | 32 +++++++++++++++++++++++
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp
index 1362b57c005..288a3362627 100644
--- a/kernels/portable/cpu/op_permute_copy.cpp
+++ b/kernels/portable/cpu/op_permute_copy.cpp
@@ -60,15 +60,20 @@ Tensor& permute_copy_out(
       out);
 
   const auto in_type = out.scalar_type();
+
+  size_t in_coord[kTensorDimensionLimit] = {0};
+  size_t trailing_dims_memo[kTensorDimensionLimit];
+  executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);
+
   // in and out must be the same dtype
   ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] {
     const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
     CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
-    size_t in_coord[kTensorDimensionLimit] = {0};
-
     for (size_t i = 0; i < out.numel(); ++i) {
-      out_data[i] = in_data[coordinateToIndex(in, in_coord)];
+      out_data[i] =
+          in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
+              in, in_coord, trailing_dims_memo)];
       increment_coordinate_permuted(in, in_coord, dims);
     }
   });
diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h
index 630f0cdb4a1..b303feafd46 100644
--- a/runtime/core/exec_aten/util/tensor_util.h
+++ b/runtime/core/exec_aten/util/tensor_util.h
@@ -921,6 +921,38 @@ inline size_t coordinateToIndex(
   return index;
 }
 
+/**
+ * Produce a memoized array for use with repeated calls to
+ * coordinateToIndexWithTrailingDimsMemo, which will be faster than
+ * repeated calls to coordinateToIndex.
+ */
+inline void memoizeTrailingDims(
+    const exec_aten::Tensor& tensor,
+    size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  const auto tensorDim = tensor.dim();
+  size_t dims = 1;
+  for (int ii = tensorDim - 1; ii >= 0; --ii) {
+    trailing_dims_memo[ii] = dims;
+    dims *= static_cast<size_t>(tensor.size(ii));
+  }
+}
+
+/**
+ * Like coordinateToIndex, but faster for repeated calls with the same
+ * tensor. trailing_dims_memo must be produced by a call to
+ * memoizeTrailingDims.
+ */
+inline size_t coordinateToIndexWithTrailingDimsMemo(
+    const exec_aten::Tensor& tensor,
+    const size_t* const coordinate,
+    const size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  size_t index = 0;
+  for (int d = 0; d < tensor.dim(); ++d) {
+    index += coordinate[d] * trailing_dims_memo[d];
+  }
+  return index;
+}
+
 /**
  * Given the linear index return the N-dimensional tensor coordinate. This is
  * the inverse operation of coordinateToIndex.
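
To illustrate the idea outside the ExecuTorch tree, here is a minimal, self-contained sketch of the same technique: compute the trailing-dimension products once (for a contiguous layout these are the strides) and reuse them for every coordinate-to-index conversion, rather than recomputing them per element as the unpatched coordinateToIndex path does. The `Shape`, `memoize_trailing_dims`, and `coordinate_to_index` names below are illustrative only and are not part of the ExecuTorch API.

```cpp
// Sketch only (not ExecuTorch code): memoize trailing-dim products once,
// then convert coordinates to flat indices with a simple dot product.
#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for a tensor's sizes; ExecuTorch uses Tensor::size(d).
using Shape = std::vector<size_t>;

// Fill memo[d] with the product of all sizes after dimension d.
// For shape {2, 3, 4} this yields {12, 4, 1}.
void memoize_trailing_dims(const Shape& shape, std::vector<size_t>& memo) {
  memo.assign(shape.size(), 1);
  size_t dims = 1;
  for (int d = static_cast<int>(shape.size()) - 1; d >= 0; --d) {
    memo[d] = dims;
    dims *= shape[d];
  }
}

// Flat index of coord in a contiguous tensor, using the memoized products.
size_t coordinate_to_index(const std::vector<size_t>& coord,
                           const std::vector<size_t>& memo) {
  size_t index = 0;
  for (size_t d = 0; d < coord.size(); ++d) {
    index += coord[d] * memo[d];
  }
  return index;
}

int main() {
  Shape shape = {2, 3, 4};
  std::vector<size_t> memo;
  memoize_trailing_dims(shape, memo);  // computed once, reused per element
  // 1*12 + 2*4 + 3*1 = 23
  std::printf("%zu\n", coordinate_to_index({1, 2, 3}, memo));
  return 0;
}
```

In the permute_copy_out loop above, this removes a per-element pass over the tensor sizes, since the memo is built once before the element loop and each conversion reduces to a multiply-accumulate over the dimensions.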