diff --git a/kernels/portable/cpu/op_permute_copy.cpp b/kernels/portable/cpu/op_permute_copy.cpp
index 1362b57c005..288a3362627 100644
--- a/kernels/portable/cpu/op_permute_copy.cpp
+++ b/kernels/portable/cpu/op_permute_copy.cpp
@@ -60,15 +60,20 @@ Tensor& permute_copy_out(
       out);
 
   const auto in_type = out.scalar_type();
+
+  size_t in_coord[kTensorDimensionLimit] = {0};
+  size_t trailing_dims_memo[kTensorDimensionLimit];
+  executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);
+
   // in and out must be the same dtype
   ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] {
     const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
     CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
-    size_t in_coord[kTensorDimensionLimit] = {0};
-
     for (size_t i = 0; i < out.numel(); ++i) {
-      out_data[i] = in_data[coordinateToIndex(in, in_coord)];
+      out_data[i] =
+          in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
+              in, in_coord, trailing_dims_memo)];
       increment_coordinate_permuted(in, in_coord, dims);
     }
   });
diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h
index 630f0cdb4a1..b303feafd46 100644
--- a/runtime/core/exec_aten/util/tensor_util.h
+++ b/runtime/core/exec_aten/util/tensor_util.h
@@ -921,6 +921,38 @@
   return index;
 }
 
+/**
+ * Produce a memoized array for use with repeated calls to
+ * coordinateToIndexWithTrailingDimsMemo, which will be faster than
+ * repeated calls to coordinateToIndex.
+ */
+inline void memoizeTrailingDims(
+    const exec_aten::Tensor& tensor,
+    size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  const auto tensorDim = tensor.dim();
+  size_t dims = 1;
+  for (int ii = tensorDim - 1; ii >= 0; --ii) {
+    trailing_dims_memo[ii] = dims;
+    dims *= static_cast<size_t>(tensor.size(ii));
+  }
+}
+
+/**
+ * Like coordinateToIndex, but faster for repeated calls with the same
+ * tensor. trailing_dims_memo must be produced by a call to
+ * memoizeTrailingDims.
+ */
+inline size_t coordinateToIndexWithTrailingDimsMemo(
+    const exec_aten::Tensor& tensor,
+    const size_t* const coordinate,
+    const size_t trailing_dims_memo[kTensorDimensionLimit]) {
+  size_t index = 0;
+  for (int d = 0; d < tensor.dim(); ++d) {
+    index += coordinate[d] * trailing_dims_memo[d];
+  }
+  return index;
+}
+
 /**
  * Given the linear index return the N-dimensional tensor coordinate. This is
  * the inverse operation of coordinateToIndex.
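
Note (illustration, not part of the patch): below is a minimal standalone sketch of the memoization technique the patch introduces, using a plain sizes array in place of exec_aten::Tensor. All names in it are hypothetical stand-ins, not ExecuTorch API. It shows why the memo helps: the plain coordinate-to-index conversion recomputes the product of trailing dimension sizes for every dimension on every call, while the memoized variant precomputes those products once and reduces each lookup to one multiply-add per dimension.

// Sketch of trailing-dims memoization with plain arrays (illustrative only).
#include <cstddef>
#include <cstdio>

constexpr size_t kMaxDims = 16;  // stand-in for kTensorDimensionLimit

// Analogue of coordinateToIndex: recomputes the trailing-size product
// for every dimension on every call (O(dim^2) per lookup).
size_t coordinateToIndexSlow(
    const size_t* sizes, size_t dim, const size_t* coord) {
  size_t index = 0;
  for (size_t d = 0; d < dim; ++d) {
    size_t trailing = 1;
    for (size_t t = d + 1; t < dim; ++t) {
      trailing *= sizes[t];
    }
    index += coord[d] * trailing;
  }
  return index;
}

// Analogue of memoizeTrailingDims: compute each trailing product once,
// walking from the innermost dimension outward.
void memoizeTrailingDims(const size_t* sizes, size_t dim, size_t memo[kMaxDims]) {
  size_t trailing = 1;
  for (size_t d = dim; d-- > 0;) {
    memo[d] = trailing;
    trailing *= sizes[d];
  }
}

// Analogue of coordinateToIndexWithTrailingDimsMemo: one multiply-add
// per dimension (O(dim) per lookup).
size_t coordinateToIndexFast(
    size_t dim, const size_t* coord, const size_t memo[kMaxDims]) {
  size_t index = 0;
  for (size_t d = 0; d < dim; ++d) {
    index += coord[d] * memo[d];
  }
  return index;
}

int main() {
  const size_t sizes[] = {2, 3, 4};  // a 2x3x4 tensor
  size_t memo[kMaxDims];
  memoizeTrailingDims(sizes, 3, memo);  // memo = {12, 4, 1}

  const size_t coord[] = {1, 2, 3};  // last element of the tensor
  std::printf(
      "slow=%zu fast=%zu\n",
      coordinateToIndexSlow(sizes, 3, coord),
      coordinateToIndexFast(3, coord, memo));  // both print 23
  return 0;
}

Both functions map the coordinate {1, 2, 3} to linear index 1*12 + 2*4 + 3*1 = 23. The permute kernel performs this conversion once per output element, which is why hoisting the trailing-dims computation out of the element loop pays off.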