From 64120a937fedd9a9baeb602346fa4357f32c0699 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 21:07:41 -0700 Subject: [PATCH 01/19] try not using a variable --- kernels/portable/cpu/util/elementwise_util.h | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index cc1110e10d7..3e3522a52c6 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -105,7 +105,7 @@ inline void dtype_specialized_elementwise_fn_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { + std::array inputs_data_ptrs = { inputs.first->template const_data_ptr()...}; CTYPE_OUT* const data_out = out.mutable_data_ptr(); @@ -119,11 +119,11 @@ inline void dtype_specialized_elementwise_fn_impl( // small-sized tests will test whether using Vectorized broke our // lambda. #ifndef NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #else // NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { + for (const auto input_idx : c10::irange(sizeof...(inputs))) { loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; } #ifndef NDEBUG @@ -136,8 +136,8 @@ inline void dtype_specialized_elementwise_fn_impl( // Main vectorized loop. for (auto idx = vectorized_begin; idx < vectorized_end; idx += Vec::size()) { - std::array loaded_vec_inputs{}; - for (const auto input_idx : c10::irange(kNumInputs)) { + std::array loaded_vec_inputs{}; + for (const auto input_idx : c10::irange(sizeof...(inputs))) { loaded_vec_inputs[input_idx] = Vec::loadu(&inputs_data_ptrs[input_idx][idx]); } @@ -148,11 +148,11 @@ inline void dtype_specialized_elementwise_fn_impl( // Scalar epilogue. 
for (const auto idx : c10::irange(vectorized_end, end)) { #ifndef NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #else // NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { + for (const auto input_idx : c10::irange(sizeof...(inputs))) { loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; } #ifndef NDEBUG @@ -172,20 +172,20 @@ inline void dtype_specialized_elementwise_fn_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { + std::array inputs_data_ptrs = { inputs.first->template const_data_ptr()...}; CTYPE_OUT* const data_out = out.mutable_data_ptr(); const auto range = - BroadcastIndexesRange( + BroadcastIndexesRange( out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { const auto& indexes = *begin_it; - std::array loaded_inputs{}; - for (const auto idx : c10::irange(kNumInputs)) { + std::array loaded_inputs{}; + for (const auto idx : c10::irange(sizeof...(inputs))) { loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]]; } data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs); @@ -236,7 +236,7 @@ inline void apply_elementwise_fn_generic_impl( const char* data_ptr; ssize_t element_size; }; - std::array inputs_info = {(InputInfo{ + std::array inputs_info = {(InputInfo{ internal::get_load_to_compute_fn( ctx, *inputs.first, inputs.second), reinterpret_cast(inputs.first->const_data_ptr()), @@ -255,14 +255,14 @@ inline void apply_elementwise_fn_generic_impl( ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { const auto range = - BroadcastIndexesRange( + BroadcastIndexesRange( out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { const auto& indexes = *begin_it; - std::array 
loaded_inputs{}; - for (const auto idx : c10::irange(kNumInputs)) { + std::array loaded_inputs{}; + for (const auto idx : c10::irange(sizeof...(inputs))) { const auto& input_info = inputs_info[idx]; loaded_inputs[idx] = input_info.load_to_compute( &input_info From 504888f7dc3663758c5c1d5d196368d847f885bb Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 20:15:24 -0700 Subject: [PATCH 02/19] update cmakelists --- kernels/optimized/CMakeLists.txt | 5 ++++- kernels/portable/CMakeLists.txt | 2 +- kernels/portable/cpu/util/CMakeLists.txt | 5 ++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index f87e2c8d722..cc2aa305ffd 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -21,7 +21,10 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -set(_common_compile_options -Wno-deprecated-declarations) +set(_common_compile_options + $<$:/wd4996> + $<$>:-Wno-deprecated-declarations -fPIC> +) # Note for apple platform we can rely on Accelerate framework Will come back to # this diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 5004cb2520e..8cedd154318 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -23,7 +23,7 @@ endif() set(_common_compile_options $<$:/wd4996> - $<$>:-Wno-deprecated-declarations> + $<$>:-Wno-deprecated-declarations -fPIC> ) include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) diff --git a/kernels/portable/cpu/util/CMakeLists.txt b/kernels/portable/cpu/util/CMakeLists.txt index 047760f321e..2475284fe41 100644 --- a/kernels/portable/cpu/util/CMakeLists.txt +++ b/kernels/portable/cpu/util/CMakeLists.txt @@ -21,7 +21,10 @@ endif() list(TRANSFORM _kernels_util_all_deps__srcs PREPEND "${EXECUTORCH_ROOT}/") -set(_common_compile_options -Wno-deprecated-declarations) +set(_common_compile_options + $<$:/wd4996> + 
$<$>:-Wno-deprecated-declarations -fPIC> +) add_library(kernels_util_all_deps ${_kernels_util_all_deps__srcs}) target_link_libraries(kernels_util_all_deps PRIVATE executorch_core) From 37624cb5a02a54d992c26ff76d8b9e216432fb9f Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 21:18:18 -0700 Subject: [PATCH 03/19] msvc really hates constexpr variables I guess --- kernels/optimized/cpu/op_bmm.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernels/optimized/cpu/op_bmm.cpp b/kernels/optimized/cpu/op_bmm.cpp index 9cbd30cb6e1..97bbc3aa7d5 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -150,15 +150,14 @@ Tensor& opt_bmm_out( ET_KERNEL_CHECK( ctx, check_bmm_out_args(self, mat2, out), InvalidArgument, out); - constexpr auto name = "bmm.out"; auto self_type = self.scalar_type(); if (executorch::runtime::isComplexType(self_type)) { - ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() { + ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() { bmm_kernel(self, mat2, out); }); } else { - ET_SWITCH_REALHBF16_TYPES(self_type, ctx, name, CTYPE, [&]() { + ET_SWITCH_REALHBF16_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() { bmm_kernel(self, mat2, out); }); } From 448f941ca437c550c6c0426650f6ad48280d50f4 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 21:53:09 -0700 Subject: [PATCH 04/19] statement expression -> dowhile --- runtime/core/exec_aten/util/tensor_util.h | 42 +++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 67a813f8d34..8392264da2d 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -66,7 +66,7 @@ * dimension of all the tensors as the upper bound for the for loop. 
*/ #define ET_CHECK_SAME_SHAPE2(a__, b__) \ - ({ \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t a_dim__ = (a__).dim(); \ @@ -89,10 +89,10 @@ a_size__, \ b_size__); \ } \ - }) + } while (0) #define ET_CHECK_SAME_SHAPE3(a__, b__, c__) \ - ({ \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t c_numel__ = (c__).numel(); \ @@ -124,11 +124,11 @@ b_size__, \ c_size__); \ } \ - }) + } while (0) /// Asserts that all tensors have the same dtype. #define ET_CHECK_SAME_DTYPE2(a__, b__) \ - ({ \ + do { \ const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \ const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \ ET_CHECK_MSG( \ @@ -136,10 +136,10 @@ ET_TENSOR_CHECK_PREFIX__ ": dtype={%" PRId8 ", %" PRId8 "}", \ static_cast(a_type__), \ static_cast(b_type__)); \ - }) + } while (0) #define ET_CHECK_SAME_DTYPE3(a__, b__, c__) \ - ({ \ + do { \ const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \ const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \ const ::executorch::aten::ScalarType c_type__ = (c__).scalar_type(); \ @@ -150,7 +150,7 @@ static_cast(a_type__), \ static_cast(b_type__), \ static_cast(c_type__)); \ - }) + } while (0) /** * Asserts that all tensors have the same shape and dtype. @@ -159,7 +159,7 @@ * macros independently, because it only calls ET_CHECK_MSG once. 
*/ #define ET_CHECK_SAME_SHAPE_AND_DTYPE2(a__, b__) \ - ({ \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t a_dim__ = (a__).dim(); \ @@ -189,10 +189,10 @@ a_size__, \ b_size__); \ } \ - }) + } while (0) #define ET_CHECK_SAME_SHAPE_AND_DTYPE3(a__, b__, c__) \ - ({ \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t c_numel__ = (c__).numel(); \ @@ -233,13 +233,13 @@ b_size__, \ c_size__); \ } \ - }) + } while (0) /** * Assert that the input tensor is contiguous tensor. */ #define ET_CHECK_CONTIGUOUS(a__) \ - ({ \ + do { \ const ::executorch::aten::ArrayRef \ strides = a__.strides(); \ const ::executorch::aten::ArrayRef sizes = \ @@ -260,7 +260,7 @@ strides[i - 1], \ strides[i] * sizes[i]); \ } \ - }) + } while (0) /** * Assert the input two tensors share same strides. @@ -268,7 +268,7 @@ * of any input tensors. */ #define ET_CHECK_SAME_STRIDES2(a__, b__) \ - ({ \ + do { \ ET_CHECK_MSG( \ a__.dim() == b__.dim(), \ "Two tensors shall have same number of strides, but not %zu and %zu.", \ @@ -288,7 +288,7 @@ (int32_t)a_strides[i], \ (int32_t)b_strides[i]); \ } \ - }) + } while (0) /** * Assert the input three tensors share same strides. @@ -296,7 +296,7 @@ * of any input tensors. 
*/ #define ET_CHECK_SAME_STRIDES3(a__, b__, c__) \ - ({ \ + do { \ ET_CHECK_MSG( \ a__.dim() == b__.dim() && b__.dim() == c__.dim(), \ "Three tensors shall have same number of strides, " \ @@ -322,17 +322,17 @@ (int32_t)b_strides[i], \ (int32_t)c_strides[i]); \ } \ - }) + } while (0) #define ET_CHECK_DEFAULT_OR_CHANNELSLAST_DIMORDER(t__) \ - ({ \ + do { \ ET_CHECK_MSG( \ is_contiguous_dim_order( \ t__.dim_order().data(), t__.dim_order().size()) || \ is_channels_last_dim_order( \ t__.dim_order().data(), t__.dim_order().size()), \ "Tensor must have default or channels last dim order"); \ - }) + } while (0) /** * DEPRECATED: Please use ET_CHECK_OR_RETURN_FALSE instead and provide @@ -1218,7 +1218,7 @@ ET_NODISCARD inline Error resize_tensor( } return internal::resize_tensor_impl( - t.unsafeGetTensorImpl(), {new_sizes_casted.data(), new_sizes_ndim}); + t.unsafeGetTensorImpl(), {new_sizes_casted.data(), new_sizes_ndim} while (0); } /// DEPRECATED: Use `resize_tensor()` instead, which can fail non-fatally. From 14ec2905c28f082d4d4de5f647a84ba90b6b8383 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 22:04:44 -0700 Subject: [PATCH 05/19] one }) was ok --- runtime/core/exec_aten/util/tensor_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 8392264da2d..129fa25ae54 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -1218,7 +1218,7 @@ ET_NODISCARD inline Error resize_tensor( } return internal::resize_tensor_impl( - t.unsafeGetTensorImpl(), {new_sizes_casted.data(), new_sizes_ndim} while (0); + t.unsafeGetTensorImpl(), {new_sizes_casted.data(), new_sizes_ndim}); } /// DEPRECATED: Use `resize_tensor()` instead, which can fail non-fatally. 
From 6c39e595c3ed28074ebeeeffe388eaa598ecdf9a Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 22:36:07 -0700 Subject: [PATCH 06/19] op name --- kernels/portable/cpu/op_masked_scatter.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/cpu/op_masked_scatter.cpp b/kernels/portable/cpu/op_masked_scatter.cpp index 16cef033670..31b33fa6cd5 100644 --- a/kernels/portable/cpu/op_masked_scatter.cpp +++ b/kernels/portable/cpu/op_masked_scatter.cpp @@ -41,13 +41,11 @@ Tensor& masked_scatter_out( InvalidArgument, out); - constexpr auto op_name = "masked_scatter.out"; - int64_t idx = 0; int64_t src_numel = src.numel(); bool src_numel_check = true; - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE, [&]() { + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "masked_scatter.out", CTYPE, [&]() { const CTYPE* const src_data = src.const_data_ptr(); apply_binary_elementwise_fn( [src_data, &idx, &src_numel, &src_numel_check]( From cc0c14ffbdc6b7ed6a948c17ac3903026918ff55 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 23:08:28 -0700 Subject: [PATCH 07/19] name in real_copy and attempt at fixing lambda ternary issue in topk --- kernels/portable/cpu/op_topk.cpp | 6 ++++-- kernels/portable/cpu/op_view_as_real_copy.cpp | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index bdea02f83bc..5a16ff8685b 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -118,10 +118,12 @@ void perform_topk( } // Perform topk on the queue - const auto elem_greater = [](const elem_t& x, const elem_t& y) -> bool { + const std::function elem_greater = + [](const elem_t& x, const elem_t& y) -> bool { return float_less_than(y.first, x.first); }; - const auto elem_less = [](const elem_t& x, const elem_t& y) -> bool { + const std::function elem_less = + [](const elem_t& x, const elem_t& y) -> bool { return 
float_less_than(x.first, y.first); }; const auto cmp = largest ? elem_greater : elem_less; diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp index 4a2803eded0..e016661c9f4 100644 --- a/kernels/portable/cpu/op_view_as_real_copy.cpp +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -64,10 +64,9 @@ Tensor& view_as_real_copy_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); - constexpr auto op_name = "view_as_real_copy.out"; - ET_SWITCH_COMPLEXH_TYPES(self.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + ET_SWITCH_COMPLEX_TYPES(self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] { _to_impl(self, out); }); }); From 9d97fd9b5b8f11610344311ffdab32e4b9092194 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Thu, 16 Oct 2025 23:09:59 -0700 Subject: [PATCH 08/19] maybe the include fixes topk --- kernels/portable/cpu/op_topk.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index 5a16ff8685b..19ae675b040 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include From d64d969f108bcae3c79f70351e7773c9406db301 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 11:27:20 -0700 Subject: [PATCH 09/19] remove unused var --- kernels/portable/cpu/util/elementwise_util.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 3e3522a52c6..cd2a8395304 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -85,7 +85,6 @@ inline void dtype_specialized_elementwise_fn_impl( static_assert( 
(std::is_same_v> && ...)); - constexpr auto kNumInputs = sizeof...(inputs); // All inputs must be of type CTYPE_COMPUTE. ET_DCHECK( ((inputs.first->scalar_type() == @@ -229,8 +228,6 @@ inline void apply_elementwise_fn_generic_impl( const Tensor& out, SupportedTensorDtypes out_dtypes, Args... inputs) { - constexpr auto kNumInputs = sizeof...(inputs); - struct InputInfo { load_to_compute_fn load_to_compute; const char* data_ptr; From 162f018dc41053dabd3dc45a83be499753411e3b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 11:36:43 -0700 Subject: [PATCH 10/19] lint --- kernels/portable/cpu/op_view_as_real_copy.cpp | 13 ++++++------ kernels/portable/cpu/util/elementwise_util.h | 17 ++++++++-------- runtime/core/exec_aten/util/tensor_util.h | 20 +++++++++---------- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp index e016661c9f4..fd1721da7cf 100644 --- a/kernels/portable/cpu/op_view_as_real_copy.cpp +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -64,12 +64,13 @@ Tensor& view_as_real_copy_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); - - ET_SWITCH_COMPLEX_TYPES(self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] { - _to_impl(self, out); - }); - }); + ET_SWITCH_COMPLEX_TYPES( + self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES( + out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] { + _to_impl(self, out); + }); + }); return out; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index cd2a8395304..c70f9a75245 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -104,8 +104,9 @@ inline void 
dtype_specialized_elementwise_fn_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; + std::array + inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; CTYPE_OUT* const data_out = out.mutable_data_ptr(); @@ -176,9 +177,9 @@ inline void dtype_specialized_elementwise_fn_impl( CTYPE_OUT* const data_out = out.mutable_data_ptr(); - const auto range = - BroadcastIndexesRange( - out, (*inputs.first)...); + const auto range = BroadcastIndexesRange< + sizeof...(inputs), + support_noncontiguous_tensors>(out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { @@ -251,9 +252,9 @@ inline void apply_elementwise_fn_generic_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - const auto range = - BroadcastIndexesRange( - out, (*inputs.first)...); + const auto range = BroadcastIndexesRange< + sizeof...(inputs), + support_noncontiguous_tensors>(out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { diff --git a/runtime/core/exec_aten/util/tensor_util.h b/runtime/core/exec_aten/util/tensor_util.h index 129fa25ae54..9b490da244c 100644 --- a/runtime/core/exec_aten/util/tensor_util.h +++ b/runtime/core/exec_aten/util/tensor_util.h @@ -66,7 +66,7 @@ * dimension of all the tensors as the upper bound for the for loop. 
*/ #define ET_CHECK_SAME_SHAPE2(a__, b__) \ - do { \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t a_dim__ = (a__).dim(); \ @@ -92,7 +92,7 @@ } while (0) #define ET_CHECK_SAME_SHAPE3(a__, b__, c__) \ - do { \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t c_numel__ = (c__).numel(); \ @@ -128,7 +128,7 @@ /// Asserts that all tensors have the same dtype. #define ET_CHECK_SAME_DTYPE2(a__, b__) \ - do { \ + do { \ const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \ const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \ ET_CHECK_MSG( \ @@ -139,7 +139,7 @@ } while (0) #define ET_CHECK_SAME_DTYPE3(a__, b__, c__) \ - do { \ + do { \ const ::executorch::aten::ScalarType a_type__ = (a__).scalar_type(); \ const ::executorch::aten::ScalarType b_type__ = (b__).scalar_type(); \ const ::executorch::aten::ScalarType c_type__ = (c__).scalar_type(); \ @@ -159,7 +159,7 @@ * macros independently, because it only calls ET_CHECK_MSG once. */ #define ET_CHECK_SAME_SHAPE_AND_DTYPE2(a__, b__) \ - do { \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t a_dim__ = (a__).dim(); \ @@ -192,7 +192,7 @@ } while (0) #define ET_CHECK_SAME_SHAPE_AND_DTYPE3(a__, b__, c__) \ - do { \ + do { \ const size_t a_numel__ = (a__).numel(); \ const size_t b_numel__ = (b__).numel(); \ const size_t c_numel__ = (c__).numel(); \ @@ -239,7 +239,7 @@ * Assert that the input tensor is contiguous tensor. */ #define ET_CHECK_CONTIGUOUS(a__) \ - do { \ + do { \ const ::executorch::aten::ArrayRef \ strides = a__.strides(); \ const ::executorch::aten::ArrayRef sizes = \ @@ -268,7 +268,7 @@ * of any input tensors. 
*/ #define ET_CHECK_SAME_STRIDES2(a__, b__) \ - do { \ + do { \ ET_CHECK_MSG( \ a__.dim() == b__.dim(), \ "Two tensors shall have same number of strides, but not %zu and %zu.", \ @@ -296,7 +296,7 @@ * of any input tensors. */ #define ET_CHECK_SAME_STRIDES3(a__, b__, c__) \ - do { \ + do { \ ET_CHECK_MSG( \ a__.dim() == b__.dim() && b__.dim() == c__.dim(), \ "Three tensors shall have same number of strides, " \ @@ -325,7 +325,7 @@ } while (0) #define ET_CHECK_DEFAULT_OR_CHANNELSLAST_DIMORDER(t__) \ - do { \ + do { \ ET_CHECK_MSG( \ is_contiguous_dim_order( \ t__.dim_order().data(), t__.dim_order().size()) || \ From 20c44b075f7af0c7171a7968102702ec0b7a656e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 11:52:31 -0700 Subject: [PATCH 11/19] no std::function --- kernels/portable/cpu/op_topk.cpp | 47 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index 19ae675b040..2c0ef906e10 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -6,15 +6,14 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include -#include -#include - #include #include #include +#include +#include +#include + namespace torch { namespace executor { namespace native { @@ -119,21 +118,31 @@ void perform_topk( } // Perform topk on the queue - const std::function elem_greater = - [](const elem_t& x, const elem_t& y) -> bool { - return float_less_than(y.first, x.first); - }; - const std::function elem_less = - [](const elem_t& x, const elem_t& y) -> bool { - return float_less_than(x.first, y.first); - }; - const auto cmp = largest ? 
elem_greater : elem_less; - if (use_partial_sort) { - std::partial_sort(queue, queue + k, queue + dim_size, cmp); + if (largest) { + const auto elem_greater = + [](const elem_t& x, const elem_t& y) -> bool { + return float_less_than(y.first, x.first); + }; + if (use_partial_sort) { + std::partial_sort(queue, queue + k, queue + dim_size, elem_greater); + } else { + std::nth_element(queue, queue + k - 1, queue + dim_size, elem_greater); + if (sorted) { + std::sort(queue, queue + k - 1, elem_greater); + } + } } else { - std::nth_element(queue, queue + k - 1, queue + dim_size, cmp); - if (sorted) { - std::sort(queue, queue + k - 1, cmp); + const auto elem_less = + [](const elem_t& x, const elem_t& y) -> bool { + return float_less_than(x.first, y.first); + }; + if (use_partial_sort) { + std::partial_sort(queue, queue + k, queue + dim_size, elem_less); + } else { + std::nth_element(queue, queue + k - 1, queue + dim_size, elem_less); + if (sorted) { + std::sort(queue, queue + k - 1, elem_less); + } } } From a6bfcbc6455827a09aeccd6e11a068109cc5bd0e Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 13:18:59 -0700 Subject: [PATCH 12/19] remove std::function and complex->complexh --- kernels/portable/cpu/op_topk.cpp | 9 ++++----- kernels/portable/cpu/op_view_as_real_copy.cpp | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index 2c0ef906e10..e356a376703 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -119,21 +119,20 @@ void perform_topk( // Perform topk on the queue if (largest) { - const auto elem_greater = - [](const elem_t& x, const elem_t& y) -> bool { + const auto elem_greater = [](const elem_t& x, const elem_t& y) -> bool { return float_less_than(y.first, x.first); }; if (use_partial_sort) { std::partial_sort(queue, queue + k, queue + dim_size, elem_greater); } else { - std::nth_element(queue, queue + k - 1, queue + dim_size, 
elem_greater); + std::nth_element( + queue, queue + k - 1, queue + dim_size, elem_greater); if (sorted) { std::sort(queue, queue + k - 1, elem_greater); } } } else { - const auto elem_less = - [](const elem_t& x, const elem_t& y) -> bool { + const auto elem_less = [](const elem_t& x, const elem_t& y) -> bool { return float_less_than(x.first, y.first); }; if (use_partial_sort) { diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp index fd1721da7cf..6df1704e2c8 100644 --- a/kernels/portable/cpu/op_view_as_real_copy.cpp +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -64,7 +64,7 @@ Tensor& view_as_real_copy_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); - ET_SWITCH_COMPLEX_TYPES( + ET_SWITCH_COMPLEXH_TYPES( self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] { ET_SWITCH_FLOATH_TYPES( out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] { From fda2b176f84035d6e4bf9bb75d20ab0051661e4c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 13:20:01 -0700 Subject: [PATCH 13/19] fix compiler flag --- extension/llm/tokenizers | 2 +- kernels/portable/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index b2a3d77d672..d710a0cf10c 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit b2a3d77d672d50f24d0ae7cd6df3b428469c2548 +Subproject commit d710a0cf10cfa8cb7ffda33c4e61af63119bc95f diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 8cedd154318..5004cb2520e 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -23,7 +23,7 @@ endif() set(_common_compile_options $<$:/wd4996> - $<$>:-Wno-deprecated-declarations -fPIC> + $<$>:-Wno-deprecated-declarations> ) include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) From 
653317000246f97fee577600eba72e1ad4fc78e8 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 13:20:35 -0700 Subject: [PATCH 14/19] fix compiler flag --- kernels/optimized/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index cc2aa305ffd..01a10f77846 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -23,7 +23,7 @@ endif() set(_common_compile_options $<$:/wd4996> - $<$>:-Wno-deprecated-declarations -fPIC> + $<$>:-Wno-deprecated-declarations> ) # Note for apple platform we can rely on Accelerate framework Will come back to From d94e032cb267a9e983b54edd9976bf7152cb87fe Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 13:21:30 -0700 Subject: [PATCH 15/19] fix compiler flag --- kernels/portable/cpu/util/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernels/portable/cpu/util/CMakeLists.txt b/kernels/portable/cpu/util/CMakeLists.txt index 2475284fe41..eabf3add9b0 100644 --- a/kernels/portable/cpu/util/CMakeLists.txt +++ b/kernels/portable/cpu/util/CMakeLists.txt @@ -23,7 +23,7 @@ list(TRANSFORM _kernels_util_all_deps__srcs PREPEND "${EXECUTORCH_ROOT}/") set(_common_compile_options $<$:/wd4996> - $<$>:-Wno-deprecated-declarations -fPIC> + $<$>:-Wno-deprecated-declarations> ) add_library(kernels_util_all_deps ${_kernels_util_all_deps__srcs}) From 785782148e5bc8e1b9409c4215fda6f46fb184a0 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 17 Oct 2025 13:24:22 -0700 Subject: [PATCH 16/19] remove tokenizers change from pr --- extension/llm/tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index d710a0cf10c..b2a3d77d672 160000 --- a/extension/llm/tokenizers +++ b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit d710a0cf10cfa8cb7ffda33c4e61af63119bc95f +Subproject commit 
b2a3d77d672d50f24d0ae7cd6df3b428469c2548 From 83cf868b647ad1a94baf4f2d962b22cf930b3893 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 20 Oct 2025 12:12:42 -0700 Subject: [PATCH 17/19] add static --- kernels/portable/cpu/op_convolution_backward.cpp | 2 +- kernels/portable/cpu/op_gather.cpp | 2 +- kernels/portable/cpu/op_max.cpp | 2 +- kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp | 2 +- kernels/portable/cpu/op_min.cpp | 2 +- kernels/portable/cpu/op_native_batch_norm.cpp | 4 ++-- kernels/portable/cpu/op_native_group_norm.cpp | 2 +- kernels/portable/cpu/op_pdist_forward.cpp | 2 +- kernels/portable/cpu/op_prod.cpp | 4 ++-- kernels/portable/cpu/op_reflection_pad1d.cpp | 2 +- kernels/portable/cpu/op_reflection_pad2d.cpp | 2 +- kernels/portable/cpu/op_reflection_pad3d.cpp | 2 +- kernels/portable/cpu/op_repeat_interleave.cpp | 2 +- kernels/portable/cpu/op_replication_pad1d.cpp | 2 +- kernels/portable/cpu/op_replication_pad2d.cpp | 2 +- kernels/portable/cpu/op_replication_pad3d.cpp | 2 +- kernels/portable/cpu/op_roll.cpp | 2 +- kernels/portable/cpu/op_scalar_tensor.cpp | 2 +- kernels/portable/cpu/op_scatter.cpp | 4 ++-- kernels/portable/cpu/op_topk.cpp | 2 +- kernels/portable/cpu/op_var.cpp | 4 ++-- 21 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kernels/portable/cpu/op_convolution_backward.cpp b/kernels/portable/cpu/op_convolution_backward.cpp index 2535ed4eb6c..ffe9ed57b41 100644 --- a/kernels/portable/cpu/op_convolution_backward.cpp +++ b/kernels/portable/cpu/op_convolution_backward.cpp @@ -305,7 +305,7 @@ std::tuple convolution_backward_out( ret_val); } - constexpr auto name = "convolution_backward.out"; + static constexpr auto name = "convolution_backward.out"; ET_SWITCH_FLOATHBF16_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { conv2d_backward_impl( diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp index 02ea502ca63..a42256ac4fc 100644 --- a/kernels/portable/cpu/op_gather.cpp +++ 
b/kernels/portable/cpu/op_gather.cpp @@ -85,7 +85,7 @@ Tensor& gather_out( InvalidArgument, out); - constexpr auto name = "gather.out"; + static constexpr auto name = "gather.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { gather_helper(in, index, out, dim); diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 467c8ccffd5..38ed50317d1 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -124,7 +124,7 @@ max_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK(ctx, canCast(in_type, out_type), InvalidArgument, out); - constexpr auto name = "max.unary_out"; + static constexpr auto name = "max.unary_out"; ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { diff --git a/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp b/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp index 2013d1272d9..99dc8a89293 100644 --- a/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp +++ b/kernels/portable/cpu/op_max_pool2d_with_indices_backward.cpp @@ -169,7 +169,7 @@ Tensor& max_pool2d_with_indices_backward_out( InvalidArgument, grad_input); - constexpr auto name = "max_pool2d_with_indices_backward.grad_input"; + static constexpr auto name = "max_pool2d_with_indices_backward.grad_input"; ET_SWITCH_FLOATHBF16_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { max_pool_backward_impl(grad_input, grad_output, indices); diff --git a/kernels/portable/cpu/op_min.cpp b/kernels/portable/cpu/op_min.cpp index 304321bb9f8..711774d4cce 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -124,7 +124,7 @@ min_unary_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ET_KERNEL_CHECK(ctx, canCast(in_type, out_type), InvalidArgument, out); - constexpr auto name = "min.unary_out"; + static constexpr auto name = "min.unary_out"; 
ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { diff --git a/kernels/portable/cpu/op_native_batch_norm.cpp b/kernels/portable/cpu/op_native_batch_norm.cpp index aa6919924f1..b8905e0ef35 100644 --- a/kernels/portable/cpu/op_native_batch_norm.cpp +++ b/kernels/portable/cpu/op_native_batch_norm.cpp @@ -102,7 +102,7 @@ std::tuple _native_batch_norm_legit_no_training_out( size_t outer = getLeadingDims(in, C_dim); size_t inner = getTrailingDims(in, C_dim); - constexpr auto name = "native_batch_norm_legit_no_training.out"; + static constexpr auto name = "native_batch_norm_legit_no_training.out"; ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { const CTYPE* in_data = in.const_data_ptr(); @@ -259,7 +259,7 @@ std::tuple _native_batch_norm_legit_no_stats_out( InvalidArgument, ret_val); - constexpr auto name = "_native_batch_norm_legit.no_stats_out"; + static constexpr auto name = "_native_batch_norm_legit.no_stats_out"; ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { const CTYPE* in_data = in.const_data_ptr(); diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp index 7882204e57e..9e300dc7829 100644 --- a/kernels/portable/cpu/op_native_group_norm.cpp +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -190,7 +190,7 @@ std::tuple native_group_norm_out( ret_val); } - constexpr auto name = "native_group_norm.out"; + static constexpr auto name = "native_group_norm.out"; ET_SWITCH_FLOATHBF16_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { group_norm( diff --git a/kernels/portable/cpu/op_pdist_forward.cpp b/kernels/portable/cpu/op_pdist_forward.cpp index e412e43aa0c..f4093260ff6 100644 --- a/kernels/portable/cpu/op_pdist_forward.cpp +++ b/kernels/portable/cpu/op_pdist_forward.cpp @@ -40,7 +40,7 @@ Tensor& _pdist_forward_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = 
"_pdist_forward.out"; + static constexpr auto name = "_pdist_forward.out"; ET_SWITCH_FLOATHBF16_TYPES( in_type, ctx, name, CTYPE, [&] { pdist(in, out, p); }); diff --git a/kernels/portable/cpu/op_prod.cpp b/kernels/portable/cpu/op_prod.cpp index 54580459d7c..ba76a1f200c 100644 --- a/kernels/portable/cpu/op_prod.cpp +++ b/kernels/portable/cpu/op_prod.cpp @@ -32,7 +32,7 @@ Tensor& prod_out( ScalarType in_type = in.scalar_type(); ScalarType out_type = out.scalar_type(); - constexpr auto name = "prod.int_out"; + static constexpr auto name = "prod.int_out"; ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { @@ -72,7 +72,7 @@ Tensor& prod_int_out( ScalarType in_type = in.scalar_type(); ScalarType out_type = out.scalar_type(); - constexpr auto name = "prod.int_out"; + static constexpr auto name = "prod.int_out"; ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { diff --git a/kernels/portable/cpu/op_reflection_pad1d.cpp b/kernels/portable/cpu/op_reflection_pad1d.cpp index 5f1b68e210d..a591368f44e 100644 --- a/kernels/portable/cpu/op_reflection_pad1d.cpp +++ b/kernels/portable/cpu/op_reflection_pad1d.cpp @@ -44,7 +44,7 @@ Tensor& reflection_pad1d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "reflection_pad1d.out"; + static constexpr auto name = "reflection_pad1d.out"; ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad1d(reflection_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_reflection_pad2d.cpp b/kernels/portable/cpu/op_reflection_pad2d.cpp index 821d09253c9..6ef3ad7bff6 100644 --- a/kernels/portable/cpu/op_reflection_pad2d.cpp +++ b/kernels/portable/cpu/op_reflection_pad2d.cpp @@ -44,7 +44,7 @@ Tensor& reflection_pad2d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "reflection_pad2d.out"; + static constexpr auto name = "reflection_pad2d.out"; 
ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad2d(reflection_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_reflection_pad3d.cpp b/kernels/portable/cpu/op_reflection_pad3d.cpp index cb0dd39a071..57338dd47d3 100644 --- a/kernels/portable/cpu/op_reflection_pad3d.cpp +++ b/kernels/portable/cpu/op_reflection_pad3d.cpp @@ -44,7 +44,7 @@ Tensor& reflection_pad3d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "reflection_pad3d.out"; + static constexpr auto name = "reflection_pad3d.out"; ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad3d(reflection_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_repeat_interleave.cpp b/kernels/portable/cpu/op_repeat_interleave.cpp index 50da02c5646..a7b9f18f434 100644 --- a/kernels/portable/cpu/op_repeat_interleave.cpp +++ b/kernels/portable/cpu/op_repeat_interleave.cpp @@ -72,7 +72,7 @@ Tensor& repeat_interleave_Tensor_out( int64_t repeats_sum = 0; - constexpr auto name = "repeat_interleave.Tensor_out"; + static constexpr auto name = "repeat_interleave.Tensor_out"; ET_SWITCH_TWO_TYPES(Int, Long, repeats.scalar_type(), ctx, name, CTYPE, [&] { const CTYPE* repeats_data = repeats.const_data_ptr(); diff --git a/kernels/portable/cpu/op_replication_pad1d.cpp b/kernels/portable/cpu/op_replication_pad1d.cpp index 0b38c4f1540..5d4b907adac 100644 --- a/kernels/portable/cpu/op_replication_pad1d.cpp +++ b/kernels/portable/cpu/op_replication_pad1d.cpp @@ -36,7 +36,7 @@ Tensor& replication_pad1d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "replication_pad1d.out"; + static constexpr auto name = "replication_pad1d.out"; ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad1d(replication_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_replication_pad2d.cpp b/kernels/portable/cpu/op_replication_pad2d.cpp index e3d79644db7..693f4df8636 100644 --- a/kernels/portable/cpu/op_replication_pad2d.cpp +++ b/kernels/portable/cpu/op_replication_pad2d.cpp 
@@ -36,7 +36,7 @@ Tensor& replication_pad2d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "replication_pad2d.out"; + static constexpr auto name = "replication_pad2d.out"; ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad2d(replication_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_replication_pad3d.cpp b/kernels/portable/cpu/op_replication_pad3d.cpp index f23bde05beb..12e82d53167 100644 --- a/kernels/portable/cpu/op_replication_pad3d.cpp +++ b/kernels/portable/cpu/op_replication_pad3d.cpp @@ -36,7 +36,7 @@ Tensor& replication_pad3d_out( out); ScalarType in_type = in.scalar_type(); - constexpr auto name = "replication_pad3d.out"; + static constexpr auto name = "replication_pad3d.out"; ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { pad3d(replication_ix, in, out, padding); diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp index 109be64fbed..4d314b3d191 100644 --- a/kernels/portable/cpu/op_roll.cpp +++ b/kernels/portable/cpu/op_roll.cpp @@ -80,7 +80,7 @@ Tensor& roll_out( size_t dim_shift_array_length = static_cast(in.dim()); // NOLINT IntArrayRef dim_shifts(dim_shift_array, dim_shift_array_length); - constexpr auto name = "roll.out"; + static constexpr auto name = "roll.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { const CTYPE* in_data = in.const_data_ptr(); diff --git a/kernels/portable/cpu/op_scalar_tensor.cpp b/kernels/portable/cpu/op_scalar_tensor.cpp index bff4ecc318c..8136400a18f 100644 --- a/kernels/portable/cpu/op_scalar_tensor.cpp +++ b/kernels/portable/cpu/op_scalar_tensor.cpp @@ -22,7 +22,7 @@ scalar_tensor_out(KernelRuntimeContext& ctx, const Scalar& s, Tensor& out) { ScalarType out_type = out.scalar_type(); - constexpr auto name = "scalar_tensor.out"; + static constexpr auto name = "scalar_tensor.out"; ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE, [&]() { auto opt_val_casted = utils::internal::check_overflow_scalar_cast(s); diff --git 
a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp index 42d40c8284d..a93839193fe 100644 --- a/kernels/portable/cpu/op_scatter.cpp +++ b/kernels/portable/cpu/op_scatter.cpp @@ -119,7 +119,7 @@ Tensor& scatter_src_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - constexpr auto name = "scatter.src_out"; + static constexpr auto name = "scatter.src_out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { scatter_src_helper(in, dim, index, src, out); @@ -146,7 +146,7 @@ Tensor& scatter_value_out( ET_KERNEL_CHECK( ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - constexpr auto name = "scatter.value_out"; + static constexpr auto name = "scatter.value_out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { auto opt_val = utils::internal::check_overflow_scalar_cast(value); diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index e356a376703..55453264af4 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -198,7 +198,7 @@ std::tuple topk_values( InvalidArgument, out); - constexpr auto name = "topk.values"; + static constexpr auto name = "topk.values"; if (in.numel() == 0 || (k == 0 && in.dim() > 0)) { return out; diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index fcaa79a54fe..202d7df80bc 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -91,7 +91,7 @@ Tensor& var_out( const size_t num = get_reduced_dim_product(in, dim_list); const size_t denom = unbiased ? 
num - 1 : num; - constexpr auto name = "var.out"; + static constexpr auto name = "var.out"; ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { @@ -123,7 +123,7 @@ Tensor& var_correction_out( InvalidArgument, out); - constexpr auto name = "var.correction_out"; + static constexpr auto name = "var.correction_out"; double correction_val = 1; if (correction.has_value()) { From 904b2b434babd71019b87f1d06796eee61593d7b Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 20 Oct 2025 12:27:53 -0700 Subject: [PATCH 18/19] more statics and explicit typing for lambdas --- kernels/optimized/cpu/op_bmm.cpp | 6 ++- kernels/portable/cpu/op_masked_scatter.cpp | 4 +- kernels/portable/cpu/op_topk.cpp | 38 ++++++---------- kernels/portable/cpu/util/elementwise_util.h | 48 ++++++++++---------- 4 files changed, 47 insertions(+), 49 deletions(-) diff --git a/kernels/optimized/cpu/op_bmm.cpp b/kernels/optimized/cpu/op_bmm.cpp index 97bbc3aa7d5..51bfb2dfc3b 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -152,12 +152,14 @@ Tensor& opt_bmm_out( auto self_type = self.scalar_type(); + static constexpr auto name = "bmm.out"; + if (executorch::runtime::isComplexType(self_type)) { - ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() { + ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() { bmm_kernel(self, mat2, out); }); } else { - ET_SWITCH_REALHBF16_TYPES(self_type, ctx, "bmm.out", CTYPE, [&]() { + ET_SWITCH_REALHBF16_TYPES(self_type, ctx, name, CTYPE, [&]() { bmm_kernel(self, mat2, out); }); } diff --git a/kernels/portable/cpu/op_masked_scatter.cpp b/kernels/portable/cpu/op_masked_scatter.cpp index 31b33fa6cd5..e91a8d0a08d 100644 --- a/kernels/portable/cpu/op_masked_scatter.cpp +++ b/kernels/portable/cpu/op_masked_scatter.cpp @@ -45,7 +45,9 @@ Tensor& masked_scatter_out( int64_t src_numel = src.numel(); bool src_numel_check = true; 
- ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "masked_scatter.out", CTYPE, [&]() { + static constexpr auto name = "masked_scatter.out"; + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, name, CTYPE, [&]() { const CTYPE* const src_data = src.const_data_ptr(); apply_binary_elementwise_fn( [src_data, &idx, &src_numel, &src_numel_check]( diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index 55453264af4..a02030e01d7 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -118,30 +118,22 @@ void perform_topk( } // Perform topk on the queue - if (largest) { - const auto elem_greater = [](const elem_t& x, const elem_t& y) -> bool { - return float_less_than(y.first, x.first); - }; - if (use_partial_sort) { - std::partial_sort(queue, queue + k, queue + dim_size, elem_greater); - } else { - std::nth_element( - queue, queue + k - 1, queue + dim_size, elem_greater); - if (sorted) { - std::sort(queue, queue + k - 1, elem_greater); - } - } + bool (*elem_greater)(const elem_t&, const elem_t&) = + [](const elem_t& x, const elem_t& y) -> bool { + return float_less_than(y.first, x.first); + }; + bool (*elem_less)(const elem_t&, const elem_t&) = + [](const elem_t& x, const elem_t& y) -> bool { + return float_less_than(x.first, y.first); + }; + bool (*cmp)(const elem_t&, const elem_t&) = + largest ? 
elem_greater : elem_less; + if (use_partial_sort) { + std::partial_sort(queue, queue + k, queue + dim_size, cmp); } else { - const auto elem_less = [](const elem_t& x, const elem_t& y) -> bool { - return float_less_than(x.first, y.first); - }; - if (use_partial_sort) { - std::partial_sort(queue, queue + k, queue + dim_size, elem_less); - } else { - std::nth_element(queue, queue + k - 1, queue + dim_size, elem_less); - if (sorted) { - std::sort(queue, queue + k - 1, elem_less); - } + std::nth_element(queue, queue + k - 1, queue + dim_size, cmp); + if (sorted) { + std::sort(queue, queue + k - 1, cmp); } } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index c70f9a75245..f5a29f71256 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -85,6 +85,7 @@ inline void dtype_specialized_elementwise_fn_impl( static_assert( (std::is_same_v> && ...)); + static constexpr auto kNumInputs = sizeof...(inputs); // All inputs must be of type CTYPE_COMPUTE. ET_DCHECK( ((inputs.first->scalar_type() == @@ -104,9 +105,8 @@ inline void dtype_specialized_elementwise_fn_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - std::array - inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; + std::array inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; CTYPE_OUT* const data_out = out.mutable_data_ptr(); @@ -119,11 +119,11 @@ inline void dtype_specialized_elementwise_fn_impl( // small-sized tests will test whether using Vectorized broke our // lambda. 
#ifndef NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #else // NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #endif // NDEBUG - for (const auto input_idx : c10::irange(sizeof...(inputs))) { + for (const auto input_idx : c10::irange(kNumInputs)) { loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; } #ifndef NDEBUG @@ -136,8 +136,8 @@ inline void dtype_specialized_elementwise_fn_impl( // Main vectorized loop. for (auto idx = vectorized_begin; idx < vectorized_end; idx += Vec::size()) { - std::array loaded_vec_inputs{}; - for (const auto input_idx : c10::irange(sizeof...(inputs))) { + std::array loaded_vec_inputs{}; + for (const auto input_idx : c10::irange(kNumInputs)) { loaded_vec_inputs[input_idx] = Vec::loadu(&inputs_data_ptrs[input_idx][idx]); } @@ -148,11 +148,11 @@ inline void dtype_specialized_elementwise_fn_impl( // Scalar epilogue. for (const auto idx : c10::irange(vectorized_end, end)) { #ifndef NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #else // NDEBUG - std::array loaded_inputs{}; + std::array loaded_inputs{}; #endif // NDEBUG - for (const auto input_idx : c10::irange(sizeof...(inputs))) { + for (const auto input_idx : c10::irange(kNumInputs)) { loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; } #ifndef NDEBUG @@ -172,20 +172,20 @@ inline void dtype_specialized_elementwise_fn_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { + std::array inputs_data_ptrs = { inputs.first->template const_data_ptr()...}; CTYPE_OUT* const data_out = out.mutable_data_ptr(); - const auto range = BroadcastIndexesRange< - sizeof...(inputs), - support_noncontiguous_tensors>(out, (*inputs.first)...); + const auto range = + BroadcastIndexesRange( + out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { const auto& indexes = *begin_it; - 
std::array loaded_inputs{}; - for (const auto idx : c10::irange(sizeof...(inputs))) { + std::array loaded_inputs{}; + for (const auto idx : c10::irange(kNumInputs)) { loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]]; } data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs); @@ -229,12 +229,14 @@ inline void apply_elementwise_fn_generic_impl( const Tensor& out, SupportedTensorDtypes out_dtypes, Args... inputs) { + static constexpr auto kNumInputs = sizeof...(inputs); + struct InputInfo { load_to_compute_fn load_to_compute; const char* data_ptr; ssize_t element_size; }; - std::array inputs_info = {(InputInfo{ + std::array inputs_info = {(InputInfo{ internal::get_load_to_compute_fn( ctx, *inputs.first, inputs.second), reinterpret_cast(inputs.first->const_data_ptr()), @@ -252,15 +254,15 @@ inline void apply_elementwise_fn_generic_impl( out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { - const auto range = BroadcastIndexesRange< - sizeof...(inputs), - support_noncontiguous_tensors>(out, (*inputs.first)...); + const auto range = + BroadcastIndexesRange( + out, (*inputs.first)...); auto begin_it = range.begin(); begin_it += begin; for (; (*begin_it)[0] < end; ++begin_it) { const auto& indexes = *begin_it; - std::array loaded_inputs{}; - for (const auto idx : c10::irange(sizeof...(inputs))) { + std::array loaded_inputs{}; + for (const auto idx : c10::irange(kNumInputs)) { const auto& input_info = inputs_info[idx]; loaded_inputs[idx] = input_info.load_to_compute( &input_info From 045187077b81463abb2e96850e2b487a7f3f34bf Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 21 Oct 2025 10:42:52 -0700 Subject: [PATCH 19/19] minor tweaks --- kernels/optimized/cpu/op_bmm.cpp | 4 ++-- kernels/portable/cpu/op_topk.cpp | 3 ++- kernels/portable/cpu/op_view_as_real_copy.cpp | 14 +++++++------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/kernels/optimized/cpu/op_bmm.cpp
b/kernels/optimized/cpu/op_bmm.cpp index 51bfb2dfc3b..171f14de399 100644 --- a/kernels/optimized/cpu/op_bmm.cpp +++ b/kernels/optimized/cpu/op_bmm.cpp @@ -150,10 +150,10 @@ Tensor& opt_bmm_out( ET_KERNEL_CHECK( ctx, check_bmm_out_args(self, mat2, out), InvalidArgument, out); - auto self_type = self.scalar_type(); - static constexpr auto name = "bmm.out"; + auto self_type = self.scalar_type(); + if (executorch::runtime::isComplexType(self_type)) { ET_SWITCH_COMPLEXH_TYPES(self_type, ctx, name, CTYPE, [&]() { bmm_kernel(self, mat2, out); diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index a02030e01d7..3082bc94662 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -117,7 +117,8 @@ void perform_topk( queue[i].second = i; } - // Perform topk on the queue + // Perform topk on the queue, explicit typing for the lambda to satisfy + // msvc compiler. bool (*elem_greater)(const elem_t&, const elem_t&) = [](const elem_t& x, const elem_t& y) -> bool { return float_less_than(y.first, x.first); diff --git a/kernels/portable/cpu/op_view_as_real_copy.cpp b/kernels/portable/cpu/op_view_as_real_copy.cpp index 6df1704e2c8..4461ecb02f8 100644 --- a/kernels/portable/cpu/op_view_as_real_copy.cpp +++ b/kernels/portable/cpu/op_view_as_real_copy.cpp @@ -64,13 +64,13 @@ Tensor& view_as_real_copy_out( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(self, out), InvalidArgument, out); - ET_SWITCH_COMPLEXH_TYPES( - self.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES( - out.scalar_type(), ctx, "view_as_real_copy.out", CTYPE_OUT, [&] { - _to_impl(self, out); - }); - }); + static constexpr auto op_name = "view_as_real_copy.out"; + + ET_SWITCH_COMPLEXH_TYPES(self.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + _to_impl(self, out); + }); + }); return out; }