From b7074dc2ee535b52ec04fb82374cdc2f1c4af8c1 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Mon, 9 Sep 2024 13:26:23 -0700
Subject: [PATCH] [ExecuTorch] Make ForcedUnroll usage in bf16 BlasKernel actually work for -Oz builds

Clang is very resistant to inlining under -Oz. For ForcedUnroll to
actually unroll, we need to force-inline the lambda.

Differential Revision: [D62154247](https://our.internmc.facebook.com/intern/diff/D62154247/)

[ghstack-poisoned]
---
 kernels/optimized/blas/BlasKernel.cpp | 4 ++--
 runtime/platform/compiler.h           | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp
index cc619b7061a..7202c8cd472 100644
--- a/kernels/optimized/blas/BlasKernel.cpp
+++ b/kernels/optimized/blas/BlasKernel.cpp
@@ -54,7 +54,7 @@ static_assert(kF32RegistersPerIteration == 1 << kF32RegistersPerIterationShift);
 static inline double reduce(float32x4_t x[kF32RegistersPerIteration]) {
   int offset = kF32RegistersPerIteration;
   utils::ForcedUnroll{}(
-      [&offset, &x](auto idx) {
+      [&offset, &x](auto idx) ET_INLINE_ATTRIBUTE {
         offset /= 2;
         for (int i = 0; i < offset; ++i) {
           x[i] = vaddq_f32(x[i], x[offset + i]);
@@ -115,7 +115,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
     const auto* vec1_ = vec1 + j;
     const auto* vec2_ = vec2 + j;
     utils::ForcedUnroll{}(
-        [vec1_, vec2_, &sum](auto k) {
+        [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE {
           dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k);
         });
   }
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
index 9a8e18c0f1e..b6f7fc8642f 100644
--- a/runtime/platform/compiler.h
+++ b/runtime/platform/compiler.h
@@ -57,6 +57,7 @@
 #define ET_NORETURN [[noreturn]]
 #define ET_NOINLINE __attribute__((noinline))
 #define ET_INLINE __attribute__((always_inline)) inline
+#define ET_INLINE_ATTRIBUTE __attribute__((always_inline))
 #if defined(__GNUC__)