From b7074dc2ee535b52ec04fb82374cdc2f1c4af8c1 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Mon, 9 Sep 2024 13:26:23 -0700
Subject: [PATCH] [ExecuTorch] Make ForcedUnroll usage in bf16 BlasKernel
 actually work for -Oz builds

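Clang is very resistant to inlining under -Oz. For ForcedUnroll to actually unroll, we need to force-inline the lambda passed to it. ET_INLINE cannot be used for this because it also expands to the `inline` keyword, which is not allowed on a lambda, so this adds ET_INLINE_ATTRIBUTE, which expands to just `__attribute__((always_inline))`.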

Differential Revision: [D62154247](https://our.internmc.facebook.com/intern/diff/D62154247/)

[ghstack-poisoned]
---
 kernels/optimized/blas/BlasKernel.cpp | 4 ++--
 runtime/platform/compiler.h           | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp
index cc619b7061a..7202c8cd472 100644
--- a/kernels/optimized/blas/BlasKernel.cpp
+++ b/kernels/optimized/blas/BlasKernel.cpp
@@ -54,7 +54,7 @@ static_assert(kF32RegistersPerIteration == 1 << kF32RegistersPerIterationShift);
 static inline double reduce(float32x4_t x[kF32RegistersPerIteration]) {
   int offset = kF32RegistersPerIteration;
   utils::ForcedUnroll<kF32RegistersPerIterationShift>{}(
-      [&offset, &x](auto idx) {
+      [&offset, &x](auto idx) ET_INLINE_ATTRIBUTE {
         offset /= 2;
         for (int i = 0; i < offset; ++i) {
           x[i] = vaddq_f32(x[i], x[offset + i]);
@@ -115,7 +115,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) {
     const auto* vec1_ = vec1 + j;
     const auto* vec2_ = vec2 + j;
     utils::ForcedUnroll<kF32RegisterPairsPerIteration>{}(
-        [vec1_, vec2_, &sum](auto k) {
+        [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE {
           dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k);
         });
   }
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
index 9a8e18c0f1e..b6f7fc8642f 100644
--- a/runtime/platform/compiler.h
+++ b/runtime/platform/compiler.h
@@ -57,6 +57,7 @@
 #define ET_NORETURN [[noreturn]]
 #define ET_NOINLINE __attribute__((noinline))
 #define ET_INLINE __attribute__((always_inline)) inline
+#define ET_INLINE_ATTRIBUTE __attribute__((always_inline))
 
 #if defined(__GNUC__)