diff --git a/kernels/optimized/blas/BlasKernel.cpp b/kernels/optimized/blas/BlasKernel.cpp index cc619b7061a..7202c8cd472 100644 --- a/kernels/optimized/blas/BlasKernel.cpp +++ b/kernels/optimized/blas/BlasKernel.cpp @@ -54,7 +54,7 @@ static_assert(kF32RegistersPerIteration == 1 << kF32RegistersPerIterationShift); static inline double reduce(float32x4_t x[kF32RegistersPerIteration]) { int offset = kF32RegistersPerIteration; utils::ForcedUnroll{}( - [&offset, &x](auto idx) { + [&offset, &x](auto idx) ET_INLINE_ATTRIBUTE { offset /= 2; for (int i = 0; i < offset; ++i) { x[i] = vaddq_f32(x[i], x[offset + i]); @@ -115,7 +115,7 @@ float dot_with_fp32_arith(const T* vec1, const T* vec2, int64_t len) { const auto* vec1_ = vec1 + j; const auto* vec2_ = vec2 + j; utils::ForcedUnroll{}( - [vec1_, vec2_, &sum](auto k) { + [vec1_, vec2_, &sum](auto k) ET_INLINE_ATTRIBUTE { dot_with_fp32_arith_main_inner_loop(vec1_, vec2_, sum, k); }); } diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index 9a8e18c0f1e..b6f7fc8642f 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -57,6 +57,7 @@ #define ET_NORETURN [[noreturn]] #define ET_NOINLINE __attribute__((noinline)) #define ET_INLINE __attribute__((always_inline)) inline +#define ET_INLINE_ATTRIBUTE __attribute__((always_inline)) #if defined(__GNUC__)