From fea63592667a0922844f79f45de2a33b7dc567e8 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Wed, 20 Aug 2025 11:35:14 -0700
Subject: [PATCH 1/3] migrate most aten ops tests to facto (#13483)

Summary:
**This has to ship as an intermediate step to unblock 3 workstreams on the stack.**

Modify the ATen op tests to ingest FACTO-generated test cases.
- Each test gets 30-50 cases with good coverage of:
  - optimized vs. unoptimized flows
  - dtype switch cases

Known issues:
- The FACTO test class is too big to run on the default "heavyweight" CI, so
  the whole target is currently skipped on CI. It will be re-enabled once the
  skycastle flow is ready.
- FACTO sometimes creates inputs that the kernels do not handle (mainly
  dtypes); exception handling for that will be added.
- TODO comments mark the two ops that FACTO does not work well with.

Reviewed By: manuelcandales, hsharma35

Differential Revision: D79121474
---
 backends/cadence/utils/facto_util.py | 155 ++++++++++++++++++++++++---
 1 file changed, 138 insertions(+), 17 deletions(-)

diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py
index b896f8a8e89..7a7afbac128 100644
--- a/backends/cadence/utils/facto_util.py
+++ b/backends/cadence/utils/facto_util.py
@@ -10,6 +10,8 @@
 from functools import lru_cache
 from typing import List, OrderedDict, Tuple
 
+import facto.specdb.function as fn
+
 import torch
 from facto.inputgen.argtuple.gen import ArgumentTupleGenerator
 from facto.inputgen.specs.model import ConstraintProducer as cp
@@ -22,13 +24,21 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
     tensor_constraints = [
-        cp.Dtype.In(lambda deps: [torch.int, torch.float]),
-        cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]),
+        cp.Dtype.In(
+            lambda deps: [
+                torch.int8,
+                torch.int16,
+                torch.uint8,
+                torch.uint16,
+                torch.float32,
+            ]
+        ),
         cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
         cp.Value.Le(lambda deps, dtype, struct: 2**4),
         cp.Rank.Ge(lambda deps: 1),
         cp.Size.Ge(lambda deps, r, d: 1),
         cp.Size.Le(lambda deps, r, d: 2**9),
+        cp.Rank.Le(lambda deps: 2**3),
     ]
 
     match op_name:
@@ -36,7 +46,6 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
             if index == 0:  # condition
                 tensor_constraints = [
                     cp.Dtype.In(lambda deps: [torch.bool]),
-                    cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**4),
                     cp.Rank.Ge(lambda deps: 1),
@@ -45,19 +54,35 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
                 ]
             else:
                 tensor_constraints = [
-                    cp.Dtype.In(lambda deps: [torch.float, torch.int]),
-                    cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]),
+                    cp.Dtype.In(
+                        lambda deps: [
+                            torch.int8,
+                            torch.int16,
+                            torch.uint8,
+                            torch.uint16,
+                            torch.float32,
+                        ]
+                    ),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**4),
                     cp.Rank.Ge(lambda deps: 1),
                     cp.Size.Ge(lambda deps, r, d: 1),
                     cp.Size.Le(lambda deps, r, d: 2**9),
                 ]
+        case "embedding.default":
+            tensor_constraints = [
+                cp.Dtype.In(lambda deps: [torch.float, torch.int]),
+                cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]),
+                cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
+                cp.Value.Le(lambda deps, dtype, struct: 2**4),
+                cp.Rank.Ge(lambda deps: 1),
+                cp.Size.Ge(lambda deps, r, d: 1),
+                cp.Size.Le(lambda deps, r, d: 2**9),
+            ]
         case "sigmoid.default":
             tensor_constraints.extend(
                 [
-                    cp.Dtype.In(lambda deps: [torch.float]),
-                    cp.Rank.Le(lambda deps: 2**2),
+                    cp.Dtype.In(lambda deps: [torch.float32]),
                     cp.Value.Ge(lambda deps, dtype, struct: -2),
                     cp.Value.Le(lambda deps, dtype, struct: 2),
                 ]
@@ -65,8 +90,7 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
         case "rsqrt.default":
             tensor_constraints.extend(
                 [
-                    cp.Dtype.In(lambda deps: [torch.float]),
-                    cp.Rank.Le(lambda deps: 2**2),
+                    cp.Dtype.In(lambda deps: [torch.float32]),
                     cp.Value.Gt(
                         lambda deps, dtype, struct: 0
                     ),  # only generate real numbers
@@ -76,14 +100,12 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
         case "mean.dim":
             tensor_constraints.extend(
                 [
-                    cp.Dtype.In(lambda deps: [torch.float]),
-                    cp.Rank.Le(lambda deps: 2**2),
+                    cp.Dtype.In(lambda deps: [torch.float32]),
                 ]
             )
         case "exp.default":
             tensor_constraints.extend(
                 [
-                    cp.Rank.Le(lambda deps: 2**3),
                     cp.Value.Ge(lambda deps, dtype, struct: -(2**2)),
                     cp.Value.Le(lambda deps, dtype, struct: 2**2),
                 ]
@@ -96,26 +118,96 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]:
                     cp.Value.Le(lambda deps, dtype, struct: 2),
                 ]
             )
-        case _:
+        case "constant_pad_nd.default":
             tensor_constraints.extend(
                 [
-                    cp.Rank.Le(lambda deps: 2**2),
+                    cp.Dtype.In(lambda deps: [torch.float32]),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                ]
+            )
+        case "avg_pool2d.default":
+            tensor_constraints.extend(
+                [
+                    cp.Rank.Eq(lambda deps: 4),
+                ]
+            )
+        case "bmm.default" | "addmm.default" | "mm.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.Eq(lambda deps: torch.float),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                    cp.Value.Le(lambda deps, dtype, struct: 2**4),
                 ]
             )
+        case "div.Tensor":
+            tensor_constraints.extend(
+                [
+                    cp.Value.Ne(lambda deps, dtype, struct: 0),
+                ]
+            )
+        case "div.Tensor_mode" | "minimum.default":
+            if index == 0:
+                tensor_constraints = [
+                    cp.Dtype.In(lambda deps: [torch.int64, torch.int32, torch.float32]),
+                    cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
+                    cp.Value.Le(lambda deps, dtype, struct: 2**4),
+                    cp.Rank.Ge(lambda deps: 1),
+                    cp.Size.Ge(lambda deps, r, d: 1),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                ]
+            else:
+                tensor_constraints = [
+                    cp.Dtype.In(lambda deps: [torch.int64, torch.int32, torch.float32]),
+                    cp.Value.Ge(lambda deps, dtype, struct: -(2**4)),
+                    cp.Value.Le(lambda deps, dtype, struct: 2**4),
+                    cp.Rank.Ge(lambda deps: 1),
+                    cp.Rank.Eq(lambda deps: deps[0].dim()),
+                    cp.Size.Eq(lambda deps, r, d: fn.safe_size(deps[0], d)),
+                ]
+        case "_native_batch_norm_legit_no_training.default":
+            tensor_constraints.extend(
+                [
+                    cp.Rank.Le(lambda deps: 3),
+                ],
+            )
+        case "reciprocal.default":
+            tensor_constraints = [
+                cp.Value.Ge(lambda deps, dtype, struct: -(2**2)),
+                cp.Value.Le(lambda deps, dtype, struct: 2**2),
+                cp.Size.Le(lambda deps, r, d: 2**3),
+            ]
+        case "_softmax.default":
+            tensor_constraints.extend(
+                [
+                    cp.Dtype.Eq(lambda deps: torch.float32),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                ]
+            )
+        case _:
+            pass
     return tensor_constraints
 
 
 def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]:
     match op_name:
-        case "add.Scalar" | "sub.Scalar" | "mul.Scalar" | "div.Scalar":
+        case (
+            "add.Scalar"
+            | "sub.Scalar"
+            | "mul.Scalar"
+            | "div.Scalar"
+            | "constant_pad_nd.default"
+        ):
+            return [ScalarDtype.int]
+        case "full.default":
             return [ScalarDtype.int]
-
         case _:
             return [ScalarDtype.float, ScalarDtype.int]
 
 
 @lru_cache(maxsize=None)
-def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, str]]]:
+def facto_testcase_gen(  # noqa: C901
+    op_name: str,
+) -> List[Tuple[List[str], OrderedDict[str, str]]]:
     # minimal example to test add.Tensor using FACTO
     spec = SpecDictDB[op_name]
 
@@ -149,6 +241,12 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s
                         cp.Dtype.In(lambda deps: apply_scalar_contraints(op_name)),
                     ]
                 )
+            if in_spec.name == "dtype":  # full.default
+                spec.inspec[index].constraints.extend(
+                    [
+                        cp.Dtype.In(lambda deps: [torch.long, torch.float]),
+                    ]
+                )
         elif in_spec.type.is_tensor():
             spec.inspec[index].constraints.extend(
                 apply_tensor_contraints(op_name, index)
@@ -166,6 +264,29 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s
                     cp.Dtype.In(lambda deps: [torch.bool]),
                 ]
             )
+        elif in_spec.type.is_length_list():
+            spec.inspec[index].constraints.extend(
+                [
+                    cp.Value.Ge(lambda deps, dtype, struct: 0),
+                ]
+            )
+            if op_name == "avg_pool2d.default":
+                spec.inspec[index].constraints.extend(
+                    [
+                        cp.Length.Eq(lambda deps: 2),
+                    ]
+                )
+        elif in_spec.type.is_shape():
+            spec.inspec[index].constraints.extend(
+                [
+                    cp.Rank.Ge(lambda deps: 1),
+                    cp.Rank.Le(lambda deps: 2**2),
+                    cp.Value.Gt(lambda deps, dtype, struct: 0),
+                    cp.Value.Le(lambda deps, dtype, struct: 2**2),
+                    cp.Size.Ge(lambda deps, r, d: 1),
+                    cp.Size.Le(lambda deps, r, d: 2**2),
+                ]
+            )
 
     return [
         (posargs, inkwargs)

From f5f3524c83d2f2ce855c43ae233ec181320f8094 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Wed, 20 Aug 2025 11:35:14 -0700
Subject: [PATCH 2/3] fix MM nullptr from zero bias (#13523)

Summary:
Fixes the following crash:
```
 *Error* Unhandled user exception: LoadProhibitedCause (0x00000000)
```
The HiFi matmul kernel xa_nn_matmul_f32xf32_f32 does not accept a NULL bias
pointer, so mm_out now passes it a zero-initialized bias buffer instead.
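A complementary guard (an illustrative sketch only, not part of this diff)
would also keep a failed temp allocation from turning back into a null
dereference. Here ctx, m, out, and kernels::allocate_temp_memory are the names
used inside mm_out() in this patch; everything else is assumed:

```
// Sketch: the zero-bias workaround with an explicit allocation check.
FLOAT32* __restrict__ p_bias_zero =
    (FLOAT32* __restrict__)kernels::allocate_temp_memory(
        ctx, m * sizeof(FLOAT32));
if (p_bias_zero == nullptr) {
  // Out of temp memory: bail out instead of handing a null bias pointer to
  // xa_nn_matmul_f32xf32_f32 (the source of the LoadProhibitedCause crash).
  return out;
}
for (int i = 0; i < m; i++) {
  p_bias_zero[i] = 0.0f;  // mm has no bias, so feed the kernel zeros
}
```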
Differential Revision: D80487955
---
 backends/cadence/hifi/kernels/kernels.cpp | 13 ++++++++++++-
 backends/cadence/hifi/operators/op_mm.cpp | 16 ++++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp
index bf4a2d143fd..d2cf6dd5057 100644
--- a/backends/cadence/hifi/kernels/kernels.cpp
+++ b/backends/cadence/hifi/kernels/kernels.cpp
@@ -21,8 +21,19 @@ memcpy(void* dst, const void* src, size_t num_bytes) {
 }
 
 void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) {
+  ET_LOG(Info, "Attempting to allocate %zu bytes of temp memory", size);
   Result<void*> temp_mem_res = ctx.allocate_temp(size);
-  return temp_mem_res.ok() ? temp_mem_res.get() : nullptr;
+  if (temp_mem_res.ok()) {
+    void* ptr = temp_mem_res.get();
+    ET_LOG(Info, "Successfully allocated temp memory at %p", ptr);
+    return ptr;
+  } else {
+    ET_LOG(
+        Error,
+        "Failed to allocate temp memory, error: 0x%x",
+        static_cast<uint32_t>(temp_mem_res.error()));
+    return nullptr;
+  }
 }
 
 // Quantize a fp32 value to an int8_t/uint8_t value
diff --git a/backends/cadence/hifi/operators/op_mm.cpp b/backends/cadence/hifi/operators/op_mm.cpp
index abb53a7ad7c..dae466e21ac 100644
--- a/backends/cadence/hifi/operators/op_mm.cpp
+++ b/backends/cadence/hifi/operators/op_mm.cpp
@@ -79,6 +79,17 @@ Tensor& mm_out(
       (WORD32* __restrict__)kernels::allocate_temp_memory(
           ctx, (n * p) * sizeof(WORD32));
 
+  // Allocate zero-initialized bias for matmul function (it doesn't accept
+  // NULL)
+  FLOAT32* __restrict__ p_bias_zero =
+      (FLOAT32* __restrict__)kernels::allocate_temp_memory(
+          ctx, m * sizeof(FLOAT32));
+
+  // Initialize bias to zero since mm operation has no bias
+  for (int i = 0; i < m; i++) {
+    p_bias_zero[i] = 0.0f;
+  }
+
   WORD32 p_inp_shape[2];
   p_inp_shape[0] = n;
   p_inp_shape[1] = p;
@@ -109,11 +120,13 @@ Tensor& mm_out(
   const FLOAT32* __restrict__ p_vec =
       (const FLOAT32* __restrict__)p_o;
 
+  // mm will always be converted to addmm and to linear, and move transpose to
+  // graph
   WORD32 val = xa_nn_matmul_f32xf32_f32(
       p_out,
       p_mat1,
       p_vec,
-      NULL,
+      p_bias_zero,
       rows,
       cols1,
       row_stride1,
@@ -121,7 +134,6 @@ Tensor& mm_out(
       vec_offset,
       out_offset,
       out_stride);
-
   return out;
 }
 

From 3d6b898c7851d3c4effae2efb94b93690b13b148 Mon Sep 17 00:00:00 2001
From: Zonglin Peng
Date: Wed, 20 Aug 2025 11:35:14 -0700
Subject: [PATCH 3/3] Add 1MB temp allocator for executor (#13533)

Summary:
Method::load creates a PlatformMemoryAllocator as a fallback when no temp
allocator is provided, so our KernelRuntimeContext always ends up with the
PlatformMemoryAllocator because no temp memory is set aside for kernels.
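For context, a minimal sketch of how a dedicated temp allocator can be handed
to the executor at Method load time through MemoryManager. The pool size, the
load_with_temp_allocator helper, and the include paths are illustrative
assumptions based on the public ExecuTorch runtime API, not part of this diff:

```
#include <cstdint>

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/memory_manager.h>
#include <executorch/runtime/executor/program.h>

using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Program;

// Assumed 1 MB scratch pool backing temp allocations.
static uint8_t temp_pool[1024 * 1024];

void load_with_temp_allocator(
    Program& program,
    MemoryAllocator& method_allocator,
    HierarchicalAllocator& planned_memory) {
  // Temp allocator backed by the static pool above.
  MemoryAllocator temp_allocator(sizeof(temp_pool), temp_pool);
  // With a temp allocator supplied here, Method::load no longer falls back to
  // the PlatformMemoryAllocator, so ctx.allocate_temp() in kernels draws from
  // this pool instead.
  MemoryManager memory_manager(
      &method_allocator, &planned_memory, &temp_allocator);
  auto method = program.load_method("forward", &memory_manager);
  (void)method;  // error handling elided in this sketch
}
```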
That fallback overrides et_pal_allocate to do the allocation (code pointer:
https://fburl.com/code/216qnnvt).

Differential Revision: D80578578
---
 .../fusion_g3/operators/op_native_layer_norm.cpp |  7 +++++--
 backends/cadence/hifi/kernels/kernels.cpp        | 14 +++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
index 7e71df62d54..11c3edbb6a2 100644
--- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
+++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
@@ -221,12 +221,15 @@ std::tuple<Tensor&, Tensor&, Tensor&> native_layer_norm_out(
     num_elm *= normalized_shape[i];
   }
 
+  constexpr size_t kAlignment =
+      16;  // 16-byte alignment for vectorized operations
+
   float* weight_data;
   if (weight.has_value()) {
     weight_data = weight.value().mutable_data_ptr<float>();
   } else {
     executorch::runtime::Result<void*> temp_mem_weight =
-        ctx.allocate_temp(num_elm * sizeof(float));
+        ctx.allocate_temp(num_elm * sizeof(float), kAlignment);
     weight_data = (float*)(temp_mem_weight.get());
 
     for (int i = 0; i < num_elm; i++) {
@@ -238,7 +241,7 @@ std::tuple<Tensor&, Tensor&, Tensor&> native_layer_norm_out(
     bias_data = bias.value().mutable_data_ptr<float>();
   } else {
     executorch::runtime::Result<void*> temp_mem_bias =
-        ctx.allocate_temp(num_elm * sizeof(float));
+        ctx.allocate_temp(num_elm * sizeof(float), kAlignment);
     bias_data = (float*)(temp_mem_bias.get());
 
     for (int i = 0; i < num_elm; i++) {
diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp
index d2cf6dd5057..98708349fb1 100644
--- a/backends/cadence/hifi/kernels/kernels.cpp
+++ b/backends/cadence/hifi/kernels/kernels.cpp
@@ -21,11 +21,19 @@ memcpy(void* dst, const void* src, size_t num_bytes) {
 }
 
 void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) {
-  ET_LOG(Info, "Attempting to allocate %zu bytes of temp memory", size);
-  Result<void*> temp_mem_res = ctx.allocate_temp(size);
+  constexpr size_t kAlignment =
+      16;  // 16-byte alignment for vectorized operations
+  ET_LOG(
+      Info,
+      "Attempting to allocate %zu bytes of temp memory (16-byte aligned)",
+      size);
+  Result<void*> temp_mem_res = ctx.allocate_temp(size, kAlignment);
   if (temp_mem_res.ok()) {
     void* ptr = temp_mem_res.get();
-    ET_LOG(Info, "Successfully allocated temp memory at %p", ptr);
+    ET_LOG(
+        Info,
+        "Successfully allocated temp memory at %p (16-byte aligned)",
+        ptr);
     return ptr;
   } else {
     ET_LOG(