diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp similarity index 100% rename from backends/cadence/hifi/operators/dequantize_per_tensor.cpp rename to backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index 342c982a076..82fa7502dea 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -56,7 +56,7 @@ int prepare_data( return num_axis_dims; } -Tensor& mean_dim_out( +Tensor& mean_out( RuntimeContext& ctx, const Tensor& in, optional> dim_list, diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp similarity index 100% rename from backends/cadence/hifi/operators/quantize_per_tensor.cpp rename to backends/cadence/hifi/operators/op_quantize_per_tensor.cpp diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/op_quantized_layer_norm.cpp similarity index 100% rename from backends/cadence/hifi/operators/quantized_layer_norm.cpp rename to backends/cadence/hifi/operators/op_quantized_layer_norm.cpp diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp similarity index 100% rename from backends/cadence/hifi/operators/quantized_linear_out.cpp rename to backends/cadence/hifi/operators/op_quantized_linear_out.cpp diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/quantized_relu_out.cpp rename to backends/cadence/hifi/operators/op_quantized_relu_out.cpp index d78e555ad1e..0860109f7c1 100644 --- a/backends/cadence/hifi/operators/quantized_relu_out.cpp +++ 
b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp @@ -45,7 +45,7 @@ void quantized_relu_( } } -void quantized_relu_out( +void quantized_relu_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_zero_point, @@ -100,4 +100,4 @@ void quantized_relu_out( } // namespace native } // namespace HiFi } // namespace impl -} // namespace cadence \ No newline at end of file +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp index d8c4a6d2d80..99cd6ad544e 100644 --- a/backends/cadence/hifi/operators/op_remainder.cpp +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -15,8 +16,6 @@ #include #include -#include "kernels.h" - using executorch::aten::RuntimeContext; using executorch::aten::Scalar; using executorch::aten::ScalarType; diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp index e026afd2c92..5db8ed8425e 100644 --- a/backends/cadence/hifi/operators/op_softmax.cpp +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -8,11 +8,11 @@ #include +#include #include #include #include #include -#include "kernels.h" using executorch::aten::ScalarType; using executorch::aten::Tensor; @@ -24,7 +24,7 @@ namespace impl { namespace HiFi { namespace native { -Tensor& softmax_out( +Tensor& _softmax_out( KernelRuntimeContext& ctx, const Tensor& in, int64_t dim, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 6c671a5f24a..1c2b481410d 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -1,243 +1,70 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -def define_common_targets(): - """Defines targets that should be shared between fbcode and 
xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - # Define build targets for all operators registered in the tables above. - runtime.cxx_library( - name = "quantize_per_tensor", - srcs = [ - "quantize_per_tensor.cpp" - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) +def define_operator(name: str, deps: list[str] | None = None) -> None: + op_name = "op_{}".format(name) - runtime.cxx_library( - name = "dequantize_per_tensor", - srcs = [ - "dequantize_per_tensor.cpp" - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) + # Deps used by all operators. 
+ common_deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ] + if deps == None: + deps = [] runtime.cxx_library( - name = "quantized_layer_norm", - srcs = [ - "quantized_layer_norm.cpp" - ], - exported_headers = ["operators.h"], + name = op_name, + srcs = [op_name + ".cpp"], platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], visibility = [ "//executorch/backends/cadence/...", "@EXECUTORCH_CLIENTS", ], - ) - - runtime.cxx_library( - name = "quantized_linear_out", - srcs = [ - "quantized_linear_out.cpp" - ], + deps = deps + common_deps, exported_headers = ["operators.h"], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_add", - srcs = [ - "op_add.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - 
"//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions", - "//executorch/kernels/portable/cpu/util:dtype_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - - runtime.cxx_library( - name = "op_mul", - srcs = [ - "op_mul.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/kernels/portable/cpu/util:dtype_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_sub", - srcs = [ - "op_sub.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/kernels/portable/cpu/util:dtype_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], ) - runtime.cxx_library( - name = "op_div", - srcs = [ - "op_div.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - 
"//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/kernels/portable/cpu/util:dtype_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) +OPERATORS = [ + "add", + "atan2", + "cat", + "clamp", + "dequantize_per_tensor", + "div", + "full", + "maximum", + "mean", + "minimum", + "mul", + "permute_copy", + "pow", + "quantize_per_tensor", + "quantized_layer_norm", + "quantized_linear_out", + "quantized_relu_out", + "remainder", + "rsqrt", + "sigmoid", + "softmax", + "sub", + "tanh", + "where" +] - runtime.cxx_library( - name = "op_sigmoid", - srcs = [ - "op_sigmoid.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/kernels/portable/cpu/util:dtype_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. 
- runtime.cxx_library( - name = "op_tanh", - srcs = [ - "op_tanh.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ - - runtime.cxx_library( - name = "op_where", - srcs = [ - "op_where.cpp", - ], - platforms = CXX, - deps = [ - "//executorch/kernels/portable/cpu/util:all_deps", - "//executorch/kernels/portable/cpu/pattern:all_deps", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/backends/cadence/hifi/kernels:kernels", - "//executorch/kernels/portable/cpu/util:elementwise_util", - "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) + # Define build targets for all operators registered in the tables above. + for op in OPERATORS: + define_operator(op) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c new file mode 100644 index 00000000000..b069035dc90 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c @@ -0,0 +1,232 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +#include "xa_nnlib_common.h" + +#include + +/* + * Currently only supports up to 5D input tensors. + * 1/2/3/4 D input tensors will be scaled up to 5D. + * For example, 2x3 -> 1x1x1x2x3.
+ */
+
+/*
+ * Permute the dimensions of an 8-bit tensor.
+ *
+ * Output dimension i takes its extent and its stride from input dimension
+ * p_permute_vec[i]; elements are copied from p_inp to p_out in output order.
+ *
+ * p_out          destination buffer; declared __restrict__, so it must not
+ *                overlap p_inp
+ * p_out_shape    num_out_dims extents; each must be > 0 and equal to
+ *                p_inp_shape[p_permute_vec[i]]
+ * p_inp          source buffer
+ * p_inp_shape    num_inp_dims extents; each must be > 0
+ * p_permute_vec  permutation of 0 .. num_inp_dims-1
+ * num_out_dims   must equal num_inp_dims
+ * num_inp_dims   rank of the tensors, 1 .. 5
+ *
+ * Returns 0 once the copy has completed. Argument validation goes through
+ * the XA_NNLIB_ARG_CHK_* macros with -1 as the error code.
+ * NOTE(review): those macros come from xa_nnlib_common.h, which is not
+ * visible here; presumably they return the given code when a check fails -
+ * confirm.
+ */
+WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out
+                    ,const WORD32 *const p_out_shape
+                    ,const WORD8 * __restrict__ p_inp
+                    ,const WORD32 *const p_inp_shape
+                    ,const WORD32 * __restrict__ p_permute_vec
+                    ,WORD32 num_out_dims
+                    ,WORD32 num_inp_dims)
+{
+  /* NULL pointer checks */
+  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
+
+  /* Invalid input checks */
+  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
+  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
+
+  int itr = 0;
+  for(itr=0; itr < num_inp_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
+  }
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
+  }
+
+  /* The permute vector is used below as an index into p_inp_shape and
+   * inp_stride, so every entry must be a valid dimension index and must
+   * appear exactly once. Without this, an out-of-range entry reads outside
+   * p_inp_shape, and a repeated entry can pass the shape check while making
+   * the copy loops read past the end of p_inp. */
+  int dim_seen[5] = {0, 0, 0, 0, 0};
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    int src_dim = p_permute_vec[itr];
+    XA_NNLIB_ARG_CHK_COND(((src_dim < 0) || (src_dim >= num_inp_dims)), -1);
+    /* Guard the array access explicitly so it stays in bounds even if the
+     * argument-check macros are compiled out. */
+    if((src_dim >= 0) && (src_dim < num_inp_dims))
+    {
+      XA_NNLIB_ARG_CHK_COND((dim_seen[src_dim] != 0), -1);
+      dim_seen[src_dim] = 1;
+    }
+  }
+
+  /* Output shape provided must be correct based on input
+   * shape and permute values */
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    int output_dim = p_out_shape[itr];
+    int expected_dim = p_inp_shape[p_permute_vec[itr]];
+    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
+  }
+
+  /* Pointer alignment checks */
+  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
+
+  /* Shift all dim with 1 in the outer part.
+   * This works on local copies of the output shape and permute vector. A
+   * size-1 output dimension is only ever indexed at 0, so moving it does
+   * not change which input element an output position maps to. */
+  int eff_output_shape[5];
+  int eff_permute_vec[5];
+
+  for(int i = 0; i < num_out_dims; i++)
+  {
+    eff_output_shape[i] = p_out_shape[i];
+    eff_permute_vec[i] = p_permute_vec[i];
+  }
+
+  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
+  while(one_i > 0 && non_one_i >=0){
+    /* innermost remaining position whose extent is 1 */
+    while(one_i > 0 && eff_output_shape[one_i]!=1){
+      one_i--;
+    }
+    /* nearest position outside it whose extent is not 1 */
+    non_one_i = one_i;
+    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
+    {
+      non_one_i--;
+    }
+    if(one_i > 0 && non_one_i >=0){
+      int temp;
+      /*swap output_shape*/
+      {
+        temp = eff_output_shape[one_i];
+        eff_output_shape[one_i] = eff_output_shape[non_one_i];
+        eff_output_shape[non_one_i] = temp;
+      }
+      /*swap permute_vec*/
+      {
+        temp = eff_permute_vec[one_i];
+        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
+        eff_permute_vec[non_one_i] = temp;
+      }
+    }
+  }
+
+  /* Promoting lesser dim tensors to 5D tensors.
+   * Also updating the permute_vec and shapes as needed for optimization */
+  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
+
+  /* Check if any inner inp dimension is same in the output.
+   * last_n_same_dim counts the trailing positions where the effective
+   * permute entry equals its own index; counting stops at the first
+   * position (from the inside) that differs. */
+  int last_dim_same = 1, last_n_same_dim = 0;
+  itr = num_inp_dims - 1;
+  while(itr >= 0)
+  {
+    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
+    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
+    itr--;
+  }
+
+  /* Fill the 5D shapes from the innermost slot outwards. While same_count
+   * is positive the extents are multiplied into the same slot, so the
+   * unchanged trailing dimensions collapse into one inner dimension. */
+  int dims_added = 5 - num_inp_dims;
+  itr = num_inp_dims - 1;
+  int same_count = last_n_same_dim;
+  int count = 4;
+  while(itr >= 0)
+  {
+    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
+    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
+    same_count--;
+    itr--;
+    count = (same_count > 0) ? count : count - 1;
+  }
+
+  /* Build the matching 5D permute vector, offsetting the entries for the
+   * added leading dimensions and the collapsed trailing ones. */
+  itr = num_inp_dims - 1;
+  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
+  count = 4;
+  while(itr >= 0)
+  {
+    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
+    same_count--;
+    itr--;
+    count--;
+  }
+
+  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
+  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
+  int inp_stride[5];
+
+  out_dim0 = p_5D_out_shape[0];
+  out_dim1 = p_5D_out_shape[1];
+  out_dim2 = p_5D_out_shape[2];
+  out_dim3 = p_5D_out_shape[3];
+  out_dim4 = p_5D_out_shape[4];
+
+  inp_dim1 = p_5D_inp_shape[1];
+  inp_dim2 = p_5D_inp_shape[2];
+  inp_dim3 = p_5D_inp_shape[3];
+  inp_dim4 = p_5D_inp_shape[4];
+
+  /* Element strides of the 5D input, innermost dimension contiguous */
+  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[2] = inp_dim3*inp_dim4;
+  inp_stride[3] = inp_dim4;
+  inp_stride[4] = 1;
+
+  if(last_n_same_dim)
+  {
+    /* The innermost dimension is unchanged by the permutation, so each
+     * innermost output row is a contiguous run of out_dim4 bytes in the
+     * input and can be copied with one memcpy. */
+    int itr0, itr1, itr2, itr3;
+    WORD8 *p_inp0 = (WORD8*)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+#pragma loop_count min=1
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+#pragma loop_count min=1
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+#pragma loop_count min=1
+          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
+          {
+            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+            memcpy(p_out, p_inp4, out_dim4);
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    /* The innermost dimension moves: gather one byte at a time, stepping
+     * through the input with the stride of the permuted inner dimension. */
+    int itr0, itr1, itr2, itr3, itr4;
+    WORD8 *p_inp0 = (WORD8*)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+          for(itr3 = 0; itr3 < out_dim3; itr3++)
+          {
+            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+            for(itr4 = 0; itr4 < out_dim4; itr4++)
+            {
+              WORD8 d0 = *(p_inp4);
+              p_inp4 += inp_stride[p_5D_permute_vec[4]];
+              *p_out++ = d0;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}