diff --git a/.gitmodules b/.gitmodules index 58f2133ed67..afae765e2b8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -66,7 +66,7 @@ url = https://github.com/pybind/pybind11.git [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"] path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3 - url = https://github.com/foss-xtensa/nnlib-FusionG3/ + url = https://github.com/foss-xtensa/nnlib-FusionG3.git [submodule "third-party/ao"] path = third-party/ao url = https://github.com/pytorch/ao.git diff --git a/backends/cadence/aot/functions_fusion_g3.yaml b/backends/cadence/aot/functions_fusion_g3.yaml index f1f934b9701..5ca05544806 100644 --- a/backends/cadence/aot/functions_fusion_g3.yaml +++ b/backends/cadence/aot/functions_fusion_g3.yaml @@ -50,12 +50,12 @@ - op: div.out kernels: - arg_meta: null - kernel_name: torch::executor::div_out + kernel_name: cadence::impl::G3::div_out - op: div.out_mode kernels: - arg_meta: null - kernel_name: torch::executor::div_out_mode + kernel_name: cadence::impl::G3::div_out_mode - op: embedding.out kernels: @@ -71,7 +71,6 @@ kernels: - arg_meta: null kernel_name: cadence::impl::G3::mul_out - - op: mul.Scalar_out kernels: - arg_meta: null @@ -80,7 +79,7 @@ - op: permute_copy.out kernels: - arg_meta: null - kernel_name: torch::executor::permute_copy_out + kernel_name: cadence::impl::G3::permute_copy_out - op: sigmoid.out kernels: @@ -90,7 +89,7 @@ - op: slice_copy.Tensor_out kernels: - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out + kernel_name: cadence::impl::G3::slice_copy_Tensor_out - op: split_with_sizes_copy.out kernels: @@ -100,7 +99,12 @@ - op: sub.out kernels: - arg_meta: null - kernel_name: torch::executor::sub_out + kernel_name: cadence::impl::G3::sub_out + +- op: sub.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::sub_scalar_out - op: view_copy.out kernels: @@ -117,6 +121,16 @@ - arg_meta: null kernel_name: cadence::impl::G3::native_layer_norm_out +- op: mean.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::mean_dim_out + +- op: exp.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::exp_out + # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
variants: function diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt index 704b4aa741a..cac16bddc50 100644 --- a/backends/cadence/fusion_g3/operators/CMakeLists.txt +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -36,6 +36,12 @@ set(_aten_ops__srcs "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_sub.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_div.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" @@ -51,6 +57,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index f40fcc973b0..d51fee5338f 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -39,6 +39,7 @@ Tensor& add_out( ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, @@ -62,12 +63,12 @@ Tensor& add_out( torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, out); +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); - // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "add.out"; int kTensorDimensionLimit = 5; @@ -253,6 +254,7 @@ Tensor& add_scalar_out( torch::executor::native::utils::promote_type_with_scalar( a.scalar_type(), b); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, @@ -276,7 +278,7 @@ Tensor& add_scalar_out( executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); - +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp index f0f327c024b..74fd96a2120 100644 --- a/backends/cadence/fusion_g3/operators/op_cat.cpp +++ b/backends/cadence/fusion_g3/operators/op_cat.cpp @@ -6,13 +6,18 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include + #include #include +#include #include #include +using ::executorch::aten::ArrayRef; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -23,7 +28,6 @@ using ::executorch::runtime::KernelRuntimeContext; * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Uint = 23 }; namespace cadence { namespace impl { @@ -32,20 +36,22 @@ namespace native { Tensor& cat_out( KernelRuntimeContext& ctx, - exec_aten::ArrayRef tensors, + ArrayRef tensors, int64_t dim, Tensor& out) { if (dim < 0) { dim += out.dim(); } + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_cat_args(tensors, dim, out), InvalidArgument, out); - int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; torch::executor::get_cat_out_target_size( @@ -57,6 +63,20 @@ Tensor& cat_out( out, {expected_out_size, expected_out_dim}) == Error::Ok, InvalidArgument, out); +#endif + // Special handling when all inputs are 1D-empty tensors for aten + // consistency In that case, just return an 1D-empty tensor without checking + // dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } const signed char* inp_tensors[tensors.size()]; const int* inp_tensors_shapes[tensors.size()]; @@ -64,7 +84,7 @@ Tensor& cat_out( int inp_shapes_size[tensors.size()]; int temp_sizes[tensors.size()][kTensorDimensionLimit]; - exec_aten::ArrayRef temp_size; + ArrayRef temp_size; for (int i = 0; i < tensors.size(); i++) { inp_tensors[i] = tensors[i].const_data_ptr(); @@ -79,55 +99,23 @@ Tensor& cat_out( signed char* out_data = out.mutable_data_ptr(); - const exec_aten::ArrayRef out_size = out.sizes(); + const ArrayRef out_size = out.sizes(); int out_shapes[kTensorDimensionLimit]; for (int i = 0; i < out_size.size(); i++) // output shapes { out_shapes[i] = out_size[i]; } - if (out.scalar_type() == ScalarType::Int) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(int)); - } else if (out.scalar_type() == ScalarType::Short) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(short)); - } else if (out.scalar_type() == ScalarType::Char) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(char)); - } else if (out.scalar_type() == (ScalarType)Uint) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(int)); - } else if (out.scalar_type() == (ScalarType)Ushort) { - xa_nn_cat( + if ((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_cat, out_data, out_shapes, inp_tensors, @@ -135,32 +123,8 @@ Tensor& cat_out( inp_shapes_size[0], tensors.size(), (int)dim, - 
sizeof(short)); - } else if (out.scalar_type() == ScalarType::Byte) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(char)); - + get_element_size(out.scalar_type())); } else { - // Special handling when all inputs are 1D-empty tensors for aten - // consistency In that case, just return an 1D-empty tensor without checking - // dim - bool all_1d_empty = true; - for (size_t i = 0; i < tensors.size(); ++i) { - if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { - all_1d_empty = false; - break; - } - } - if (all_1d_empty) { - return out; - } const size_t outer = executorch::runtime::getLeadingDims(out, dim); const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim); const size_t ninputs = tensors.size(); diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp index ed5b3125ac4..cff50f2a90b 100644 --- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -6,30 +6,32 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include #include #include +#include #include #include -using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; using ::executorch::runtime::KernelRuntimeContext; template -using optional = exec_aten::optional; +using optional = ::executorch::aten::optional; /* ScalarType in Executorch do not have support for below data types. * So, creating a placeholder for these data types. Once, ScalarTypes is * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 }; +enum datatype { Bits4u = 21, Bits4 = 22 }; /** * For an input tensor, use the scale and zero_point arguments to quantize it. 
@@ -49,14 +51,13 @@ void check_dequantize_per_tensor_args( int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional& out_dtype, + ::executorch::aten::optional& out_dtype, Tensor& out) { ET_CHECK_MSG( input.scalar_type() == ScalarType::Byte || input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::UInt16 || input.scalar_type() == ScalarType::Short || - input.scalar_type() == (ScalarType)Ushort || input.scalar_type() == (ScalarType)Bits4 || input.scalar_type() == (ScalarType)Bits4u || input.scalar_type() == ScalarType::Int, @@ -85,14 +86,16 @@ void check_dequantize_per_tensor_args( } // namespace /* Local function which calls the kernels based on the input datatype */ -void dequantize_impl( +Tensor& dequantize_impl( + KernelRuntimeContext& ctx, Tensor& out, const Tensor& input, float* scale_data, int* zero_point_data, int* axis, - exec_aten::optional out_dtype) { - const exec_aten::ArrayRef input_size = input.sizes(); + ::executorch::aten::optional out_dtype) { + const ::executorch::aten::ArrayRef input_size = + input.sizes(); int kTensorDimensionLimit = 5; @@ -125,7 +128,10 @@ void dequantize_impl( if (is_asym_dequant) { if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym8u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym8u_f32, out_data, input_data, inp_shape, @@ -135,7 +141,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym8_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym8_f32, out_data, input_data, inp_shape, @@ -143,9 +152,12 @@ void dequantize_impl( axis, zero_point_data, scale_data); - } else if (input.scalar_type() == (ScalarType)Ushort) { + } else if (input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym16u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym16u_f32, out_data, input_data, inp_shape, @@ -155,7 +167,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym16_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym16_f32, out_data, input_data, inp_shape, @@ -165,7 +180,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == (ScalarType)Bits4u) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym4u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym4u_f32, out_data, input_data, inp_shape, @@ -175,7 +193,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == (ScalarType)Bits4) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym4_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym4_f32, out_data, input_data, inp_shape, @@ -233,8 +254,9 @@ void dequantize_impl( } } - exec_aten::optional> optional_dim_list{ - exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual dequantization logic // input, out are the input and output tensors @@ -318,28 +340,76 @@ void dequantize_impl( } else { if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym8u_f32( - 
out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym8u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym8_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); - } else if (input.scalar_type() == (ScalarType)Ushort) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym8_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); + } else if (input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym16u_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym16u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym16_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym16_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == (ScalarType)Bits4u) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym4u_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym4u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == (ScalarType)Bits4) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym4_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym4_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else { if (axis == NULL) { // calculate the dequantized output, cast scale to float to match fbgemm @@ -390,8 +460,9 @@ void dequantize_impl( } } - exec_aten::optional> optional_dim_list{ - exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual dequantization logic // input, out are the input and output tensors @@ -473,6 +544,7 @@ void dequantize_impl( } } } + return out; } /** @@ -485,14 +557,16 @@ void dequantize_impl( * info. 
*/ Tensor& dequantize_per_tensor_out( + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { +#ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( err == torch::executor::Error::Ok, @@ -500,24 +574,28 @@ Tensor& dequantize_per_tensor_out( check_dequantize_per_tensor_args( input, quant_min, quant_max, dtype, out_dtype, out); +#endif float scale_data = (float)scale; int zero_point_data = (int)zero_point; - dequantize_impl(out, input, &scale_data, &zero_point_data, NULL, out_dtype); + dequantize_impl( + context, out, input, &scale_data, &zero_point_data, NULL, out_dtype); return out; } Tensor& dequantize_per_tensor_tensor_args_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_point, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { +#ifdef OP_ARG_CHECK ET_CHECK_MSG( scale.scalar_type() == ScalarType::Double, "Expected scale to be Double tensor received: %" PRId8, @@ -534,8 +612,10 @@ Tensor& dequantize_per_tensor_tensor_args_out( zero_point.numel() == 1, "Exepcted zero_point to only have one element received: %zd", ssize_t(zero_point.numel())); +#endif dequantize_per_tensor_out( + context, input, scale.const_data_ptr()[0], zero_point.const_data_ptr()[0], @@ -549,15 +629,24 @@ Tensor& dequantize_per_tensor_tensor_args_out( } Tensor& dequantize_per_channel_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, - const exec_aten::optional& opt_zero_points, + const ::executorch::aten::optional& opt_zero_points, int64_t axis, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { + if (axis < 0) { + axis += executorch::runtime::nonzero_dim(input); + } + /* if the arguments are passed properly to the operator disable the Macro - + * "OP_ARG_CHECK" if not the case, enable the Macro - "OP_ARG_CHECK", to have + * the checks only in operator level(As there are no checks in kernel). 
+ */ +#ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); // normalize axis @@ -567,10 +656,6 @@ Tensor& dequantize_per_channel_out( ssize_t(axis), ssize_t(input.dim())); - if (axis < 0) { - axis += executorch::runtime::nonzero_dim(input); - } - ET_CHECK_MSG( err == torch::executor::Error::Ok, "Failed to resize out Tensor in dequantize_per_channel_out"); @@ -599,9 +684,9 @@ Tensor& dequantize_per_channel_out( ssize_t(zero_point.numel()), ssize_t(input.size(axis))); } - check_dequantize_per_tensor_args( input, quant_min, quant_max, dtype, out_dtype, out); +#endif int* axis_ptr = (int*)&axis; @@ -622,80 +707,14 @@ Tensor& dequantize_per_channel_out( for (int i = 0; i < scale.numel(); i++) { scale_data[i] = (float)scale_dt[i]; } - dequantize_impl(out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype); + dequantize_impl( + context, out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype); return out; } -Tensor& dequantize_per_channel_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const exec_aten::optional& opt_zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - exec_aten::optional out_dtype, - Tensor& out) { - (void)context; - torch::executor::Error err = resize_tensor(out, input.sizes()); - ET_CHECK_MSG( - err == torch::executor::Error::Ok, - "Failed to resize out Tensor in dequantize_per_channel_out"); - return dequantize_per_channel_out( - input, - scale, - opt_zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - out); -} - -Tensor& dequantize_per_tensor_out( - KernelRuntimeContext& context, - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - // TODO(larryliu): Add a context arg to the real op function and remove this - // wrapper - (void)context; - return dequantize_per_tensor_out( - input, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out.scalar_type(), - out); -} - -Tensor& dequantize_per_tensor_tensor_args_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - exec_aten::optional out_dtype, - Tensor& out) { - // TODO(larryliu): Add a context arg to the real op function and remove this - // wrapper - (void)context; - return dequantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - Tensor& dequantize_per_token_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_points, @@ -711,18 +730,18 @@ Tensor& dequantize_per_token_out( } // This unfortunate change is needed because we compile op_quantize for aten // mode as well - std::array input_sizes; - input_sizes[0] = static_cast(num_channels); + std::array<::executorch::aten::SizesType, 2> input_sizes; + input_sizes[0] = static_cast<::executorch::aten::SizesType>(num_channels); input_sizes[1] = - static_cast(input.size(input.dim() - 1)); + static_cast<::executorch::aten::SizesType>(input.size(input.dim() - 1)); #ifdef USE_ATEN_LIB Tensor reshaped_input = at::from_blob( input.mutable_data_ptr(), input_sizes, at::TensorOptions(input.scalar_type())); #else - std::array input_dim_order{0, 1}; - std::array input_strides; + std::array<::executorch::aten::DimOrderType, 2> input_dim_order{0, 1}; + std::array<::executorch::aten::StridesType, 2> input_strides; 
executorch::runtime::dim_order_to_stride_nocheck( input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); void* input_data = input.mutable_data_ptr(); @@ -743,6 +762,7 @@ Tensor& dequantize_per_token_out( #endif return dequantize_per_channel_out( + context, reshaped_input, scale, zero_points, @@ -754,21 +774,6 @@ Tensor& dequantize_per_token_out( out); } -Tensor& dequantize_per_token_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out) { - (void)context; - return dequantize_per_token_out( - input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, out); -} - } // namespace native } // namespace G3 } // namespace impl diff --git a/backends/cadence/fusion_g3/operators/op_div.cpp b/backends/cadence/fusion_g3/operators/op_div.cpp new file mode 100644 index 00000000000..1461f643a84 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_div.cpp @@ -0,0 +1,674 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +using ::executorch::aten::optional; +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::string_view; +using ::executorch::aten::Tensor; +using ::executorch::runtime::canCast; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +ScalarType get_common_type(ScalarType a_type, ScalarType b_type) { + if (executorch::runtime::isFloatingType(a_type) && + executorch::runtime::isFloatingType(b_type)) { + return executorch::runtime::promoteTypes(a_type, b_type); + } else if (executorch::runtime::isFloatingType(a_type)) { + return a_type; + } else if (executorch::runtime::isFloatingType(b_type)) { + return b_type; + } + return ScalarType::Float; +} + +} // namespace + +Tensor& div_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.out"; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_f32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_32x32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_32x32_f32, + out_data, + inp1_data, + inp2_data, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + int mode = 0; + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + mode, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + mode, + out.numel()); + } + } else { + ScalarType common_type = get_common_type(a.scalar_type(), b.scalar_type()); + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name>( + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a / val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16); + }); + } + + return out; +} + +Tensor& div_out_mode( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + optional mode, + Tensor& out) { + if (!mode.has_value()) { + return div_out(ctx, a, b, out); + } + + auto mode_val = mode.value(); + + // Check mode + ET_KERNEL_CHECK( + ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out); + + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + out); + + 
// Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.out_mode"; + + const bool mode_is_trunc = mode_val == "trunc"; + bool div_by_zero_error = false; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + int mode_value = (mode_val == "trunc") ? 1 : 2; + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + mode_value, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_32x32_32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode_value, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_32x32_32, + out_data, + inp1_data, + inp2_data, + mode_value, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + mode_value, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode_value, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + mode_value, + out.numel()); + } + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils:: + apply_bitensor_elementwise_fn( + [mode_is_trunc, &div_by_zero_error]( + const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + if (executorch::runtime::is_integral_type< + CTYPE_COMPUTE, + /*includeBool=*/true>::value) { + if (val_b == 0) { + div_by_zero_error = 
true; + return static_cast(0); + } + } + CTYPE_COMPUTE value = val_a / val_b; + if (mode_is_trunc) { + value = std::trunc(value); + } else { + // We established above that the mode is either trunc or + // floor, so it must be floor. + value = torch::executor::native::utils::floor_divide( + val_a, val_b); + } + return value; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Div mode operation encountered integer division by zero"); + + return out; +} + +Tensor& div_scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.Scalar_out"; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_f32, + out_data, + inp1_data, + inp2_val, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + int mode = 0; + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + mode, + out.numel()); + } else { + ScalarType common_type = + executorch::runtime::isFloatingType(a.scalar_type()) + ? 
a.scalar_type() + : ScalarType::Float; + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +Tensor& div_scalar_mode_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + optional mode, + Tensor& out) { + if (!mode.has_value()) { + return div_scalar_out(ctx, a, b, out); + } + + auto mode_val = mode.value(); + + // Check mode + ET_KERNEL_CHECK( + ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out); + + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + out); + + // Check for intergral division by zero + ET_KERNEL_CHECK_MSG( + ctx, + !(executorch::runtime::isIntegralType(common_type, true) && + torch::executor::native::utils::scalar_to(b) == 0), + InvalidArgument, + out, + "Div mode operation encountered integer division by zero"); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + const bool mode_is_trunc = mode_val == "trunc"; + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.Scalar_mode_out"; + + int mode_value = (mode_val == "trunc") ? 
1 : 2; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + int* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + mode_value, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + mode_value, + out.numel()); + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b, mode_is_trunc](const CTYPE_COMPUTE val_a) { + CTYPE_COMPUTE value = val_a / val_b; + if (mode_is_trunc) { + value = std::trunc(value); + } else { + value = torch::executor::native::utils::floor_divide( + val_a, val_b); + } + return value; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_exp.cpp b/backends/cadence/fusion_g3/operators/op_exp.cpp new file mode 100644 index 00000000000..3021a0d4e8a --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_exp.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_floating_type(out), + InvalidArgument, + out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + if (out.scalar_type() == ScalarType::Float) { + float* const out_data = out.mutable_data_ptr(); + const float* const in_data = in.const_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel()); + + return out; + } else { + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out); + } +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp new file mode 100644 index 00000000000..be866b2f51c --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_mean.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::optional; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +int prepare_data( + const Tensor& in, + Tensor& out, + optional> dim_list, + int* inp_shape, + int* out_shape, + int* p_axis, + int num_inp_dims, + int num_out_dims) { + for (int i = 0; i < num_inp_dims; i++) { + inp_shape[i] = in.size(i); + } + + for (int i = 0; i < num_out_dims; i++) { + out_shape[i] = out.size(i); + } + + int num_axis_dims = 0; + for (const auto& d : dim_list.value()) { + if (d < 0) { + p_axis[num_axis_dims] = num_inp_dims + d; + num_axis_dims++; + } else { + p_axis[num_axis_dims] = d; + num_axis_dims++; + } + } + + return num_axis_dims; +} + +Tensor& mean_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + bool keepdim, + optional dtype, + Tensor& out) { + (void)ctx; + +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_default_dim_order(in), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_reduction_out(in, dim_list, keepdim, out) == + Error::Ok, + InvalidArgument, + out); +#endif + + constexpr int kNnlibMaxDim = 5; + + bool optimized = 1; + + if (out.scalar_type() != ScalarType::Float) + optimized = 0; + + if (in.dim() > kNnlibMaxDim) + optimized = 0; + + if (optimized) { + float* __restrict__ p_out = out.mutable_data_ptr(); + const float* __restrict__ p_inp = + (const float* __restrict__)in.const_data_ptr(); + + int num_elm = in.numel(); + + int num_inp_dims = in.dim(); + int num_out_dims = out.dim(); + + int inp_shape[kNnlibMaxDim]; + int out_shape[kNnlibMaxDim]; + int p_axis[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + p_axis[i] = 1; + } + + int num_axis_dims = prepare_data( + in, + out, + dim_list, + inp_shape, + out_shape, + p_axis, + num_inp_dims, + num_out_dims); + + if (num_axis_dims == num_inp_dims) { + num_out_dims = 1; + out_shape[0] = 1; + } + + int inp_shape_max = inp_shape[p_axis[0]]; + for (int i = 1; i < num_axis_dims; i++) { + if (inp_shape[p_axis[i]] > inp_shape_max) { + inp_shape_max = inp_shape[p_axis[i]]; + } + } + + int scratch_size = in.numel() / inp_shape_max; + + executorch::runtime::Result temp_mem = + ctx.allocate_temp(scratch_size * sizeof(float)); + + void* __restrict__ p_scratch_in = (void* __restrict__)(temp_mem.get()); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_mean_f32_f32, + p_out, + out_shape, + num_out_dims, + p_inp, + inp_shape, + num_inp_dims, + p_axis, + num_axis_dims, + p_scratch_in); + } else { + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES( + out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = + torch::executor::get_reduced_dim_product(in, dim_list); + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = torch::executor:: + map_reduce_over_dim_list( + [](CTYPE_IN v) { 
return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { + return acc + outv; + }, + in, + dim_list, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); + } + }); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 840cb16c7cf..93b4c5a992c 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -6,8 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include +#include #include #include #include @@ -34,6 +37,7 @@ Tensor& mul_out( ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out); @@ -51,6 +55,7 @@ Tensor& mul_out( torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, out); +#endif // Compute Dtype ScalarType compute_type = @@ -58,7 +63,6 @@ Tensor& mul_out( // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "mul.out"; - int kTensorDimensionLimit = 5; int inp1_shape[kTensorDimensionLimit]; @@ -111,13 +115,28 @@ Tensor& mul_out( int* const out_data = out.mutable_data_ptr(); if (a.numel() == 1) { - xa_nn_elm_mul_scalar_32x32_32( - out_data, inp2_data, inp1_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp2_data, + inp1_data[0], + out.numel()); } else if (b.numel() == 1) { - xa_nn_elm_mul_scalar_32x32_32( - out_data, inp1_data, inp2_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); } else if (broadcast) { - xa_nn_elm_mul_broadcast_5D_32x32_32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_broadcast_5D_32x32_32, out_data, out_shape, inp1_data, @@ -126,7 +145,14 @@ Tensor& mul_out( inp2_shape, max_dim); } else { - xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_32x32_32, + out_data, + inp1_data, + inp2_data, + out.numel()); } } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); @@ -134,13 +160,28 @@ Tensor& mul_out( float* const out_data = out.mutable_data_ptr(); if (a.numel() == 1) { - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp2_data, inp1_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp2_data, + inp1_data[0], + out.numel()); } else if (b.numel() == 1) { - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp1_data, inp2_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); } else if (broadcast) { - xa_nn_elm_mul_broadcast_5D_f32xf32_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_broadcast_5D_f32xf32_f32, out_data, out_shape, inp1_data, @@ -149,7 +190,14 @@ Tensor& mul_out( inp2_shape, max_dim); } else { - xa_nn_elm_mul_f32xf32_f32(out_data, inp1_data, inp2_data, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + out.numel()); } } else { ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { @@ -181,6 +229,7 @@ 
Tensor& mul_scalar_out( torch::executor::native::utils::promote_type_with_scalar( a.scalar_type(), b); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); @@ -194,29 +243,41 @@ Tensor& mul_scalar_out( // Resize ET_KERNEL_CHECK( ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); - +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "mul.Scalar_out"; - if (compute_type == ScalarType::Int) { const int* const inp1_data = a.const_data_ptr(); int inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); int* const out_data = out.mutable_data_ptr(); - xa_nn_elm_mul_scalar_32x32_32(out_data, inp1_data, inp2_val, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + out.numel()); } else if (compute_type == ScalarType::Float) { const float* const inp1_data = a.const_data_ptr(); float inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); float* const out_data = out.mutable_data_ptr(); - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp1_data, inp2_val, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + out.numel()); } else { ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = @@ -232,7 +293,6 @@ Tensor& mul_scalar_out( SAME_AS_COMMON); }); } - return out; } diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp index a5fbe31eee5..9857bbce377 100644 --- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp +++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp @@ -6,16 +6,20 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include + #include #include #include +#include #include #include #include using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -32,8 +36,8 @@ template void layer_norm( const Tensor& input, IntArrayRef normalized_shape, - const exec_aten::optional& weight, - const exec_aten::optional& bias, + const optional& weight, + const optional& bias, CTYPE eps, Tensor& out, Tensor& mean, @@ -109,8 +113,8 @@ std::tuple native_layer_norm_out( KernelRuntimeContext& ctx, const Tensor& input, IntArrayRef normalized_shape, - const exec_aten::optional& weight, - const exec_aten::optional& bias, + const optional& weight, + const optional& bias, double eps, Tensor& out, Tensor& mean_out, @@ -118,7 +122,9 @@ std::tuple native_layer_norm_out( (void)ctx; std::tuple ret_val(out, mean_out, rstd_out); + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_layer_norm_args( @@ -156,7 +162,7 @@ std::tuple native_layer_norm_out( InvalidArgument, ret_val); } - int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; size_t mean_rstd_ndim = 0; torch::executor::get_layer_norm_out_target_size( @@ -181,6 +187,7 @@ std::tuple native_layer_norm_out( rstd_out, {mean_rstd_sizes, mean_rstd_ndim}) == Error::Ok, InvalidArgument, ret_val); +#endif int input_shape[kTensorDimensionLimit]; for (int i = 0; i < input.dim(); i++) { @@ -218,7 +225,10 @@ std::tuple native_layer_norm_out( } } - xa_nn_native_layer_norm_f32_f32( + XT_KERNEL_CHECK( + ctx, + ret_val, + xa_nn_native_layer_norm_f32_f32, out_data, mean_data, rstd_data, diff --git a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp new file mode 100644 index 00000000000..23c2d1e5fbd --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::SizesType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +void increment_coordinate_permuted( + const Tensor& tensor, + size_t* const coordinate, + IntArrayRef dims) { + for (int i = dims.size() - 1; i >= 0; i--) { + size_t d = dims[i] >= 0 ? 
dims[i] : dims[i] + tensor.dim(); + coordinate[d]++; + if (coordinate[d] == tensor.size(d)) { + coordinate[d] = 0; + } else { + return; + } + } +} + +} // namespace + +Tensor& permute_copy_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dims, + Tensor& out) { + (void)ctx; + int kTensorDimensionLimit = 5; + /* if the arguments are passed properly to the operator disable the Macro - + * "OP_ARG_CHECK" if not the case, enable the Macro - "OP_ARG_CHECK", to have + * the checks only in operator level(As there are no checks in kernel). + */ +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_permute_copy_args(in, dims, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + torch::executor::get_permute_copy_out_target_size( + in, dims, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor( + out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); +#endif + + const ArrayRef in_size = in.sizes(); + const ArrayRef out_size = out.sizes(); + + int inp_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for (auto i = 0; i < in_size.size(); i++) { + inp_shape[i] = in_size[i]; + } + + for (auto i = 0; i < out_size.size(); i++) { + out_shape[i] = out_size[i]; + } + + int permute_vec[in.dim()]; + for (int i = 0; i < in.dim(); i++) { + permute_vec[i] = (int)dims[i]; + } + signed char* out_data = out.mutable_data_ptr(); + const signed char* const inp_data = in.const_data_ptr(); + + if (((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) && + (in.dim() <= 5)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_permute, + out_data, + out_shape, + inp_data, + inp_shape, + permute_vec, + in.dim(), + get_element_size(out.scalar_type())); + } else { + const auto in_type = out.scalar_type(); + size_t in_coord[5] = {0}; + size_t trailing_dims_memo[kTensorDimensionLimit]; + executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo); + // in and out must be the same dtype + ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + for (size_t i = 0; i < out.numel(); ++i) { + out_data[i] = + in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo( + in, in_coord, trailing_dims_memo)]; + increment_coordinate_permuted(in, in_coord, dims); + } + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp index fc206b67cd6..8237c3c266c 100644 --- a/backends/cadence/fusion_g3/operators/op_quantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp @@ -6,17 +6,18 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include + #include #include #include #include +#include #include #include -using ::executorch::aten::ArrayRef; -using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -27,7 +28,7 @@ using ::executorch::runtime::KernelRuntimeContext; * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 }; +enum datatype { Bits4u = 21, Bits4 = 22 }; /** * For an input tensor, use the scale and zero_point arguments to quantize it. @@ -78,9 +79,6 @@ void check_quantize_per_tensor_args( } else if (dtype == ScalarType::Short) { quant_min_lower_bound = std::numeric_limits::min(); quant_max_upper_bound = std::numeric_limits::max(); - } else if (dtype == (ScalarType)Ushort) { - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); } else if (dtype == (ScalarType)Bits4u) { quant_min_lower_bound = std::numeric_limits::min(); quant_max_upper_bound = std::numeric_limits::max(); @@ -137,7 +135,8 @@ T quantize_val( } /* Local function which calls the kernels based on the output datatype */ -void quantize_impl( +Tensor& quantize_impl( + KernelRuntimeContext& ctx, Tensor& out, const Tensor& input, float* scale_data, @@ -145,7 +144,8 @@ void quantize_impl( int* axis, int quant_min, int quant_max) { - const ArrayRef input_size = input.sizes(); + const ::executorch::aten::ArrayRef input_size = + input.sizes(); int kTensorDimensionLimit = 5; @@ -179,7 +179,10 @@ void quantize_impl( if (is_asym_quant) { if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym8u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym8u, out_data, input_data, inp_shape, @@ -191,7 +194,11 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym8( + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym8, out_data, input_data, inp_shape, @@ -201,9 +208,12 @@ void quantize_impl( zero_point_data, quant_min, quant_max); - } else if (out.scalar_type() == (ScalarType)Ushort) { + } else if (out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym16u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym16u, out_data, input_data, inp_shape, @@ -215,7 +225,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym16( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym16, out_data, input_data, inp_shape, @@ -227,7 +240,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4u) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym4u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym4u, out_data, input_data, inp_shape, @@ -239,7 +255,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym4( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym4, out_data, input_data, inp_shape, @@ -304,8 +323,9 @@ void quantize_impl( } } - optional> optional_dim_list{ - ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + 
optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual quantization logic // input, out are the input and output tensors @@ -373,7 +393,10 @@ void quantize_impl( } else { if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym8u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym8u, out_data, input_data, inp_shape, @@ -384,7 +407,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym8( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym8, out_data, input_data, inp_shape, @@ -393,9 +419,12 @@ void quantize_impl( scale_data, quant_min, quant_max); - } else if (out.scalar_type() == (ScalarType)Ushort) { + } else if (out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym16u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym16u, out_data, input_data, inp_shape, @@ -406,7 +435,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym16( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym16, out_data, input_data, inp_shape, @@ -417,7 +449,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4u) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym4u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym4u, out_data, input_data, inp_shape, @@ -428,7 +463,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym4( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym4, out_data, input_data, inp_shape, @@ -490,8 +528,9 @@ void quantize_impl( } } - optional> optional_dim_list{ - ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual quantization logic // input, out are the input and output tensors @@ -556,6 +595,7 @@ void quantize_impl( #undef SYM_QUANTIZE_IMPL_CHANNEL } } + return out; } // Quantize the input tensor @@ -568,16 +608,18 @@ Tensor& quantize_per_tensor_out( int64_t quant_max, ScalarType dtype, Tensor& out) { +#ifdef OP_ARG_CHECK Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( err == Error::Ok, "Failed to resize out Tensor in quantize_per_tensor_out"); - - // check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); + check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); +#endif float scale_data = (float)scale; int zero_point_data = (int)zero_point; quantize_impl( + context, out, input, &scale_data, @@ -606,6 +648,7 @@ Tensor& quantize_per_tensor_tensor_args_out( context.fail(Error::InvalidArgument); return out; } +#ifdef OP_ARG_CHECK ET_CHECK_MSG( scale.scalar_type() == ScalarType::Double, "Expected scale to be Double tensor received: %" PRId8, @@ -622,6 +665,7 @@ Tensor& quantize_per_tensor_tensor_args_out( zero_point.numel() == 1, "Exepcted zero_point to only have one element received: %zd", ssize_t(zero_point.numel())); +#endif quantize_per_tensor_out( context, @@ -652,6 +696,7 @@ Tensor& quantize_per_tensor_tensor_args_out( } Tensor& quantize_per_channel_out( + KernelRuntimeContext& context, const Tensor& input, 
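    // `context` is threaded down to quantize_impl, presumably so that a
    // failing NNLib kernel can be reported through XT_KERNEL_CHECK instead of
    // having its return code ignored.
    // `scale` / `zero_point` carry one value per slice along `axis`
    // (validated only when OP_ARG_CHECK is enabled).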
const Tensor& scale, const Tensor& zero_point, @@ -660,8 +705,12 @@ Tensor& quantize_per_channel_out( int64_t quant_max, ScalarType dtype, Tensor& out) { - Error err = resize_tensor(out, input.sizes()); + if (axis < 0) { + axis += executorch::runtime::nonzero_dim(input); + } +#ifdef OP_ARG_CHECK + Error err = resize_tensor(out, input.sizes()); // normalize axis ET_CHECK_MSG( executorch::runtime::tensor_has_dim(input, axis), @@ -669,10 +718,6 @@ Tensor& quantize_per_channel_out( ssize_t(axis), ssize_t(input.dim())); - if (axis < 0) { - axis += executorch::runtime::nonzero_dim(input); - } - ET_CHECK_MSG( err == Error::Ok, "Failed to resize out Tensor in quantize_per_channel_out"); @@ -699,7 +744,8 @@ Tensor& quantize_per_channel_out( zero_point.numel(), input.size(axis)); - // check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); + check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); +#endif const double* scale_dt = scale.const_data_ptr(); const int64_t* zero_point_dt = zero_point.const_data_ptr(); @@ -715,6 +761,7 @@ Tensor& quantize_per_channel_out( int* axis_ptr = (int*)&axis; quantize_impl( + context, out, input, scale_data, @@ -722,25 +769,12 @@ Tensor& quantize_per_channel_out( axis_ptr, (int)quant_min, (int)quant_max); - return out; -} -Tensor& quantize_per_channel_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - (void)context; - return quantize_per_channel_out( - input, scale, zero_point, axis, quant_min, quant_max, dtype, out); + return out; } Tensor& quantize_per_token_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_point, @@ -761,11 +795,11 @@ Tensor& quantize_per_token_out( Tensor reshaped_input = at::from_blob( input.mutable_data_ptr(), sizes, at::TensorOptions(input.scalar_type())); #else - std::array input_dim_order{0, 1}; - std::array input_sizes; + std::array<::executorch::aten::DimOrderType, 2> input_dim_order{0, 1}; + std::array<::executorch::aten::SizesType, 2> input_sizes; input_sizes[0] = num_tokens; input_sizes[1] = input.size(input.dim() - 1); - std::array input_strides; + std::array<::executorch::aten::StridesType, 2> input_strides; executorch::runtime::dim_order_to_stride_nocheck( input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); void* input_data = input.mutable_data_ptr(); @@ -786,21 +820,15 @@ Tensor& quantize_per_token_out( #endif return quantize_per_channel_out( - reshaped_input, scale, zero_point, 0, quant_min, quant_max, dtype, out); -} - -Tensor& quantize_per_token_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - (void)context; - return quantize_per_token_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); + context, + reshaped_input, + scale, + zero_point, + 0, + quant_min, + quant_max, + dtype, + out); } } // namespace native diff --git a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp new file mode 100644 index 00000000000..c481cf726b7 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& slice_copy_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + ::executorch::aten::optional start_val, + ::executorch::aten::optional end_val, + int64_t step, + Tensor& out) { + (void)ctx; + + if (dim < 0) { + dim += in.dim(); + } + // If user do not set value to end_val, set end to in.size(dim) (largest + // value available) + int64_t end = end_val.has_value() ? end_val.value() : in.size(dim); + // If user do not set value to start_val, set start to 0 (smallest value + // available) + int64_t start = start_val.has_value() ? start_val.value() : 0; + int64_t length = + torch::executor::adjust_slice_indices(in.size(dim), &start, &end, step); + + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_slice_copy_args(in, dim, step, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType target_sizes[kTensorDimensionLimit]; + size_t target_ndim = 0; + torch::executor::get_slice_copy_out_target_size( + in, dim, length, target_sizes, &target_ndim); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, {target_sizes, target_ndim}) == + Error::Ok, + InvalidArgument, + out); +#endif + + const ::executorch::aten::ArrayRef in_size = in.sizes(); + const ::executorch::aten::ArrayRef out_size = out.sizes(); + + int inp_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for (auto i = 0; i < in_size.size(); i++) { + inp_shape[i] = in_size[i]; + } + + for (auto i = 0; i < out_size.size(); i++) { + out_shape[i] = out_size[i]; + } + + signed char* out_data = out.mutable_data_ptr(); + const signed char* const inp_data = in.const_data_ptr(); + + if ((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_slice, + out_data, + out_shape, + inp_data, + inp_shape, + in.dim(), + (int)start, + (int)(end - 1), + (int)step, + (int)dim, + get_element_size(out.scalar_type())); + } else { + torch::executor::compute_slice(in, dim, start, length, step, out); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp index 9f343481508..ee87ebaf5a1 100644 --- 
a/backends/cadence/fusion_g3/operators/op_softmax.cpp +++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp @@ -6,10 +6,13 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include +#include #include #include #include @@ -34,6 +37,10 @@ Tensor& _softmax_out( Tensor& out) { (void)ctx; + // Adjust for negative dim + dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; + +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_softmax_args(in, dim, half_to_float, out), @@ -48,9 +55,7 @@ Tensor& _softmax_out( executorch::runtime::tensors_have_same_dim_order(in, out), InvalidArgument, out); - - // Adjust for negative dim - dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; +#endif int inp_shapes[in.dim()]; const ArrayRef in_size = in.sizes(); @@ -62,7 +67,15 @@ Tensor& _softmax_out( const float* const inp_data = in.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); int axis = dim; - xa_nn_softmax_f32_f32(out_data, inp_data, inp_shapes, in.dim(), &axis); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_softmax_f32_f32, + out_data, + inp_data, + inp_shapes, + in.dim(), + &axis); } else { ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { const CTYPE* const in_data = in.const_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_sub.cpp b/backends/cadence/fusion_g3/operators/op_sub.cpp new file mode 100644 index 00000000000..91782d2dfff --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_sub.cpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::canCast; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& sub_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + const Scalar& alpha, + Tensor& out) { + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + + // Check alpha type + ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + canCast(alpha_type, common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sub.out"; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? 
a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_32x32_32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_32x32_32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_alpha = + torch::executor::native::utils::scalar_to(alpha); + torch::executor::native::utils:: + apply_bitensor_elementwise_fn( + [val_alpha]( + const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a - val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + return out; +} + +Tensor& sub_scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + const Scalar& alpha, + Tensor& out) { + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); +#ifdef OP_ARG_CHECK + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + + // Check alpha type + ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out); 
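  // The dtype, dim-order and resize checks in this block are compiled only
  // when OP_ARG_CHECK is defined: the NNLib kernels themselves perform no
  // argument validation, so the macro is meant to be enabled (presumably as a
  // build-time definition such as -DOP_ARG_CHECK) whenever the caller cannot
  // guarantee well-formed arguments, and left disabled otherwise.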
+ + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (common_type == out.scalar_type() && canCast(alpha_type, common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sub.Scalar_out"; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + int* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + const CTYPE_COMPUTE val_alpha = + torch::executor::native::utils::scalar_to(alpha); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + return val_a - val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/operators.h b/backends/cadence/fusion_g3/operators/operators.h index 9d7f7b9c30e..e1c0d08f44a 100644 --- a/backends/cadence/fusion_g3/operators/operators.h +++ b/backends/cadence/fusion_g3/operators/operators.h @@ -16,6 +16,13 @@ namespace impl { namespace G3 { namespace native { +::executorch::aten::Tensor& _softmax_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim, + bool half_to_float, + ::executorch::aten::Tensor& out); + ::executorch::aten::Tensor& add_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& a, @@ -30,6 +37,153 @@ ::executorch::aten::Tensor& add_scalar_out( const ::executorch::aten::Scalar& alpha, ::executorch::aten::Tensor& out); +::executorch::aten::Tensor& cat_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + ::executorch::aten::ArrayRef<::executorch::aten::Tensor> tensors, + int64_t dim, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& dequantize_per_channel_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& scale, + const 
::executorch::aten::optional<::executorch::aten::Tensor>& + opt_zero_points, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::optional<::executorch::aten::ScalarType> out_dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& dequantize_per_tensor_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::optional<::executorch::aten::ScalarType> out_dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_out_mode( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::optional<::executorch::aten::string_view> mode, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_scalar_mode_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::optional<::executorch::aten::string_view> mode, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& exp_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mean_dim_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::optional<::executorch::aten::ArrayRef> + dim_list, + bool keepdim, + ::executorch::aten::optional<::executorch::aten::ScalarType> dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mul_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mul_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::Tensor& out); + +std::tuple< + ::executorch::aten::Tensor&, + ::executorch::aten::Tensor&, + ::executorch::aten::Tensor&> +native_layer_norm_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& input, + ::executorch::aten::IntArrayRef normalized_shape, + const ::executorch::aten::optional<::executorch::aten::Tensor>& weight, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, + double eps, + ::executorch::aten::Tensor& out, + ::executorch::aten::Tensor& mean_out, + ::executorch::aten::Tensor& rstd_out); + +::executorch::aten::Tensor& permute_copy_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::IntArrayRef dims, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& quantize_per_channel_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& scale, + 
const ::executorch::aten::Tensor& zero_point, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& quantize_per_tensor_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& slice_copy_Tensor_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim, + ::executorch::aten::optional start_val, + ::executorch::aten::optional end_val, + int64_t step, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& sub_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + const ::executorch::aten::Scalar& alpha, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& sub_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + const ::executorch::aten::Scalar& alpha, + ::executorch::aten::Tensor& out); + } // namespace native } // namespace G3 } // namespace impl diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl index 3e5900e3634..e1e7c9a8491 100644 --- a/backends/cadence/fusion_g3/operators/targets.bzl +++ b/backends/cadence/fusion_g3/operators/targets.bzl @@ -28,6 +28,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: exported_deps = [ ":operators_header", ":xt_macros", + ":xt_utils", ], ) @@ -39,6 +40,12 @@ OPERATORS = [ "native_layer_norm", "quantize", "softmax", + "sub", + "div", + "exp", + "mean", + "slice_copy", + "permute_copy" ] def define_common_targets(): @@ -74,5 +81,17 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "xt_utils", + exported_headers = ["xt_utils.h"], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/kernel:kernel_runtime_context", + ], + ) + for op in OPERATORS: define_operator(op) diff --git a/backends/cadence/fusion_g3/operators/xt_utils.h b/backends/cadence/fusion_g3/operators/xt_utils.h new file mode 100644 index 00000000000..443d68d0609 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/xt_utils.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +using ::executorch::aten::ScalarType; + +inline int get_element_size(ScalarType dtype) { + if ((dtype == ScalarType::Int) || (dtype == ScalarType::UInt32)) { + return sizeof(int); + } else if ((dtype == ScalarType::Short) || (dtype == ScalarType::UInt16)) { + return sizeof(short); + } else if ((dtype == ScalarType::Char) || (dtype == ScalarType::Byte)) { + return sizeof(char); + } + return 0; +}
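For reference, a minimal usage sketch of the new get_element_size() helper (illustrative only, not part of the patch; the include path and the wrapper function are assumptions). The helper maps a ScalarType to its byte width so the byte-oriented NNLib copy kernels (xa_nn_permute, xa_nn_slice) can serve the 1-, 2- and 4-byte dtypes through a single entry point, with 0 signalling "unsupported, take the portable fallback":

// Illustrative sketch under stated assumptions, not part of the patch.
#include <executorch/backends/cadence/fusion_g3/operators/xt_utils.h>  // assumed include path for the new header above

using ::executorch::aten::ScalarType;

// Hypothetical helper mirroring the dtype guards in op_permute_copy.cpp and
// op_slice_copy.cpp: a non-zero element size means the NNLib byte-copy path
// can be used; 0 means the operator should fall back to the portable kernel.
inline bool nnlib_copy_supported(ScalarType dtype, int* element_size) {
  // 4 for Int/UInt32, 2 for Short/UInt16, 1 for Char/Byte, 0 otherwise
  // (widths follow sizeof(int)/sizeof(short)/sizeof(char) on the 32-bit
  // Xtensa targets).
  *element_size = get_element_size(dtype);
  return *element_size != 0;
}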