diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index 9537cbacb70..b5b5baf9a8c 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -66,34 +66,20 @@ Tensor& add_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";

-  const exec_aten::ArrayRef<Tensor::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];

-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;

   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;

   bool optimized = 1;

-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
+  /* Added change to work with input dimensions more than 5 */
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -114,6 +100,18 @@ Tensor& add_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }

+  /*find broadcast*/
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
@@ -121,7 +119,14 @@ Tensor& add_out(
     int alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);

-    if (broadcast) {
+
+    if ((a.numel() == 1) && (alpha_val == 1)) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -143,7 +148,13 @@ Tensor& add_out(
     float alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);

-    if (broadcast) {
+    if ((a.numel() == 1) && (alpha_val == 1.0)) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -176,7 +187,6 @@ Tensor& add_out(
         torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }

@@ -234,6 +244,7 @@ Tensor& add_scalar_out(

     xa_nn_elm_add_scalar_32x32_32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else if (compute_type == ScalarType::Float) {
     const float* const inp1_data = a.const_data_ptr<float>();
     float inp2_val;
@@ -246,6 +257,7 @@ Tensor& add_scalar_out(

     xa_nn_elm_add_scalar_f32xf32_f32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else {
     ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
       torch::executor::native::utils::
@@ -266,6 +278,7 @@ Tensor& add_scalar_out(
               SAME_AS_COMMON);
     });
   }
+
   return out;
 }

diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 62bbb0c9d49..7fae3fa29c4 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -22,10 +22,7 @@ using torch::executor::KernelRuntimeContext;
  * updated to have support for below data types, these can be removed and
  * operator need to be updated accordingly
  */
-enum datatype {
-  Ushort = 20,
-  Uint = 23,
-};
+enum datatype { Ushort = 20, Uint = 23 };

 namespace cadence {
 namespace impl {
@@ -118,8 +115,7 @@ Tensor& cat_out(
         tensors.size(),
         (int)dim,
         sizeof(char));
-  }
-  if (out.scalar_type() == (ScalarType)Uint) {
+  } else if (out.scalar_type() == (ScalarType)Uint) {
     xa_nn_cat(
         out_data,
         out_shapes,
@@ -164,7 +160,6 @@ Tensor& cat_out(
   if (all_1d_empty) {
     return out;
   }
-
   const size_t outer = executorch::runtime::getLeadingDims(out, dim);
   const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
   const size_t ninputs = tensors.size();
diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
index 31cd50314e1..914ecf9d7e4 100644
--- a/backends/cadence/fusion_g3/operators/op_mul.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -58,34 +58,20 @@ Tensor& mul_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "mul.out";

-  const exec_aten::ArrayRef<Tensor::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];

-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;

   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;

   bool optimized = 1;

-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
+  /* Added change to work with input dimensions more than 5 */
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -106,12 +92,30 @@ Tensor& mul_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }

+  /*find broadcast*/
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
     int* const out_data = out.mutable_data_ptr<int>();

-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -128,7 +132,13 @@ Tensor& mul_out(
     const float* const inp2_data = b.const_data_ptr<float>();
     float* const out_data = out.mutable_data_ptr<float>();

-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -157,7 +167,6 @@ Tensor& mul_out(
         torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }

diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 2b8376dc8db..4b69c5d0f49 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -31,7 +31,7 @@ enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 };
  */
 namespace cadence {
 namespace impl {
-namespace FusionG3 {
+namespace G3 {
 namespace native {

 namespace {
@@ -364,8 +364,8 @@ void quantize_impl(

 #undef ASYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef ASYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef ASYM_QUANTIZE_IMPL_TENSOR
+#undef ASYM_QUANTIZE_IMPL_CHANNEL
     }
   } else {
     if (out.scalar_type() == ScalarType::Byte) {
@@ -549,8 +549,8 @@ void quantize_impl(
   }
 #undef SYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef SYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef SYM_QUANTIZE_IMPL_TENSOR
+#undef SYM_QUANTIZE_IMPL_CHANNEL
     }
   }
 }
@@ -719,7 +719,6 @@ Tensor& quantize_per_channel_out(
       axis_ptr,
      (int)quant_min,
      (int)quant_max);
-
   return out;
 }

@@ -802,6 +801,6 @@ Tensor& quantize_per_token_out(
 }

 } // namespace native
-} // namespace FusionG3
+} // namespace G3
 } // namespace impl
 } // namespace cadence
\ No newline at end of file
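Note (illustration only, not part of the patch): the broadcast detection that op_add.cpp and op_mul.cpp now share amounts to right-aligning each operand's sizes into a fixed 5-element shape padded with leading 1s, then flagging broadcast whenever either padded input shape differs from the output shape. Below is a minimal standalone C++ sketch of that idea; the helper names pad_shape and needs_broadcast are hypothetical and do not exist in this codebase.

#include <array>

constexpr int kTensorDimensionLimit = 5;
using Shape = std::array<int, kTensorDimensionLimit>;

// Right-align `sizes` (length `dim`, with dim <= kTensorDimensionLimit) into a
// fixed-width shape padded with leading 1s, mirroring how the patch fills
// inp1_shape / inp2_shape / out_shape.
Shape pad_shape(const int* sizes, int dim) {
  Shape shape;
  shape.fill(1);
  const int offset = kTensorDimensionLimit - dim;
  for (int i = 0; i < dim; i++) {
    shape[i + offset] = sizes[i];
  }
  return shape;
}

// Broadcast is needed when either padded input shape differs from the padded
// output shape at any position.
bool needs_broadcast(const Shape& a, const Shape& b, const Shape& out) {
  for (int i = 0; i < kTensorDimensionLimit; i++) {
    if (a[i] != out[i] || b[i] != out[i]) {
      return true;
    }
  }
  return false;
}

A caller would then dispatch to the scalar kernel when one operand has a single element, to the 5-D broadcast kernel when needs_broadcast returns true, and to the plain element-wise kernel otherwise, falling back to the generic path when broadcasting is required and the rank exceeds the 5-D limit, as the patch does.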