From 33e7495b16cf376c059e47005ae6cc2873ed3eae Mon Sep 17 00:00:00 2001
From: "cmadhira@cadence.com" <cmadhira@cadence.com>
Date: Mon, 2 Dec 2024 18:27:13 +0530
Subject: [PATCH 1/2] Removed unnecessary conditions in op_add and op_mul
 Added scalar function call in op_add and op_mul
 Updated undefining of macros in op_quantize
 Updated elseif() instead of if() in op_cat

Signed-off-by: cmadhira@cadence.com
---
 .../cadence/fusion_g3/operators/op_add.cpp    | 51 ++++++++++++-------
 .../cadence/fusion_g3/operators/op_cat.cpp    | 10 ++--
 .../cadence/fusion_g3/operators/op_mul.cpp    | 47 ++++++++++-------
 .../fusion_g3/operators/op_quantize.cpp       | 14 ++---
 4 files changed, 70 insertions(+), 52 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index 9537cbacb70..b5b5baf9a8c 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -66,34 +66,20 @@ Tensor& add_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";
 
-  const exec_aten::ArrayRef<exec_aten::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<exec_aten::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<exec_aten::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
 
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];
 
-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   bool optimized = 1;
 
-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
+  /* Added change to work with input dimensions more than 5 */
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -114,6 +100,18 @@ Tensor& add_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }
 
+  /*find broadcast*/
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
@@ -121,7 +119,14 @@ Tensor& add_out(
     int alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
-    if (broadcast) {
+
+    if ((a.numel() == 1) && (alpha_val == 1)) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -143,7 +148,13 @@ Tensor& add_out(
     float alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
-    if (broadcast) {
+    if ((a.numel() == 1) && (alpha_val == 1.0)) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -176,7 +187,6 @@ Tensor& add_out(
         torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }
 
@@ -234,6 +244,7 @@ Tensor& add_scalar_out(
 
     xa_nn_elm_add_scalar_32x32_32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else if (compute_type == ScalarType::Float) {
     const float* const inp1_data = a.const_data_ptr<float>();
     float inp2_val;
@@ -246,6 +257,7 @@ Tensor& add_scalar_out(
 
     xa_nn_elm_add_scalar_f32xf32_f32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else {
     ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
       torch::executor::native::utils::
@@ -266,6 +278,7 @@ Tensor& add_scalar_out(
             SAME_AS_COMMON);
     });
   }
+
   return out;
 }
 
diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 62bbb0c9d49..11bbcfece22 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -22,10 +22,7 @@ using torch::executor::KernelRuntimeContext;
  * updated to have support for below data types, these can be removed and
  * operator need to be updated accordingly */
 
-enum datatype {
-  Ushort = 20,
-  Uint = 23,
-};
+enum datatype { Ushort = 20, Uint = 23 };
 
 namespace cadence {
 namespace impl {
@@ -37,6 +34,7 @@ Tensor& cat_out(
     exec_aten::ArrayRef<Tensor> tensors,
     int64_t dim,
     Tensor& out) {
+
   if (dim < 0) {
     dim += out.dim();
   }
@@ -118,8 +116,7 @@ Tensor& cat_out(
         tensors.size(),
         (int)dim,
         sizeof(char));
-  }
-  if (out.scalar_type() == (ScalarType)Uint) {
+  } else if (out.scalar_type() == (ScalarType)Uint) {
     xa_nn_cat(
         out_data,
         out_shapes,
@@ -164,7 +161,6 @@ Tensor& cat_out(
   if (all_1d_empty) {
     return out;
   }
-
   const size_t outer = executorch::runtime::getLeadingDims(out, dim);
   const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
   const size_t ninputs = tensors.size();
diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
index 31cd50314e1..914ecf9d7e4 100644
--- a/backends/cadence/fusion_g3/operators/op_mul.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -58,34 +58,20 @@ Tensor& mul_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "mul.out";
 
-  const exec_aten::ArrayRef<exec_aten::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<exec_aten::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<exec_aten::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
 
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];
 
-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   bool optimized = 1;
 
-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
+  /* Added change to work with input dimensions more than 5 */
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -106,12 +92,30 @@ Tensor& mul_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }
 
+  /*find broadcast*/
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
     int* const out_data = out.mutable_data_ptr<int>();
 
-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -128,7 +132,13 @@ Tensor& mul_out(
     const float* const inp2_data = b.const_data_ptr<float>();
     float* const out_data = out.mutable_data_ptr<float>();
 
-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -157,7 +167,6 @@ Tensor& mul_out(
         torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }
 
diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 2b8376dc8db..31b00c185ed 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -31,7 +31,7 @@ enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 };
  */
 namespace cadence {
 namespace impl {
-namespace FusionG3 {
+namespace G3 {
 namespace native {
 namespace {
 
@@ -364,8 +364,8 @@ void quantize_impl(
 
 #undef ASYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef ASYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef ASYM_QUANTIZE_IMPL_TENSOR
+#undef ASYM_QUANTIZE_IMPL_CHANNEL
     }
   } else {
     if (out.scalar_type() == ScalarType::Byte) {
@@ -549,8 +549,8 @@ void quantize_impl(
   }
 #undef SYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef SYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef SYM_QUANTIZE_IMPL_TENSOR
+#undef SYM_QUANTIZE_IMPL_CHANNEL
   }
 }
 }
@@ -565,6 +565,7 @@ Tensor& quantize_per_tensor_out(
     int64_t quant_max,
     ScalarType dtype,
     Tensor& out) {
+
   torch::executor::Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(
       err == torch::executor::Error::Ok,
@@ -719,7 +720,6 @@ Tensor& quantize_per_channel_out(
       axis_ptr,
       (int)quant_min,
      (int)quant_max);
-
   return out;
 }
 
@@ -802,6 +802,6 @@ Tensor& quantize_per_token_out(
 }
 
 } // namespace native
-} // namespace FusionG3
+} // namespace G3
 } // namespace impl
 } // namespace cadence
\ No newline at end of file

From 0ef057f988dd5cb9d69355914484e320823fda4c Mon Sep 17 00:00:00 2001
From: "cmadhira@cadence.com" <cmadhira@cadence.com>
Date: Tue, 3 Dec 2024 10:17:23 +0530
Subject: [PATCH 2/2] Removed linter errors

Signed-off-by: cmadhira@cadence.com
---
 backends/cadence/fusion_g3/operators/op_cat.cpp      | 1 -
 backends/cadence/fusion_g3/operators/op_quantize.cpp | 1 -
 2 files changed, 2 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 11bbcfece22..7fae3fa29c4 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -34,7 +34,6 @@ Tensor& cat_out(
     exec_aten::ArrayRef<Tensor> tensors,
     int64_t dim,
     Tensor& out) {
-
   if (dim < 0) {
     dim += out.dim();
   }
diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 31b00c185ed..4b69c5d0f49 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -565,7 +565,6 @@ Tensor& quantize_per_tensor_out(
     int64_t quant_max,
     ScalarType dtype,
     Tensor& out) {
-
   torch::executor::Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(
       err == torch::executor::Error::Ok,