pytorch · mcremon-meta · Dec 3, 2024 · Dec 2, 2024 · Dec 3, 2024 · mcremon-meta
diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -66,34 +66,20 @@ Tensor& add_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";
 
-  const exec_aten::ArrayRef<Tensor::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
 
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];
 
-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   bool optimized = 1;
 
-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
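+  /* Changed to support inputs with more than 5 dimensions. NOTE(review): the shape arrays hold kTensorDimensionLimit entries but this loop runs to max_dim -- confirm it cannot overrun them. */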
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -114,14 +100,33 @@ Tensor& add_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }
 
+  /* Detect broadcast: set when either input's shape differs from the output's shape in any dimension. */
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
     int* const out_data = out.mutable_data_ptr<int>();
 
     int alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
-    if (broadcast) {
+
+    if ((a.numel() == 1) && (alpha_val == 1)) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -143,7 +148,13 @@ Tensor& add_out(
     float alpha_val;
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
-    if (broadcast) {
+    if ((a.numel() == 1) && (alpha_val == 1.0)) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_add_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+    } else if (broadcast) {
       xa_nn_elm_add_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -176,7 +187,6 @@ Tensor& add_out(
           torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }
 
@@ -234,6 +244,7 @@ Tensor& add_scalar_out(
 
     xa_nn_elm_add_scalar_32x32_32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else if (compute_type == ScalarType::Float) {
     const float* const inp1_data = a.const_data_ptr<float>();
     float inp2_val;
@@ -246,6 +257,7 @@ Tensor& add_scalar_out(
 
     xa_nn_elm_add_scalar_f32xf32_f32(
         out_data, inp1_data, inp2_val, alpha_val, out.numel());
+
   } else {
     ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
       torch::executor::native::utils::
@@ -266,6 +278,7 @@ Tensor& add_scalar_out(
                   SAME_AS_COMMON);
     });
   }
+
   return out;
 }
 

diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -22,10 +22,7 @@ using torch::executor::KernelRuntimeContext;
  * updated to have support for below data types, these can be removed and
  * operator need to be updated accordingly
  */
-enum datatype {
-  Ushort = 20,
-  Uint = 23,
-};
+enum datatype { Ushort = 20, Uint = 23 };
 
 namespace cadence {
 namespace impl {
@@ -118,8 +115,7 @@ Tensor& cat_out(
         tensors.size(),
         (int)dim,
         sizeof(char));
-  }
-  if (out.scalar_type() == (ScalarType)Uint) {
+  } else if (out.scalar_type() == (ScalarType)Uint) {
     xa_nn_cat(
         out_data,
         out_shapes,
@@ -164,7 +160,6 @@ Tensor& cat_out(
     if (all_1d_empty) {
       return out;
     }
-
     const size_t outer = executorch::runtime::getLeadingDims(out, dim);
     const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
     const size_t ninputs = tensors.size();

diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -58,34 +58,20 @@ Tensor& mul_out(
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "mul.out";
 
-  const exec_aten::ArrayRef<Tensor::SizesType> a_size = a.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> b_size = b.sizes();
-  const exec_aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
-
   int kTensorDimensionLimit = 5;
 
   int inp1_shape[kTensorDimensionLimit];
   int inp2_shape[kTensorDimensionLimit];
   int out_shape[kTensorDimensionLimit];
 
-  /*find broadcast*/
-  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
-  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
-  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  bool broadcast = 0;
 
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
   bool optimized = 1;
 
-  if ((a.dim() == 0) || (b.dim() == 0)) {
-    optimized = 0;
-  }
-
-  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
-    optimized = 0;
-  }
-
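+  /* Changed to support inputs with more than 5 dimensions. NOTE(review): the shape arrays hold kTensorDimensionLimit entries but this loop runs to max_dim -- confirm it cannot overrun them. */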
   for (int i = 0; i < max_dim; i++) {
     out_shape[i] = 1;
     inp1_shape[i] = 1;
@@ -106,12 +92,30 @@ Tensor& mul_out(
     inp2_shape[i + offset_inp2] = b.size(i);
   }
 
+  /* Detect broadcast: set when either input's shape differs from the output's shape in any dimension. */
+  for (int i = 0; i < out.dim(); i++) {
+    if (((inp1_shape[i]) != (out_shape[i])) ||
+        ((inp2_shape[i]) != (out_shape[i]))) {
+      broadcast = 1;
+    }
+  }
+
+  if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) {
+    optimized = 0;
+  }
+
   if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
     int* const out_data = out.mutable_data_ptr<int>();
 
-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_32x32_32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_32x32_32(
           out_data,
           out_shape,
@@ -128,7 +132,13 @@ Tensor& mul_out(
     const float* const inp2_data = b.const_data_ptr<float>();
     float* const out_data = out.mutable_data_ptr<float>();
 
-    if (broadcast) {
+    if (a.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp2_data, inp1_data[0], out.numel());
+    } else if (b.numel() == 1) {
+      xa_nn_elm_mul_scalar_f32xf32_f32(
+          out_data, inp1_data, inp2_data[0], out.numel());
+    } else if (broadcast) {
       xa_nn_elm_mul_broadcast_5D_f32xf32_f32(
           out_data,
           out_shape,
@@ -157,7 +167,6 @@ Tensor& mul_out(
           torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
     });
   }
-
   return out;
 }
 

diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -31,7 +31,7 @@ enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 };
  */
 namespace cadence {
 namespace impl {
-namespace FusionG3 {
+namespace G3 {
 namespace native {
 
 namespace {
@@ -364,8 +364,8 @@ void quantize_impl(
 
 #undef ASYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef ASYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef ASYM_QUANTIZE_IMPL_TENSOR
+#undef ASYM_QUANTIZE_IMPL_CHANNEL
     }
   } else {
     if (out.scalar_type() == ScalarType::Byte) {
@@ -549,8 +549,8 @@ void quantize_impl(
       }
 #undef SYM_CALCULATE_FLOAT_TYPE_TENSOR
 #undef SYM_CALCULATE_FLOAT_TYPE_CHANNEL
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR
-#undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL
+#undef SYM_QUANTIZE_IMPL_TENSOR
+#undef SYM_QUANTIZE_IMPL_CHANNEL
     }
   }
 }
@@ -719,7 +719,6 @@ Tensor& quantize_per_channel_out(
       axis_ptr,
       (int)quant_min,
       (int)quant_max);
-
   return out;
 }
 
@@ -802,6 +801,6 @@ Tensor& quantize_per_token_out(
 }
 
 } // namespace native
-} // namespace FusionG3
+} // namespace G3
 } // namespace impl
 } // namespace cadence