Commit a0bdd97

Author cmadhira@cadence.com committed:

Resolved all namespace issues. Changed Bits16 to UInt16 in the quantize and dequantize operators. Reduced the size of scratch memory in the mean operator.
1 parent 6ace5a3 commit a0bdd97

13 files changed: +118, -276 lines

backends/cadence/fusion_g3/operators/op_cat.cpp

Lines changed: 12 additions & 73 deletions

@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/cadence/fusion_g3/operators/operators.h>
+#include <executorch/backends/cadence/fusion_g3/operators/xt_utils.h>
 
 #include <cstring>
 
@@ -16,7 +17,7 @@
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-using ::executorch::aten::Scalar;
+using ::executorch::aten::ArrayRef;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
@@ -27,7 +28,6 @@ using ::executorch::runtime::KernelRuntimeContext;
  * updated to have support for below data types, these can be removed and
  * operator need to be updated accordingly
  */
-enum datatype { Ushort = 20, Uint = 23 };
 
 namespace cadence {
 namespace impl {
@@ -36,7 +36,7 @@ namespace native {
 
 Tensor& cat_out(
     KernelRuntimeContext& ctx,
-    ::executorch::aten::ArrayRef<Tensor> tensors,
+    ArrayRef<Tensor> tensors,
     int64_t dim,
     Tensor& out) {
   if (dim < 0) {
@@ -84,7 +84,7 @@ Tensor& cat_out(
   int inp_shapes_size[tensors.size()];
 
   int temp_sizes[tensors.size()][kTensorDimensionLimit];
-  ::executorch::aten::ArrayRef<Tensor::SizesType> temp_size;
+  ArrayRef<Tensor::SizesType> temp_size;
 
   for (int i = 0; i < tensors.size(); i++) {
     inp_tensors[i] = tensors[i].const_data_ptr<signed char>();
@@ -99,14 +99,19 @@ Tensor& cat_out(
 
   signed char* out_data = out.mutable_data_ptr<signed char>();
 
-  const ::executorch::aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
+  const ArrayRef<Tensor::SizesType> out_size = out.sizes();
   int out_shapes[kTensorDimensionLimit];
   for (int i = 0; i < out_size.size(); i++) // output shapes
   {
     out_shapes[i] = out_size[i];
   }
 
-  if (out.scalar_type() == ScalarType::Int) {
+  if ((out.scalar_type() == ScalarType::Int) ||
+      (out.scalar_type() == ScalarType::Short) ||
+      (out.scalar_type() == ScalarType::Char) ||
+      (out.scalar_type() == ScalarType::UInt32) ||
+      (out.scalar_type() == ScalarType::UInt16) ||
+      (out.scalar_type() == ScalarType::Byte)) {
    XT_KERNEL_CHECK(
        ctx,
        out,
@@ -118,73 +123,7 @@ Tensor& cat_out(
        inp_shapes_size[0],
        tensors.size(),
        (int)dim,
-       sizeof(int));
-  } else if (out.scalar_type() == ScalarType::Short) {
-    XT_KERNEL_CHECK(
-        ctx,
-        out,
-        xa_nn_cat,
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(short));
-  } else if (out.scalar_type() == ScalarType::Char) {
-    XT_KERNEL_CHECK(
-        ctx,
-        out,
-        xa_nn_cat,
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(char));
-  } else if (out.scalar_type() == (ScalarType)Uint) {
-    XT_KERNEL_CHECK(
-        ctx,
-        out,
-        xa_nn_cat,
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(int));
-  } else if (out.scalar_type() == (ScalarType)Ushort) {
-    XT_KERNEL_CHECK(
-        ctx,
-        out,
-        xa_nn_cat,
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(short));
-  } else if (out.scalar_type() == ScalarType::Byte) {
-    XT_KERNEL_CHECK(
-        ctx,
-        out,
-        xa_nn_cat,
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(char));
-
+       get_element_size(out.scalar_type()));
  } else {
    const size_t outer = executorch::runtime::getLeadingDims(out, dim);
    const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
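
Note: the five dtype-specific xa_nn_cat branches collapse into a single call because the only thing that varied between them was the element width passed as the last argument. A minimal sketch of what a width helper along these lines must do — the real get_element_size lives in xt_utils.h, so the name and body here are illustrative, not the shipped code:

#include <cstdint>
#include <executorch/runtime/core/exec_aten/exec_aten.h>

using ::executorch::aten::ScalarType;

// Illustrative only: map each dtype handled by the consolidated branch
// above to its element width in bytes, mirroring the sizeof() values
// that the removed else-if chain passed explicitly.
inline int get_element_size_sketch(ScalarType t) {
  switch (t) {
    case ScalarType::Int:
    case ScalarType::UInt32:
      return sizeof(int32_t); // was sizeof(int)
    case ScalarType::Short:
    case ScalarType::UInt16:
      return sizeof(int16_t); // was sizeof(short)
    case ScalarType::Char:
    case ScalarType::Byte:
      return sizeof(int8_t); // was sizeof(char)
    default:
      return 0; // unsupported here; cat_out falls through to the portable path
  }
}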

backends/cadence/fusion_g3/operators/op_dequantize.cpp

Lines changed: 8 additions & 10 deletions

@@ -18,7 +18,6 @@
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
@@ -32,7 +31,7 @@ using optional = ::executorch::aten::optional<T>;
  * operator need to be updated accordingly
  */
 
-enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 };
+enum datatype { Bits4u = 21, Bits4 = 22 };
 
 /**
  * For an input tensor, use the scale and zero_point arguments to quantize it.
@@ -57,9 +56,8 @@ void check_dequantize_per_tensor_args(
   ET_CHECK_MSG(
       input.scalar_type() == ScalarType::Byte ||
           input.scalar_type() == ScalarType::Char ||
-          input.scalar_type() == ScalarType::Bits16 ||
+          input.scalar_type() == ScalarType::UInt16 ||
           input.scalar_type() == ScalarType::Short ||
-          input.scalar_type() == (ScalarType)Ushort ||
          input.scalar_type() == (ScalarType)Bits4 ||
          input.scalar_type() == (ScalarType)Bits4u ||
          input.scalar_type() == ScalarType::Int,
@@ -154,7 +152,7 @@ Tensor& dequantize_impl(
          axis,
          zero_point_data,
          scale_data);
-    } else if (input.scalar_type() == (ScalarType)Ushort) {
+    } else if (input.scalar_type() == ScalarType::UInt16) {
      const uint16_t* input_data = input.const_data_ptr<uint16_t>();
      XT_KERNEL_CHECK(
          ctx,
@@ -236,7 +234,7 @@ Tensor& dequantize_impl(
        break;
      switch (input.scalar_type()) {
        ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_TENSOR);
-        ASYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16);
+        ASYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, UInt16);
        default:
          ET_CHECK_MSG(
              false,
@@ -328,7 +326,7 @@ Tensor& dequantize_impl(
        break;
      switch (input.scalar_type()) {
        ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_CHANNEL);
-        ASYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16);
+        ASYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, UInt16);
        default:
          ET_CHECK_MSG(
              false,
@@ -364,7 +362,7 @@ Tensor& dequantize_impl(
          input.dim(),
          axis,
          scale_data);
-    } else if (input.scalar_type() == (ScalarType)Ushort) {
+    } else if (input.scalar_type() == ScalarType::UInt16) {
      const uint16_t* input_data = input.const_data_ptr<uint16_t>();
      XT_KERNEL_CHECK(
          ctx,
@@ -442,7 +440,7 @@ Tensor& dequantize_impl(
        break;
      switch (input.scalar_type()) {
        ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_TENSOR);
-        SYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16);
+        SYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, UInt16);
        default:
          ET_CHECK_MSG(
              false,
@@ -534,7 +532,7 @@ Tensor& dequantize_impl(
        break;
      switch (input.scalar_type()) {
        ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_CHANNEL);
-        SYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16);
+        SYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, UInt16);
        default:
          ET_CHECK_MSG(
              false,
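
For context, these paths all implement standard affine dequantization, value = (q - zero_point) * scale, and the repeated rename (Bits16 to UInt16, plus dropping the custom Ushort tag) keys uint16_t data on ExecuTorch's native unsigned-16-bit dtype rather than ad-hoc enum values cast into ScalarType. A self-contained sketch of that arithmetic with made-up numbers, not taken from the kernels above:

#include <cstdint>
#include <cstdio>

// Standard affine dequantization: value = (q - zero_point) * scale.
// The element and parameters below are illustrative example values.
int main() {
  const uint16_t q = 40000;       // quantized element, now typed as UInt16
  const int64_t zero_point = 32768;
  const double scale = 0.01;
  const double value = (static_cast<int64_t>(q) - zero_point) * scale;
  std::printf("dequantized: %f\n", value); // prints 72.320000
  return 0;
}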

backends/cadence/fusion_g3/operators/op_div.cpp

Lines changed: 4 additions & 2 deletions

@@ -19,8 +19,10 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
+using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
+using ::executorch::aten::string_view;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::canCast;
 using ::executorch::runtime::Error;
@@ -230,7 +232,7 @@ Tensor& div_out_mode(
     KernelRuntimeContext& ctx,
     const Tensor& a,
     const Tensor& b,
-    ::executorch::aten::optional<::executorch::aten::string_view> mode,
+    optional<string_view> mode,
     Tensor& out) {
   if (!mode.has_value()) {
     return div_out(ctx, a, b, out);
@@ -546,7 +548,7 @@ Tensor& div_scalar_mode_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,
     const Scalar& b,
-    ::executorch::aten::optional<::executorch::aten::string_view> mode,
+    optional<string_view> mode,
     Tensor& out) {
   if (!mode.has_value()) {
     return div_scalar_out(ctx, a, b, out);

backends/cadence/fusion_g3/operators/op_exp.cpp

Lines changed: 2 additions & 3 deletions

@@ -16,18 +16,17 @@
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
-using torch::executor::RuntimeContext;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
 namespace G3 {
 namespace native {
 
-Tensor& exp_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
+Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
 #ifdef OP_ARG_CHECK
   ET_KERNEL_CHECK(
       ctx,

backends/cadence/fusion_g3/operators/op_mean.cpp

Lines changed: 12 additions & 9 deletions

@@ -16,7 +16,8 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
-using ::executorch::aten::Scalar;
+using ::executorch::aten::ArrayRef;
+using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
@@ -30,8 +31,7 @@ namespace native {
 int prepare_data(
     const Tensor& in,
     Tensor& out,
-    ::executorch::aten::optional<::executorch::aten::ArrayRef<int64_t>>
-        dim_list,
+    optional<ArrayRef<int64_t>> dim_list,
     int* inp_shape,
     int* out_shape,
     int* p_axis,
@@ -62,10 +62,9 @@ int prepare_data(
 Tensor& mean_dim_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
-    ::executorch::aten::optional<::executorch::aten::ArrayRef<int64_t>>
-        dim_list,
+    optional<ArrayRef<int64_t>> dim_list,
     bool keepdim,
-    ::executorch::aten::optional<ScalarType> dtype,
+    optional<ScalarType> dtype,
     Tensor& out) {
   (void)ctx;
 
@@ -141,11 +140,15 @@ Tensor& mean_dim_out(
     out_shape[0] = 1;
   }
 
-  int scratch_size = 1;
-  for (int i = 0; i < num_inp_dims; i++) {
-    scratch_size *= inp_shape[i];
+  int inp_shape_max = inp_shape[p_axis[0]];
+  for (int i = 1; i < num_axis_dims; i++) {
+    if (inp_shape[p_axis[i]] > inp_shape_max) {
+      inp_shape_max = inp_shape[p_axis[i]];
+    }
  }
 
+  int scratch_size = in.numel() / inp_shape_max;
+
  executorch::runtime::Result<void*> temp_mem =
      ctx.allocate_temp(scratch_size * sizeof(float));
 
backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp

Lines changed: 8 additions & 7 deletions

@@ -18,9 +18,10 @@
 #include <executorch/kernels/portable/cpu/vec_ops.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-using Tensor = ::executorch::aten::Tensor;
-using ScalarType = ::executorch::aten::ScalarType;
-using IntArrayRef = ::executorch::aten::ArrayRef<int64_t>;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
 
@@ -35,8 +36,8 @@ template <typename CTYPE>
 void layer_norm(
     const Tensor& input,
     IntArrayRef normalized_shape,
-    const ::executorch::aten::optional<Tensor>& weight,
-    const ::executorch::aten::optional<Tensor>& bias,
+    const optional<Tensor>& weight,
+    const optional<Tensor>& bias,
     CTYPE eps,
     Tensor& out,
     Tensor& mean,
@@ -112,8 +113,8 @@ std::tuple<Tensor&, Tensor&, Tensor&> native_layer_norm_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
     IntArrayRef normalized_shape,
-    const ::executorch::aten::optional<Tensor>& weight,
-    const ::executorch::aten::optional<Tensor>& bias,
+    const optional<Tensor>& weight,
+    const optional<Tensor>& bias,
     double eps,
     Tensor& out,
     Tensor& mean_out,
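
The signature cleanup above leaves the math untouched; for reference, a minimal sketch of the standard per-element layer-norm formula the template computes over each normalized slice (the function and parameter names here are illustrative, not the kernel's):

#include <cmath>

// Illustrative per-element layer norm: normalize with the slice's mean
// and variance, stabilized by eps, then apply the optional affine weight
// and bias (taken as 1 and 0 when absent).
float layer_norm_elem(
    float x, float mean, float var, float eps, float weight, float bias) {
  return (x - mean) / std::sqrt(var + eps) * weight + bias;
}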
