diff --git a/.gitmodules b/.gitmodules index 58f2133ed67..afae765e2b8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -66,7 +66,7 @@ url = https://github.com/pybind/pybind11.git [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"] path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3 - url = https://github.com/foss-xtensa/nnlib-FusionG3/ + url = https://github.com/foss-xtensa/nnlib-FusionG3.git [submodule "third-party/ao"] path = third-party/ao url = https://github.com/pytorch/ao.git diff --git a/backends/cadence/aot/functions_fusion_g3.yaml b/backends/cadence/aot/functions_fusion_g3.yaml index f1f934b9701..5ca05544806 100644 --- a/backends/cadence/aot/functions_fusion_g3.yaml +++ b/backends/cadence/aot/functions_fusion_g3.yaml @@ -50,12 +50,12 @@ - op: div.out kernels: - arg_meta: null - kernel_name: torch::executor::div_out + kernel_name: cadence::impl::G3::div_out - op: div.out_mode kernels: - arg_meta: null - kernel_name: torch::executor::div_out_mode + kernel_name: cadence::impl::G3::div_out_mode - op: embedding.out kernels: @@ -71,7 +71,6 @@ kernels: - arg_meta: null kernel_name: cadence::impl::G3::mul_out - - op: mul.Scalar_out kernels: - arg_meta: null @@ -80,7 +79,7 @@ - op: permute_copy.out kernels: - arg_meta: null - kernel_name: torch::executor::permute_copy_out + kernel_name: cadence::impl::G3::permute_copy_out - op: sigmoid.out kernels: @@ -90,7 +89,7 @@ - op: slice_copy.Tensor_out kernels: - arg_meta: null - kernel_name: torch::executor::slice_copy_Tensor_out + kernel_name: cadence::impl::G3::slice_copy_Tensor_out - op: split_with_sizes_copy.out kernels: @@ -100,7 +99,12 @@ - op: sub.out kernels: - arg_meta: null - kernel_name: torch::executor::sub_out + kernel_name: cadence::impl::G3::sub_out + +- op: sub.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::sub_scalar_out - op: view_copy.out kernels: @@ -117,6 +121,16 @@ - arg_meta: null kernel_name: cadence::impl::G3::native_layer_norm_out +- op: mean.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::mean_dim_out + +- op: exp.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::G3::exp_out + # custom ops - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
variants: function diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt index 704b4aa741a..cac16bddc50 100644 --- a/backends/cadence/fusion_g3/operators/CMakeLists.txt +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -36,6 +36,12 @@ set(_aten_ops__srcs "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_sub.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_div.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" @@ -51,6 +57,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index f40fcc973b0..d51fee5338f 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -39,6 +39,7 @@ Tensor& add_out( ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, @@ -62,12 +63,12 @@ Tensor& add_out( torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, out); +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); - // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "add.out"; int kTensorDimensionLimit = 5; @@ -253,6 +254,7 @@ Tensor& add_scalar_out( torch::executor::native::utils::promote_type_with_scalar( a.scalar_type(), b); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, @@ -276,7 +278,7 @@ Tensor& add_scalar_out( executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); - +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp index f0f327c024b..74fd96a2120 100644 --- a/backends/cadence/fusion_g3/operators/op_cat.cpp +++ b/backends/cadence/fusion_g3/operators/op_cat.cpp @@ -6,13 +6,18 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include + #include #include +#include #include #include +using ::executorch::aten::ArrayRef; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -23,7 +28,6 @@ using ::executorch::runtime::KernelRuntimeContext; * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Uint = 23 }; namespace cadence { namespace impl { @@ -32,20 +36,22 @@ namespace native { Tensor& cat_out( KernelRuntimeContext& ctx, - exec_aten::ArrayRef tensors, + ArrayRef tensors, int64_t dim, Tensor& out) { if (dim < 0) { dim += out.dim(); } + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_cat_args(tensors, dim, out), InvalidArgument, out); - int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; Tensor::SizesType expected_out_size[kTensorDimensionLimit]; size_t expected_out_dim = 0; torch::executor::get_cat_out_target_size( @@ -57,6 +63,20 @@ Tensor& cat_out( out, {expected_out_size, expected_out_dim}) == Error::Ok, InvalidArgument, out); +#endif + // Special handling when all inputs are 1D-empty tensors for aten + // consistency In that case, just return an 1D-empty tensor without checking + // dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } const signed char* inp_tensors[tensors.size()]; const int* inp_tensors_shapes[tensors.size()]; @@ -64,7 +84,7 @@ Tensor& cat_out( int inp_shapes_size[tensors.size()]; int temp_sizes[tensors.size()][kTensorDimensionLimit]; - exec_aten::ArrayRef temp_size; + ArrayRef temp_size; for (int i = 0; i < tensors.size(); i++) { inp_tensors[i] = tensors[i].const_data_ptr(); @@ -79,55 +99,23 @@ Tensor& cat_out( signed char* out_data = out.mutable_data_ptr(); - const exec_aten::ArrayRef out_size = out.sizes(); + const ArrayRef out_size = out.sizes(); int out_shapes[kTensorDimensionLimit]; for (int i = 0; i < out_size.size(); i++) // output shapes { out_shapes[i] = out_size[i]; } - if (out.scalar_type() == ScalarType::Int) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(int)); - } else if (out.scalar_type() == ScalarType::Short) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(short)); - } else if (out.scalar_type() == ScalarType::Char) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(char)); - } else if (out.scalar_type() == (ScalarType)Uint) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(int)); - } else if (out.scalar_type() == (ScalarType)Ushort) { - xa_nn_cat( + if ((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_cat, out_data, out_shapes, inp_tensors, @@ -135,32 +123,8 @@ Tensor& cat_out( inp_shapes_size[0], tensors.size(), (int)dim, - 
sizeof(short)); - } else if (out.scalar_type() == ScalarType::Byte) { - xa_nn_cat( - out_data, - out_shapes, - inp_tensors, - inp_tensors_shapes, - inp_shapes_size[0], - tensors.size(), - (int)dim, - sizeof(char)); - + get_element_size(out.scalar_type())); } else { - // Special handling when all inputs are 1D-empty tensors for aten - // consistency In that case, just return an 1D-empty tensor without checking - // dim - bool all_1d_empty = true; - for (size_t i = 0; i < tensors.size(); ++i) { - if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { - all_1d_empty = false; - break; - } - } - if (all_1d_empty) { - return out; - } const size_t outer = executorch::runtime::getLeadingDims(out, dim); const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim); const size_t ninputs = tensors.size(); diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp index ed5b3125ac4..cff50f2a90b 100644 --- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -6,30 +6,32 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include #include #include +#include #include #include -using ::executorch::aten::Scalar; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; using ::executorch::runtime::KernelRuntimeContext; template -using optional = exec_aten::optional; +using optional = ::executorch::aten::optional; /* ScalarType in Executorch do not have support for below data types. * So, creating a placeholder for these data types. Once, ScalarTypes is * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 }; +enum datatype { Bits4u = 21, Bits4 = 22 }; /** * For an input tensor, use the scale and zero_point arguments to quantize it. 
@@ -49,14 +51,13 @@ void check_dequantize_per_tensor_args( int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional& out_dtype, + ::executorch::aten::optional& out_dtype, Tensor& out) { ET_CHECK_MSG( input.scalar_type() == ScalarType::Byte || input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::UInt16 || input.scalar_type() == ScalarType::Short || - input.scalar_type() == (ScalarType)Ushort || input.scalar_type() == (ScalarType)Bits4 || input.scalar_type() == (ScalarType)Bits4u || input.scalar_type() == ScalarType::Int, @@ -85,14 +86,16 @@ void check_dequantize_per_tensor_args( } // namespace /* Local function which calls the kernels based on the input datatype */ -void dequantize_impl( +Tensor& dequantize_impl( + KernelRuntimeContext& ctx, Tensor& out, const Tensor& input, float* scale_data, int* zero_point_data, int* axis, - exec_aten::optional out_dtype) { - const exec_aten::ArrayRef input_size = input.sizes(); + ::executorch::aten::optional out_dtype) { + const ::executorch::aten::ArrayRef input_size = + input.sizes(); int kTensorDimensionLimit = 5; @@ -125,7 +128,10 @@ void dequantize_impl( if (is_asym_dequant) { if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym8u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym8u_f32, out_data, input_data, inp_shape, @@ -135,7 +141,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym8_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym8_f32, out_data, input_data, inp_shape, @@ -143,9 +152,12 @@ void dequantize_impl( axis, zero_point_data, scale_data); - } else if (input.scalar_type() == (ScalarType)Ushort) { + } else if (input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym16u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym16u_f32, out_data, input_data, inp_shape, @@ -155,7 +167,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym16_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym16_f32, out_data, input_data, inp_shape, @@ -165,7 +180,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == (ScalarType)Bits4u) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym4u_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym4u_f32, out_data, input_data, inp_shape, @@ -175,7 +193,10 @@ void dequantize_impl( scale_data); } else if (input.scalar_type() == (ScalarType)Bits4) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_asym4_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_asym4_f32, out_data, input_data, inp_shape, @@ -233,8 +254,9 @@ void dequantize_impl( } } - exec_aten::optional> optional_dim_list{ - exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual dequantization logic // input, out are the input and output tensors @@ -318,28 +340,76 @@ void dequantize_impl( } else { if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym8u_f32( - 
out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym8u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym8_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); - } else if (input.scalar_type() == (ScalarType)Ushort) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym8_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); + } else if (input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym16u_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym16u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym16_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym16_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == (ScalarType)Bits4u) { const uint8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym4u_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym4u_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else if (input.scalar_type() == (ScalarType)Bits4) { const int8_t* input_data = input.const_data_ptr(); - xa_nn_elm_dequantize_sym4_f32( - out_data, input_data, inp_shape, input.dim(), axis, scale_data); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_dequantize_sym4_f32, + out_data, + input_data, + inp_shape, + input.dim(), + axis, + scale_data); } else { if (axis == NULL) { // calculate the dequantized output, cast scale to float to match fbgemm @@ -390,8 +460,9 @@ void dequantize_impl( } } - exec_aten::optional> optional_dim_list{ - exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual dequantization logic // input, out are the input and output tensors @@ -473,6 +544,7 @@ void dequantize_impl( } } } + return out; } /** @@ -485,14 +557,16 @@ void dequantize_impl( * info. 
*/ Tensor& dequantize_per_tensor_out( + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { +#ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( err == torch::executor::Error::Ok, @@ -500,24 +574,28 @@ Tensor& dequantize_per_tensor_out( check_dequantize_per_tensor_args( input, quant_min, quant_max, dtype, out_dtype, out); +#endif float scale_data = (float)scale; int zero_point_data = (int)zero_point; - dequantize_impl(out, input, &scale_data, &zero_point_data, NULL, out_dtype); + dequantize_impl( + context, out, input, &scale_data, &zero_point_data, NULL, out_dtype); return out; } Tensor& dequantize_per_tensor_tensor_args_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_point, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { +#ifdef OP_ARG_CHECK ET_CHECK_MSG( scale.scalar_type() == ScalarType::Double, "Expected scale to be Double tensor received: %" PRId8, @@ -534,8 +612,10 @@ Tensor& dequantize_per_tensor_tensor_args_out( zero_point.numel() == 1, "Exepcted zero_point to only have one element received: %zd", ssize_t(zero_point.numel())); +#endif dequantize_per_tensor_out( + context, input, scale.const_data_ptr()[0], zero_point.const_data_ptr()[0], @@ -549,15 +629,24 @@ Tensor& dequantize_per_tensor_tensor_args_out( } Tensor& dequantize_per_channel_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, - const exec_aten::optional& opt_zero_points, + const ::executorch::aten::optional& opt_zero_points, int64_t axis, int64_t quant_min, int64_t quant_max, ScalarType dtype, - exec_aten::optional out_dtype, + ::executorch::aten::optional out_dtype, Tensor& out) { + if (axis < 0) { + axis += executorch::runtime::nonzero_dim(input); + } + /* if the arguments are passed properly to the operator disable the Macro - + * "OP_ARG_CHECK" if not the case, enable the Macro - "OP_ARG_CHECK", to have + * the checks only in operator level(As there are no checks in kernel). 
+ */ +#ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); // normalize axis @@ -567,10 +656,6 @@ Tensor& dequantize_per_channel_out( ssize_t(axis), ssize_t(input.dim())); - if (axis < 0) { - axis += executorch::runtime::nonzero_dim(input); - } - ET_CHECK_MSG( err == torch::executor::Error::Ok, "Failed to resize out Tensor in dequantize_per_channel_out"); @@ -599,9 +684,9 @@ Tensor& dequantize_per_channel_out( ssize_t(zero_point.numel()), ssize_t(input.size(axis))); } - check_dequantize_per_tensor_args( input, quant_min, quant_max, dtype, out_dtype, out); +#endif int* axis_ptr = (int*)&axis; @@ -622,80 +707,14 @@ Tensor& dequantize_per_channel_out( for (int i = 0; i < scale.numel(); i++) { scale_data[i] = (float)scale_dt[i]; } - dequantize_impl(out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype); + dequantize_impl( + context, out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype); return out; } -Tensor& dequantize_per_channel_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const exec_aten::optional& opt_zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - exec_aten::optional out_dtype, - Tensor& out) { - (void)context; - torch::executor::Error err = resize_tensor(out, input.sizes()); - ET_CHECK_MSG( - err == torch::executor::Error::Ok, - "Failed to resize out Tensor in dequantize_per_channel_out"); - return dequantize_per_channel_out( - input, - scale, - opt_zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - out); -} - -Tensor& dequantize_per_tensor_out( - KernelRuntimeContext& context, - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - // TODO(larryliu): Add a context arg to the real op function and remove this - // wrapper - (void)context; - return dequantize_per_tensor_out( - input, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out.scalar_type(), - out); -} - -Tensor& dequantize_per_tensor_tensor_args_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - exec_aten::optional out_dtype, - Tensor& out) { - // TODO(larryliu): Add a context arg to the real op function and remove this - // wrapper - (void)context; - return dequantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - Tensor& dequantize_per_token_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_points, @@ -711,18 +730,18 @@ Tensor& dequantize_per_token_out( } // This unfortunate change is needed because we compile op_quantize for aten // mode as well - std::array input_sizes; - input_sizes[0] = static_cast(num_channels); + std::array<::executorch::aten::SizesType, 2> input_sizes; + input_sizes[0] = static_cast<::executorch::aten::SizesType>(num_channels); input_sizes[1] = - static_cast(input.size(input.dim() - 1)); + static_cast<::executorch::aten::SizesType>(input.size(input.dim() - 1)); #ifdef USE_ATEN_LIB Tensor reshaped_input = at::from_blob( input.mutable_data_ptr(), input_sizes, at::TensorOptions(input.scalar_type())); #else - std::array input_dim_order{0, 1}; - std::array input_strides; + std::array<::executorch::aten::DimOrderType, 2> input_dim_order{0, 1}; + std::array<::executorch::aten::StridesType, 2> input_strides; 
executorch::runtime::dim_order_to_stride_nocheck( input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); void* input_data = input.mutable_data_ptr(); @@ -743,6 +762,7 @@ Tensor& dequantize_per_token_out( #endif return dequantize_per_channel_out( + context, reshaped_input, scale, zero_points, @@ -754,21 +774,6 @@ Tensor& dequantize_per_token_out( out); } -Tensor& dequantize_per_token_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out) { - (void)context; - return dequantize_per_token_out( - input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, out); -} - } // namespace native } // namespace G3 } // namespace impl diff --git a/backends/cadence/fusion_g3/operators/op_div.cpp b/backends/cadence/fusion_g3/operators/op_div.cpp new file mode 100644 index 00000000000..1461f643a84 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_div.cpp @@ -0,0 +1,674 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +using ::executorch::aten::optional; +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::string_view; +using ::executorch::aten::Tensor; +using ::executorch::runtime::canCast; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +ScalarType get_common_type(ScalarType a_type, ScalarType b_type) { + if (executorch::runtime::isFloatingType(a_type) && + executorch::runtime::isFloatingType(b_type)) { + return executorch::runtime::promoteTypes(a_type, b_type); + } else if (executorch::runtime::isFloatingType(a_type)) { + return a_type; + } else if (executorch::runtime::isFloatingType(b_type)) { + return b_type; + } + return ScalarType::Float; +} + +} // namespace + +Tensor& div_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + +#ifdef OP_ARG_CHECK + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.out"; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_f32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_32x32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_32x32_f32, + out_data, + inp1_data, + inp2_data, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + int mode = 0; + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + mode, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + mode, + out.numel()); + } + } else { + ScalarType common_type = get_common_type(a.scalar_type(), b.scalar_type()); + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name>( + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a / val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16); + }); + } + + return out; +} + +Tensor& div_out_mode( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + optional mode, + Tensor& out) { + if (!mode.has_value()) { + return div_out(ctx, a, b, out); + } + + auto mode_val = mode.value(); + + // Check mode + ET_KERNEL_CHECK( + ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out); + + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + out); + + 
// Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.out_mode"; + + const bool mode_is_trunc = mode_val == "trunc"; + bool div_by_zero_error = false; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + int mode_value = (mode_val == "trunc") ? 1 : 2; + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + mode_value, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_32x32_32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode_value, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_32x32_32, + out_data, + inp1_data, + inp2_data, + mode_value, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + mode_value, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + mode_value, + max_dim); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + mode_value, + out.numel()); + } + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils:: + apply_bitensor_elementwise_fn( + [mode_is_trunc, &div_by_zero_error]( + const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + if (executorch::runtime::is_integral_type< + CTYPE_COMPUTE, + /*includeBool=*/true>::value) { + if (val_b == 0) { + div_by_zero_error = 
true; + return static_cast(0); + } + } + CTYPE_COMPUTE value = val_a / val_b; + if (mode_is_trunc) { + value = std::trunc(value); + } else { + // We established above that the mode is either trunc or + // floor, so it must be floor. + value = torch::executor::native::utils::floor_divide( + val_a, val_b); + } + return value; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + ET_KERNEL_CHECK_MSG( + ctx, + !div_by_zero_error, + InvalidArgument, + out, + "Div mode operation encountered integer division by zero"); + + return out; +} + +Tensor& div_scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.Scalar_out"; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_f32, + out_data, + inp1_data, + inp2_val, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + int mode = 0; + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + mode, + out.numel()); + } else { + ScalarType common_type = + executorch::runtime::isFloatingType(a.scalar_type()) + ? 
a.scalar_type() + : ScalarType::Float; + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +Tensor& div_scalar_mode_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + optional mode, + Tensor& out) { + if (!mode.has_value()) { + return div_scalar_out(ctx, a, b, out); + } + + auto mode_val = mode.value(); + + // Check mode + ET_KERNEL_CHECK( + ctx, mode_val == "trunc" || mode_val == "floor", InvalidArgument, out); + + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); + +#ifdef OP_ARG_CHECK + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + out); + + // Check for intergral division by zero + ET_KERNEL_CHECK_MSG( + ctx, + !(executorch::runtime::isIntegralType(common_type, true) && + torch::executor::native::utils::scalar_to(b) == 0), + InvalidArgument, + out, + "Div mode operation encountered integer division by zero"); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + const bool mode_is_trunc = mode_val == "trunc"; + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "div.Scalar_mode_out"; + + int mode_value = (mode_val == "trunc") ? 
1 : 2; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + int* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + mode_value, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_div_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + mode_value, + out.numel()); + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b, mode_is_trunc](const CTYPE_COMPUTE val_a) { + CTYPE_COMPUTE value = val_a / val_b; + if (mode_is_trunc) { + value = std::trunc(value); + } else { + value = torch::executor::native::utils::floor_divide( + val_a, val_b); + } + return value; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_exp.cpp b/backends/cadence/fusion_g3/operators/op_exp.cpp new file mode 100644 index 00000000000..3021a0d4e8a --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_exp.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_floating_type(out), + InvalidArgument, + out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + executorch::runtime::resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); +#endif + + if (out.scalar_type() == ScalarType::Float) { + float* const out_data = out.mutable_data_ptr(); + const float* const in_data = in.const_data_ptr(); + + XT_KERNEL_CHECK( + ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel()); + + return out; + } else { + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out); + } +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp new file mode 100644 index 00000000000..be866b2f51c --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_mean.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::optional; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +int prepare_data( + const Tensor& in, + Tensor& out, + optional> dim_list, + int* inp_shape, + int* out_shape, + int* p_axis, + int num_inp_dims, + int num_out_dims) { + for (int i = 0; i < num_inp_dims; i++) { + inp_shape[i] = in.size(i); + } + + for (int i = 0; i < num_out_dims; i++) { + out_shape[i] = out.size(i); + } + + int num_axis_dims = 0; + for (const auto& d : dim_list.value()) { + if (d < 0) { + p_axis[num_axis_dims] = num_inp_dims + d; + num_axis_dims++; + } else { + p_axis[num_axis_dims] = d; + num_axis_dims++; + } + } + + return num_axis_dims; +} + +Tensor& mean_dim_out( + KernelRuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + bool keepdim, + optional dtype, + Tensor& out) { + (void)ctx; + +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensor_is_default_dim_order(in), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_reduction_out(in, dim_list, keepdim, out) == + Error::Ok, + InvalidArgument, + out); +#endif + + constexpr int kNnlibMaxDim = 5; + + bool optimized = 1; + + if (out.scalar_type() != ScalarType::Float) + optimized = 0; + + if (in.dim() > kNnlibMaxDim) + optimized = 0; + + if (optimized) { + float* __restrict__ p_out = out.mutable_data_ptr(); + const float* __restrict__ p_inp = + (const float* __restrict__)in.const_data_ptr(); + + int num_elm = in.numel(); + + int num_inp_dims = in.dim(); + int num_out_dims = out.dim(); + + int inp_shape[kNnlibMaxDim]; + int out_shape[kNnlibMaxDim]; + int p_axis[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + p_axis[i] = 1; + } + + int num_axis_dims = prepare_data( + in, + out, + dim_list, + inp_shape, + out_shape, + p_axis, + num_inp_dims, + num_out_dims); + + if (num_axis_dims == num_inp_dims) { + num_out_dims = 1; + out_shape[0] = 1; + } + + int inp_shape_max = inp_shape[p_axis[0]]; + for (int i = 1; i < num_axis_dims; i++) { + if (inp_shape[p_axis[i]] > inp_shape_max) { + inp_shape_max = inp_shape[p_axis[i]]; + } + } + + int scratch_size = in.numel() / inp_shape_max; + + executorch::runtime::Result temp_mem = + ctx.allocate_temp(scratch_size * sizeof(float)); + + void* __restrict__ p_scratch_in = (void* __restrict__)(temp_mem.get()); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_mean_f32_f32, + p_out, + out_shape, + num_out_dims, + p_inp, + inp_shape, + num_inp_dims, + p_axis, + num_axis_dims, + p_scratch_in); + } else { + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES( + out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = + torch::executor::get_reduced_dim_product(in, dim_list); + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = torch::executor:: + map_reduce_over_dim_list( + [](CTYPE_IN v) { 
return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { + return acc + outv; + }, + in, + dim_list, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); + } + }); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 840cb16c7cf..93b4c5a992c 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -6,8 +6,11 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include +#include #include #include #include @@ -34,6 +37,7 @@ Tensor& mul_out( ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK( ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out); @@ -51,6 +55,7 @@ Tensor& mul_out( torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, out); +#endif // Compute Dtype ScalarType compute_type = @@ -58,7 +63,6 @@ Tensor& mul_out( // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "mul.out"; - int kTensorDimensionLimit = 5; int inp1_shape[kTensorDimensionLimit]; @@ -111,13 +115,28 @@ Tensor& mul_out( int* const out_data = out.mutable_data_ptr(); if (a.numel() == 1) { - xa_nn_elm_mul_scalar_32x32_32( - out_data, inp2_data, inp1_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp2_data, + inp1_data[0], + out.numel()); } else if (b.numel() == 1) { - xa_nn_elm_mul_scalar_32x32_32( - out_data, inp1_data, inp2_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); } else if (broadcast) { - xa_nn_elm_mul_broadcast_5D_32x32_32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_broadcast_5D_32x32_32, out_data, out_shape, inp1_data, @@ -126,7 +145,14 @@ Tensor& mul_out( inp2_shape, max_dim); } else { - xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_32x32_32, + out_data, + inp1_data, + inp2_data, + out.numel()); } } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); @@ -134,13 +160,28 @@ Tensor& mul_out( float* const out_data = out.mutable_data_ptr(); if (a.numel() == 1) { - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp2_data, inp1_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp2_data, + inp1_data[0], + out.numel()); } else if (b.numel() == 1) { - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp1_data, inp2_data[0], out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + out.numel()); } else if (broadcast) { - xa_nn_elm_mul_broadcast_5D_f32xf32_f32( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_broadcast_5D_f32xf32_f32, out_data, out_shape, inp1_data, @@ -149,7 +190,14 @@ Tensor& mul_out( inp2_shape, max_dim); } else { - xa_nn_elm_mul_f32xf32_f32(out_data, inp1_data, inp2_data, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + out.numel()); } } else { ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { @@ -181,6 +229,7 @@ 
Tensor& mul_scalar_out( torch::executor::native::utils::promote_type_with_scalar( a.scalar_type(), b); +#ifdef OP_ARG_CHECK // Check Common Dtype ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); @@ -194,29 +243,41 @@ Tensor& mul_scalar_out( // Resize ET_KERNEL_CHECK( ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); - +#endif // Compute Dtype ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "mul.Scalar_out"; - if (compute_type == ScalarType::Int) { const int* const inp1_data = a.const_data_ptr(); int inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); int* const out_data = out.mutable_data_ptr(); - xa_nn_elm_mul_scalar_32x32_32(out_data, inp1_data, inp2_val, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + out.numel()); } else if (compute_type == ScalarType::Float) { const float* const inp1_data = a.const_data_ptr(); float inp2_val; torch::executor::native::utils::extract_scalar(b, &inp2_val); float* const out_data = out.mutable_data_ptr(); - xa_nn_elm_mul_scalar_f32xf32_f32( - out_data, inp1_data, inp2_val, out.numel()); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_mul_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + out.numel()); } else { ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = @@ -232,7 +293,6 @@ Tensor& mul_scalar_out( SAME_AS_COMMON); }); } - return out; } diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp index a5fbe31eee5..9857bbce377 100644 --- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp +++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp @@ -6,16 +6,20 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include + #include #include #include +#include #include #include #include using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -32,8 +36,8 @@ template void layer_norm( const Tensor& input, IntArrayRef normalized_shape, - const exec_aten::optional& weight, - const exec_aten::optional& bias, + const optional& weight, + const optional& bias, CTYPE eps, Tensor& out, Tensor& mean, @@ -109,8 +113,8 @@ std::tuple native_layer_norm_out( KernelRuntimeContext& ctx, const Tensor& input, IntArrayRef normalized_shape, - const exec_aten::optional& weight, - const exec_aten::optional& bias, + const optional& weight, + const optional& bias, double eps, Tensor& out, Tensor& mean_out, @@ -118,7 +122,9 @@ std::tuple native_layer_norm_out( (void)ctx; std::tuple ret_val(out, mean_out, rstd_out); + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_layer_norm_args( @@ -156,7 +162,7 @@ std::tuple native_layer_norm_out( InvalidArgument, ret_val); } - int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; size_t mean_rstd_ndim = 0; torch::executor::get_layer_norm_out_target_size( @@ -181,6 +187,7 @@ std::tuple native_layer_norm_out( rstd_out, {mean_rstd_sizes, mean_rstd_ndim}) == Error::Ok, InvalidArgument, ret_val); +#endif int input_shape[kTensorDimensionLimit]; for (int i = 0; i < input.dim(); i++) { @@ -218,7 +225,10 @@ std::tuple native_layer_norm_out( } } - xa_nn_native_layer_norm_f32_f32( + XT_KERNEL_CHECK( + ctx, + ret_val, + xa_nn_native_layer_norm_f32_f32, out_data, mean_data, rstd_data, diff --git a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp new file mode 100644 index 00000000000..23c2d1e5fbd --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ArrayRef; +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::SizesType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +namespace { + +void increment_coordinate_permuted( + const Tensor& tensor, + size_t* const coordinate, + IntArrayRef dims) { + for (int i = dims.size() - 1; i >= 0; i--) { + size_t d = dims[i] >= 0 ? 
dims[i] : dims[i] + tensor.dim(); + coordinate[d]++; + if (coordinate[d] == tensor.size(d)) { + coordinate[d] = 0; + } else { + return; + } + } +} + +} // namespace + +Tensor& permute_copy_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dims, + Tensor& out) { + (void)ctx; + int kTensorDimensionLimit = 5; + /* if the arguments are passed properly to the operator disable the Macro - + * "OP_ARG_CHECK" if not the case, enable the Macro - "OP_ARG_CHECK", to have + * the checks only in operator level(As there are no checks in kernel). + */ +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_permute_copy_args(in, dims, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + torch::executor::get_permute_copy_out_target_size( + in, dims, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor( + out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); +#endif + + const ArrayRef in_size = in.sizes(); + const ArrayRef out_size = out.sizes(); + + int inp_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for (auto i = 0; i < in_size.size(); i++) { + inp_shape[i] = in_size[i]; + } + + for (auto i = 0; i < out_size.size(); i++) { + out_shape[i] = out_size[i]; + } + + int permute_vec[in.dim()]; + for (int i = 0; i < in.dim(); i++) { + permute_vec[i] = (int)dims[i]; + } + signed char* out_data = out.mutable_data_ptr(); + const signed char* const inp_data = in.const_data_ptr(); + + if (((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) && + (in.dim() <= 5)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_permute, + out_data, + out_shape, + inp_data, + inp_shape, + permute_vec, + in.dim(), + get_element_size(out.scalar_type())); + } else { + const auto in_type = out.scalar_type(); + size_t in_coord[5] = {0}; + size_t trailing_dims_memo[kTensorDimensionLimit]; + executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo); + // in and out must be the same dtype + ET_SWITCH_ALL_TYPES(in_type, ctx, "permute_copy.out", CTYPE, [&] { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + for (size_t i = 0; i < out.numel(); ++i) { + out_data[i] = + in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo( + in, in_coord, trailing_dims_memo)]; + increment_coordinate_permuted(in, in_coord, dims); + } + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp index fc206b67cd6..8237c3c266c 100644 --- a/backends/cadence/fusion_g3/operators/op_quantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp @@ -6,17 +6,18 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include + #include #include #include #include +#include #include #include -using ::executorch::aten::ArrayRef; -using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::Error; @@ -27,7 +28,7 @@ using ::executorch::runtime::KernelRuntimeContext; * updated to have support for below data types, these can be removed and * operator need to be updated accordingly */ -enum datatype { Ushort = 20, Bits4u = 21, Bits4 = 22 }; +enum datatype { Bits4u = 21, Bits4 = 22 }; /** * For an input tensor, use the scale and zero_point arguments to quantize it. @@ -78,9 +79,6 @@ void check_quantize_per_tensor_args( } else if (dtype == ScalarType::Short) { quant_min_lower_bound = std::numeric_limits::min(); quant_max_upper_bound = std::numeric_limits::max(); - } else if (dtype == (ScalarType)Ushort) { - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); } else if (dtype == (ScalarType)Bits4u) { quant_min_lower_bound = std::numeric_limits::min(); quant_max_upper_bound = std::numeric_limits::max(); @@ -137,7 +135,8 @@ T quantize_val( } /* Local function which calls the kernels based on the output datatype */ -void quantize_impl( +Tensor& quantize_impl( + KernelRuntimeContext& ctx, Tensor& out, const Tensor& input, float* scale_data, @@ -145,7 +144,8 @@ void quantize_impl( int* axis, int quant_min, int quant_max) { - const ArrayRef input_size = input.sizes(); + const ::executorch::aten::ArrayRef input_size = + input.sizes(); int kTensorDimensionLimit = 5; @@ -179,7 +179,10 @@ void quantize_impl( if (is_asym_quant) { if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym8u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym8u, out_data, input_data, inp_shape, @@ -191,7 +194,11 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym8( + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym8, out_data, input_data, inp_shape, @@ -201,9 +208,12 @@ void quantize_impl( zero_point_data, quant_min, quant_max); - } else if (out.scalar_type() == (ScalarType)Ushort) { + } else if (out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym16u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym16u, out_data, input_data, inp_shape, @@ -215,7 +225,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym16( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym16, out_data, input_data, inp_shape, @@ -227,7 +240,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4u) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym4u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym4u, out_data, input_data, inp_shape, @@ -239,7 +255,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_asym4( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_asym4, out_data, input_data, inp_shape, @@ -304,8 +323,9 @@ void quantize_impl( } } - optional> optional_dim_list{ - ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + 
optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual quantization logic // input, out are the input and output tensors @@ -373,7 +393,10 @@ void quantize_impl( } else { if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym8u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym8u, out_data, input_data, inp_shape, @@ -384,7 +407,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym8( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym8, out_data, input_data, inp_shape, @@ -393,9 +419,12 @@ void quantize_impl( scale_data, quant_min, quant_max); - } else if (out.scalar_type() == (ScalarType)Ushort) { + } else if (out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym16u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym16u, out_data, input_data, inp_shape, @@ -406,7 +435,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym16( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym16, out_data, input_data, inp_shape, @@ -417,7 +449,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4u) { uint8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym4u( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym4u, out_data, input_data, inp_shape, @@ -428,7 +463,10 @@ void quantize_impl( quant_max); } else if (out.scalar_type() == (ScalarType)Bits4) { int8_t* out_data = out.mutable_data_ptr(); - xa_nn_elm_quantize_f32_sym4( + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_quantize_f32_sym4, out_data, input_data, inp_shape, @@ -490,8 +528,9 @@ void quantize_impl( } } - optional> optional_dim_list{ - ArrayRef{dims, size_t(input.dim() - 1)}}; + ::executorch::aten::optional<::executorch::aten::ArrayRef> + optional_dim_list{::executorch::aten::ArrayRef{ + dims, size_t(input.dim() - 1)}}; // Actual quantization logic // input, out are the input and output tensors @@ -556,6 +595,7 @@ void quantize_impl( #undef SYM_QUANTIZE_IMPL_CHANNEL } } + return out; } // Quantize the input tensor @@ -568,16 +608,18 @@ Tensor& quantize_per_tensor_out( int64_t quant_max, ScalarType dtype, Tensor& out) { +#ifdef OP_ARG_CHECK Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( err == Error::Ok, "Failed to resize out Tensor in quantize_per_tensor_out"); - - // check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); + check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); +#endif float scale_data = (float)scale; int zero_point_data = (int)zero_point; quantize_impl( + context, out, input, &scale_data, @@ -606,6 +648,7 @@ Tensor& quantize_per_tensor_tensor_args_out( context.fail(Error::InvalidArgument); return out; } +#ifdef OP_ARG_CHECK ET_CHECK_MSG( scale.scalar_type() == ScalarType::Double, "Expected scale to be Double tensor received: %" PRId8, @@ -622,6 +665,7 @@ Tensor& quantize_per_tensor_tensor_args_out( zero_point.numel() == 1, "Exepcted zero_point to only have one element received: %zd", ssize_t(zero_point.numel())); +#endif quantize_per_tensor_out( context, @@ -652,6 +696,7 @@ Tensor& quantize_per_tensor_tensor_args_out( } Tensor& quantize_per_channel_out( + KernelRuntimeContext& context, const Tensor& input, 
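    // `context` is threaded down to quantize_impl, presumably so that a
    // failing NNLib kernel can be reported through XT_KERNEL_CHECK instead of
    // having its return code ignored.
    // `scale` / `zero_point` carry one value per slice along `axis`
    // (validated only when OP_ARG_CHECK is enabled).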
const Tensor& scale, const Tensor& zero_point, @@ -660,8 +705,12 @@ Tensor& quantize_per_channel_out( int64_t quant_max, ScalarType dtype, Tensor& out) { - Error err = resize_tensor(out, input.sizes()); + if (axis < 0) { + axis += executorch::runtime::nonzero_dim(input); + } +#ifdef OP_ARG_CHECK + Error err = resize_tensor(out, input.sizes()); // normalize axis ET_CHECK_MSG( executorch::runtime::tensor_has_dim(input, axis), @@ -669,10 +718,6 @@ Tensor& quantize_per_channel_out( ssize_t(axis), ssize_t(input.dim())); - if (axis < 0) { - axis += executorch::runtime::nonzero_dim(input); - } - ET_CHECK_MSG( err == Error::Ok, "Failed to resize out Tensor in quantize_per_channel_out"); @@ -699,7 +744,8 @@ Tensor& quantize_per_channel_out( zero_point.numel(), input.size(axis)); - // check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); + check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); +#endif const double* scale_dt = scale.const_data_ptr(); const int64_t* zero_point_dt = zero_point.const_data_ptr(); @@ -715,6 +761,7 @@ Tensor& quantize_per_channel_out( int* axis_ptr = (int*)&axis; quantize_impl( + context, out, input, scale_data, @@ -722,25 +769,12 @@ Tensor& quantize_per_channel_out( axis_ptr, (int)quant_min, (int)quant_max); - return out; -} -Tensor& quantize_per_channel_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - (void)context; - return quantize_per_channel_out( - input, scale, zero_point, axis, quant_min, quant_max, dtype, out); + return out; } Tensor& quantize_per_token_out( + KernelRuntimeContext& context, const Tensor& input, const Tensor& scale, const Tensor& zero_point, @@ -761,11 +795,11 @@ Tensor& quantize_per_token_out( Tensor reshaped_input = at::from_blob( input.mutable_data_ptr(), sizes, at::TensorOptions(input.scalar_type())); #else - std::array input_dim_order{0, 1}; - std::array input_sizes; + std::array<::executorch::aten::DimOrderType, 2> input_dim_order{0, 1}; + std::array<::executorch::aten::SizesType, 2> input_sizes; input_sizes[0] = num_tokens; input_sizes[1] = input.size(input.dim() - 1); - std::array input_strides; + std::array<::executorch::aten::StridesType, 2> input_strides; executorch::runtime::dim_order_to_stride_nocheck( input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); void* input_data = input.mutable_data_ptr(); @@ -786,21 +820,15 @@ Tensor& quantize_per_token_out( #endif return quantize_per_channel_out( - reshaped_input, scale, zero_point, 0, quant_min, quant_max, dtype, out); -} - -Tensor& quantize_per_token_out( - KernelRuntimeContext& context, - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - (void)context; - return quantize_per_token_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); + context, + reshaped_input, + scale, + zero_point, + 0, + quant_min, + quant_max, + dtype, + out); } } // namespace native diff --git a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp new file mode 100644 index 00000000000..c481cf726b7 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +#include + +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& slice_copy_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + ::executorch::aten::optional start_val, + ::executorch::aten::optional end_val, + int64_t step, + Tensor& out) { + (void)ctx; + + if (dim < 0) { + dim += in.dim(); + } + // If user do not set value to end_val, set end to in.size(dim) (largest + // value available) + int64_t end = end_val.has_value() ? end_val.value() : in.size(dim); + // If user do not set value to start_val, set start to 0 (smallest value + // available) + int64_t start = start_val.has_value() ? start_val.value() : 0; + int64_t length = + torch::executor::adjust_slice_indices(in.size(dim), &start, &end, step); + + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + +#ifdef OP_ARG_CHECK + ET_KERNEL_CHECK( + ctx, + torch::executor::check_slice_copy_args(in, dim, step, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType target_sizes[kTensorDimensionLimit]; + size_t target_ndim = 0; + torch::executor::get_slice_copy_out_target_size( + in, dim, length, target_sizes, &target_ndim); + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, {target_sizes, target_ndim}) == + Error::Ok, + InvalidArgument, + out); +#endif + + const ::executorch::aten::ArrayRef in_size = in.sizes(); + const ::executorch::aten::ArrayRef out_size = out.sizes(); + + int inp_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for (auto i = 0; i < in_size.size(); i++) { + inp_shape[i] = in_size[i]; + } + + for (auto i = 0; i < out_size.size(); i++) { + out_shape[i] = out_size[i]; + } + + signed char* out_data = out.mutable_data_ptr(); + const signed char* const inp_data = in.const_data_ptr(); + + if ((out.scalar_type() == ScalarType::Int) || + (out.scalar_type() == ScalarType::Short) || + (out.scalar_type() == ScalarType::Char) || + (out.scalar_type() == ScalarType::UInt32) || + (out.scalar_type() == ScalarType::UInt16) || + (out.scalar_type() == ScalarType::Byte)) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_slice, + out_data, + out_shape, + inp_data, + inp_shape, + in.dim(), + (int)start, + (int)(end - 1), + (int)step, + (int)dim, + get_element_size(out.scalar_type())); + } else { + torch::executor::compute_slice(in, dim, start, length, step, out); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp index 9f343481508..ee87ebaf5a1 100644 --- 
a/backends/cadence/fusion_g3/operators/op_softmax.cpp +++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp @@ -6,10 +6,13 @@ * LICENSE file in the root directory of this source tree. */ +#include + #include #include +#include #include #include #include @@ -34,6 +37,10 @@ Tensor& _softmax_out( Tensor& out) { (void)ctx; + // Adjust for negative dim + dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; + +#ifdef OP_ARG_CHECK ET_KERNEL_CHECK( ctx, torch::executor::check_softmax_args(in, dim, half_to_float, out), @@ -48,9 +55,7 @@ Tensor& _softmax_out( executorch::runtime::tensors_have_same_dim_order(in, out), InvalidArgument, out); - - // Adjust for negative dim - dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; +#endif int inp_shapes[in.dim()]; const ArrayRef in_size = in.sizes(); @@ -62,7 +67,15 @@ Tensor& _softmax_out( const float* const inp_data = in.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); int axis = dim; - xa_nn_softmax_f32_f32(out_data, inp_data, inp_shapes, in.dim(), &axis); + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_softmax_f32_f32, + out_data, + inp_data, + inp_shapes, + in.dim(), + &axis); } else { ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { const CTYPE* const in_data = in.const_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_sub.cpp b/backends/cadence/fusion_g3/operators/op_sub.cpp new file mode 100644 index 00000000000..91782d2dfff --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_sub.cpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::canCast; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; + +namespace cadence { +namespace impl { +namespace G3 { +namespace native { + +Tensor& sub_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + const Scalar& alpha, + Tensor& out) { + // Common Dtype + ScalarType common_type = + executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); +#ifdef OP_ARG_CHECK + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + + // Check alpha type + ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + canCast(alpha_type, common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, b, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sub.out"; + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + bool broadcast = 0; + + int max_dim = a.dim() > b.dim() ? 
a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + /*find broadcast*/ + for (int i = 0; i < out.dim(); i++) { + if (((inp1_shape[i]) != (out_shape[i])) || + ((inp2_shape[i]) != (out_shape[i]))) { + broadcast = 1; + } + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + if ((compute_type == ScalarType::Int) && (optimized)) { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32x32_32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_32x32_32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_32x32_32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else if ((compute_type == ScalarType::Float) && (optimized)) { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if (b.numel() == 1) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_data[0], + alpha_val, + out.numel()); + } else if (broadcast) { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_broadcast_5D_f32xf32_f32, + out_data, + out_shape, + inp1_data, + inp1_shape, + inp2_data, + inp2_shape, + max_dim, + alpha_val); + } else { + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_f32xf32_f32, + out_data, + inp1_data, + inp2_data, + alpha_val, + out.numel()); + } + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_alpha = + torch::executor::native::utils::scalar_to(alpha); + torch::executor::native::utils:: + apply_bitensor_elementwise_fn( + [val_alpha]( + const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a - val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16); + }); + } + + return out; +} + +Tensor& sub_scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + const Scalar& alpha, + Tensor& out) { + // Common Dtype + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a.scalar_type(), b); +#ifdef OP_ARG_CHECK + ScalarType alpha_type = + torch::executor::native::utils::get_scalar_dtype(alpha); + + // Check alpha type + ET_KERNEL_CHECK(ctx, alpha_type != ScalarType::Bool, InvalidArgument, out); 
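  // The dtype, dim-order and resize checks in this block are compiled only
  // when OP_ARG_CHECK is defined: the NNLib kernels themselves perform no
  // argument validation, so the macro is meant to be enabled (presumably as a
  // build-time definition such as -DOP_ARG_CHECK) whenever the caller cannot
  // guarantee well-formed arguments, and left disabled otherwise.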
+ + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (common_type == out.scalar_type() && canCast(alpha_type, common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, + out); + + // Resize + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out); +#endif + + // Compute Dtype + ScalarType compute_type = + torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sub.Scalar_out"; + + if (compute_type == ScalarType::Int) { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + int* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_32x32_32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else if (compute_type == ScalarType::Float) { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + float* const out_data = out.mutable_data_ptr(); + + XT_KERNEL_CHECK( + ctx, + out, + xa_nn_elm_sub_scalar_f32xf32_f32, + out_data, + inp1_data, + inp2_val, + alpha_val, + out.numel()); + } else { + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = + torch::executor::native::utils::scalar_to(b); + const CTYPE_COMPUTE val_alpha = + torch::executor::native::utils::scalar_to(alpha); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + return val_a - val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes:: + SAME_AS_COMMON); + }); + } + + return out; +} + +} // namespace native +} // namespace G3 +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/operators.h b/backends/cadence/fusion_g3/operators/operators.h index 9d7f7b9c30e..e1c0d08f44a 100644 --- a/backends/cadence/fusion_g3/operators/operators.h +++ b/backends/cadence/fusion_g3/operators/operators.h @@ -16,6 +16,13 @@ namespace impl { namespace G3 { namespace native { +::executorch::aten::Tensor& _softmax_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim, + bool half_to_float, + ::executorch::aten::Tensor& out); + ::executorch::aten::Tensor& add_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& a, @@ -30,6 +37,153 @@ ::executorch::aten::Tensor& add_scalar_out( const ::executorch::aten::Scalar& alpha, ::executorch::aten::Tensor& out); +::executorch::aten::Tensor& cat_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + ::executorch::aten::ArrayRef<::executorch::aten::Tensor> tensors, + int64_t dim, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& dequantize_per_channel_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& scale, + const 
::executorch::aten::optional<::executorch::aten::Tensor>& + opt_zero_points, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::optional<::executorch::aten::ScalarType> out_dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& dequantize_per_tensor_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::optional<::executorch::aten::ScalarType> out_dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_out_mode( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::optional<::executorch::aten::string_view> mode, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& div_scalar_mode_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::optional<::executorch::aten::string_view> mode, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& exp_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mean_dim_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::optional<::executorch::aten::ArrayRef> + dim_list, + bool keepdim, + ::executorch::aten::optional<::executorch::aten::ScalarType> dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mul_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& mul_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + ::executorch::aten::Tensor& out); + +std::tuple< + ::executorch::aten::Tensor&, + ::executorch::aten::Tensor&, + ::executorch::aten::Tensor&> +native_layer_norm_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& input, + ::executorch::aten::IntArrayRef normalized_shape, + const ::executorch::aten::optional<::executorch::aten::Tensor>& weight, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, + double eps, + ::executorch::aten::Tensor& out, + ::executorch::aten::Tensor& mean_out, + ::executorch::aten::Tensor& rstd_out); + +::executorch::aten::Tensor& permute_copy_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + ::executorch::aten::IntArrayRef dims, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& quantize_per_channel_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& scale, + 
const ::executorch::aten::Tensor& zero_point, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& quantize_per_tensor_out( + ::executorch::runtime::KernelRuntimeContext& context, + const ::executorch::aten::Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ::executorch::aten::ScalarType dtype, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& slice_copy_Tensor_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& in, + int64_t dim, + ::executorch::aten::optional start_val, + ::executorch::aten::optional end_val, + int64_t step, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& sub_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Tensor& b, + const ::executorch::aten::Scalar& alpha, + ::executorch::aten::Tensor& out); + +::executorch::aten::Tensor& sub_scalar_out( + ::executorch::runtime::KernelRuntimeContext& ctx, + const ::executorch::aten::Tensor& a, + const ::executorch::aten::Scalar& b, + const ::executorch::aten::Scalar& alpha, + ::executorch::aten::Tensor& out); + } // namespace native } // namespace G3 } // namespace impl diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl index 3e5900e3634..e1e7c9a8491 100644 --- a/backends/cadence/fusion_g3/operators/targets.bzl +++ b/backends/cadence/fusion_g3/operators/targets.bzl @@ -28,6 +28,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: exported_deps = [ ":operators_header", ":xt_macros", + ":xt_utils", ], ) @@ -39,6 +40,12 @@ OPERATORS = [ "native_layer_norm", "quantize", "softmax", + "sub", + "div", + "exp", + "mean", + "slice_copy", + "permute_copy" ] def define_common_targets(): @@ -74,5 +81,17 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "xt_utils", + exported_headers = ["xt_utils.h"], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/kernel:kernel_runtime_context", + ], + ) + for op in OPERATORS: define_operator(op) diff --git a/backends/cadence/fusion_g3/operators/xt_utils.h b/backends/cadence/fusion_g3/operators/xt_utils.h new file mode 100644 index 00000000000..443d68d0609 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/xt_utils.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +using ::executorch::aten::ScalarType; + +inline int get_element_size(ScalarType dtype) { + if ((dtype == ScalarType::Int) || (dtype == ScalarType::UInt32)) { + return sizeof(int); + } else if ((dtype == ScalarType::Short) || (dtype == ScalarType::UInt16)) { + return sizeof(short); + } else if ((dtype == ScalarType::Char) || (dtype == ScalarType::Byte)) { + return sizeof(char); + } + return 0; +}
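For reference, a minimal usage sketch of the new get_element_size() helper (illustrative only, not part of the patch; the include path and the wrapper function are assumptions). The helper maps a ScalarType to its byte width so the byte-oriented NNLib copy kernels (xa_nn_permute, xa_nn_slice) can serve the 1-, 2- and 4-byte dtypes through a single entry point, with 0 signalling "unsupported, take the portable fallback":

// Illustrative sketch under stated assumptions, not part of the patch.
#include <executorch/backends/cadence/fusion_g3/operators/xt_utils.h>  // assumed include path for the new header above

using ::executorch::aten::ScalarType;

// Hypothetical helper mirroring the dtype guards in op_permute_copy.cpp and
// op_slice_copy.cpp: a non-zero element size means the NNLib byte-copy path
// can be used; 0 means the operator should fall back to the portable kernel.
inline bool nnlib_copy_supported(ScalarType dtype, int* element_size) {
  // 4 for Int/UInt32, 2 for Short/UInt16, 1 for Char/Byte, 0 otherwise
  // (widths follow sizeof(int)/sizeof(short)/sizeof(char) on the 32-bit
  // Xtensa targets).
  *element_size = get_element_size(dtype);
  return *element_size != 0;
}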