diff --git a/backends/cadence/common/xt_macros.h b/backends/cadence/common/xt_macros.h
new file mode 100644
index 00000000000..0d1ee414082
--- /dev/null
+++ b/backends/cadence/common/xt_macros.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#define XT_KERNEL_CHECK(ctx, out, kernel, ...)                    \
+  {                                                               \
+    const auto ret = kernel(__VA_ARGS__);                         \
+    ET_KERNEL_CHECK_MSG(                                          \
+        ctx,                                                      \
+        ret == 0,                                                 \
+        InvalidArgument,                                          \
+        out,                                                      \
+        "Failed to run kernel: " #kernel "(" #__VA_ARGS__ ")");   \
+  }
diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index f6de58e9ac7..b78cc33890b 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 3ad958b04a6..0c83ebaf0ad 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -13,7 +13,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_clamp.cpp b/backends/cadence/fusion_g3/operators/op_clamp.cpp
index ffae66af0ef..8eed6b681c2 100644
--- a/backends/cadence/fusion_g3/operators/op_clamp.cpp
+++ b/backends/cadence/fusion_g3/operators/op_clamp.cpp
@@ -15,7 +15,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
index 27c374e1a1c..537e3f04ae0 100644
--- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -14,7 +14,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_div.cpp b/backends/cadence/fusion_g3/operators/op_div.cpp
index fb5410a184e..62ebf303ebd 100644
--- a/backends/cadence/fusion_g3/operators/op_div.cpp
+++ b/backends/cadence/fusion_g3/operators/op_div.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_exp.cpp b/backends/cadence/fusion_g3/operators/op_exp.cpp
index 84f5670b320..51d53067668 100644
--- a/backends/cadence/fusion_g3/operators/op_exp.cpp
+++ b/backends/cadence/fusion_g3/operators/op_exp.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_hardtanh.cpp b/backends/cadence/fusion_g3/operators/op_hardtanh.cpp
index 21570d9cb54..b930098fb24 100644
--- a/backends/cadence/fusion_g3/operators/op_hardtanh.cpp
+++ b/backends/cadence/fusion_g3/operators/op_hardtanh.cpp
@@ -11,7 +11,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_lt.cpp b/backends/cadence/fusion_g3/operators/op_lt.cpp
index 2aafeb09ddd..850552f1d3b 100644
--- a/backends/cadence/fusion_g3/operators/op_lt.cpp
+++ b/backends/cadence/fusion_g3/operators/op_lt.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 
 using ::executorch::aten::Scalar;
diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp
index e18a21b4e0c..cefd45f6ef8 100644
--- a/backends/cadence/fusion_g3/operators/op_mean.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mean.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
index ea78ea11512..a4a230a374f 100644
--- a/backends/cadence/fusion_g3/operators/op_mul.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
index 9ae0a974df9..aa25cec9230 100644
--- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
+++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
@@ -13,7 +13,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
index a8195b8bacd..5b1d079f92e 100644
--- a/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
+++ b/backends/cadence/fusion_g3/operators/op_permute_copy.cpp
@@ -11,7 +11,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 670e6dcb358..26f90ddf5d1 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -14,7 +14,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_rsqrt.cpp b/backends/cadence/fusion_g3/operators/op_rsqrt.cpp
index 0988fe946e3..a9017397687 100644
--- a/backends/cadence/fusion_g3/operators/op_rsqrt.cpp
+++ b/backends/cadence/fusion_g3/operators/op_rsqrt.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_sigmoid.cpp b/backends/cadence/fusion_g3/operators/op_sigmoid.cpp
index 08dc735a8af..0ded70926eb 100644
--- a/backends/cadence/fusion_g3/operators/op_sigmoid.cpp
+++ b/backends/cadence/fusion_g3/operators/op_sigmoid.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
index 2bb42f49169..a97f9beb0c7 100644
--- a/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
+++ b/backends/cadence/fusion_g3/operators/op_slice_copy.cpp
@@ -13,7 +13,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp
index 37a0f227193..1faf41c94a8 100644
--- a/backends/cadence/fusion_g3/operators/op_softmax.cpp
+++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_sqrt.cpp b/backends/cadence/fusion_g3/operators/op_sqrt.cpp
index cf68b95e7cf..584d94d78a1 100644
--- a/backends/cadence/fusion_g3/operators/op_sqrt.cpp
+++ b/backends/cadence/fusion_g3/operators/op_sqrt.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_sub.cpp b/backends/cadence/fusion_g3/operators/op_sub.cpp
index b90b2fa2ed5..0b5bee9a651 100644
--- a/backends/cadence/fusion_g3/operators/op_sub.cpp
+++ b/backends/cadence/fusion_g3/operators/op_sub.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_tanh.cpp b/backends/cadence/fusion_g3/operators/op_tanh.cpp
index 5015995e925..9686dc7caa9 100644
--- a/backends/cadence/fusion_g3/operators/op_tanh.cpp
+++ b/backends/cadence/fusion_g3/operators/op_tanh.cpp
@@ -12,7 +12,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp b/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp
index d27c82b4ff3..4bff24cbfe5 100644
--- a/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp
+++ b/backends/cadence/fusion_g3/operators/op_transpose_copy.cpp
@@ -11,7 +11,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/op_where.cpp b/backends/cadence/fusion_g3/operators/op_where.cpp
index 8d92be32419..4351e8bd684 100644
--- a/backends/cadence/fusion_g3/operators/op_where.cpp
+++ b/backends/cadence/fusion_g3/operators/op_where.cpp
@@ -10,7 +10,7 @@
 
 #include
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/common/xt_macros.h>
 #include
 #include
diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl
index bc0a01b4fe8..dd04bd1223b 100644
--- a/backends/cadence/fusion_g3/operators/targets.bzl
+++ b/backends/cadence/fusion_g3/operators/targets.bzl
@@ -10,8 +10,11 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         "//executorch/kernels/portable/cpu/pattern:all_deps",
         "//executorch/runtime/kernel:kernel_includes",
         "//executorch/kernels/portable/cpu:scalar_utils",
+        "//executorch/backends/cadence/common:xt_macros",
         "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib_common",
         "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib",
+        ":operators_header",
+        ":xt_utils",
     ]
     if deps == None:
         deps = []
@@ -26,11 +29,6 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_deps = [
-            ":operators_header",
-            ":xt_macros",
-            ":xt_utils",
-        ],
     )
 
 OPERATORS = [
@@ -79,18 +77,6 @@ def define_common_targets():
         ],
     )
 
-    runtime.cxx_library(
-        name = "xt_macros",
-        exported_headers = ["xt_macros.h"],
-        visibility = [
-            "//executorch/backends/cadence/...",
-        ],
-        exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/kernel:kernel_runtime_context",
-        ],
-    )
-
     runtime.cxx_library(
         name = "xt_utils",
         exported_headers = ["xt_utils.h"],
diff --git a/backends/cadence/fusion_g3/operators/xt_macros.h b/backends/cadence/fusion_g3/operators/xt_macros.h
deleted file mode 100644
index 4ab99380a2d..00000000000
--- a/backends/cadence/fusion_g3/operators/xt_macros.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include
-
-#define XT_KERNEL_CHECK(ctx, out, kernel, ...)                  \
-  const auto ret = kernel(__VA_ARGS__);                         \
-  ET_KERNEL_CHECK_MSG(                                          \
-      ctx,                                                      \
-      ret == 0,                                                 \
-      InvalidArgument,                                          \
-      out,                                                      \
-      "Failed to run kernel: " #kernel "(" #__VA_ARGS__ ")");
diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index f51fddf31db..445cf3d9f2b 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -16,6 +16,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
@@ -184,10 +186,25 @@ Tensor& add_out(
     for (int i = 0; i < b.dim(); i++)
       inp2_shape[i + off_b] = b.size(i);
 
-    xa_nn_elm_add_broadcast_4D_f32xf32_f32(
-        out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_add_broadcast_4D_f32xf32_f32,
+        out_data,
+        out_shape,
+        a_data,
+        inp1_shape,
+        b_data,
+        inp2_shape);
   } else {
-    xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_add_f32xf32_f32,
+        out_data,
+        a_data,
+        b_data,
+        out.numel());
   }
 
   return out;
diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp
index cd412bc5c39..1546c1e3a7f 100644
--- a/backends/cadence/hifi/operators/op_atan2.cpp
+++ b/backends/cadence/hifi/operators/op_atan2.cpp
@@ -12,6 +12,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using executorch::runtime::isFloatingType;
@@ -181,7 +183,15 @@ Tensor& atan2_out(
     for (int i = 0; i < b_dim; i++)
       p_inp1_shape[i] = b.size(i);
 
-    xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
 
     FLOAT32* __restrict__ p_out =
         (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
diff --git a/backends/cadence/hifi/operators/op_bitwise_and.cpp b/backends/cadence/hifi/operators/op_bitwise_and.cpp
index 85db3b164d7..82b29b8bcd1 100644
--- a/backends/cadence/hifi/operators/op_bitwise_and.cpp
+++ b/backends/cadence/hifi/operators/op_bitwise_and.cpp
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using exec_aten::Scalar;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
@@ -96,14 +98,37 @@ Tensor& bitwise_and_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
-
-    xa_nn_broadcast_8_8(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
+
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr2,
+        p_out_shape,
+        pin2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr2;
 
-    xa_nn_elm_logicaland_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicaland_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (a_is_broadcasted && !b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -124,11 +149,26 @@ Tensor& bitwise_and_Tensor_out(
     for (int i = 0; i < a_dim; i++)
       p_inp1_shape[i] = a.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicaland_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicaland_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (!a_is_broadcasted && b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -149,11 +189,26 @@ Tensor& bitwise_and_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pinp2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pinp2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicaland_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicaland_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else {
     const WORD8* __restrict__ p_inp1 =
         (const WORD8* __restrict__)a.const_data_ptr();
diff --git a/backends/cadence/hifi/operators/op_bitwise_or.cpp b/backends/cadence/hifi/operators/op_bitwise_or.cpp
index 3b717620202..9a9722aa6a0 100644
--- a/backends/cadence/hifi/operators/op_bitwise_or.cpp
+++ b/backends/cadence/hifi/operators/op_bitwise_or.cpp
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using exec_aten::Scalar;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
@@ -96,14 +98,37 @@ Tensor& bitwise_or_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
-
-    xa_nn_broadcast_8_8(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
+
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr2,
+        p_out_shape,
+        pin2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr2;
 
-    xa_nn_elm_logicalor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (a_is_broadcasted && !b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -124,11 +149,26 @@ Tensor& bitwise_or_Tensor_out(
     for (int i = 0; i < a_dim; i++)
       p_inp1_shape[i] = a.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicalor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (!a_is_broadcasted && b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -149,11 +189,26 @@ Tensor& bitwise_or_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pinp2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pinp2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicalor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else {
     const WORD8* __restrict__ p_inp1 =
         (const WORD8* __restrict__)a.const_data_ptr();
diff --git a/backends/cadence/hifi/operators/op_bitwise_xor.cpp b/backends/cadence/hifi/operators/op_bitwise_xor.cpp
index d71045038e7..66b9e8cc7fe 100644
--- a/backends/cadence/hifi/operators/op_bitwise_xor.cpp
+++ b/backends/cadence/hifi/operators/op_bitwise_xor.cpp
@@ -14,6 +14,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using exec_aten::Scalar;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
@@ -96,14 +98,37 @@ Tensor& bitwise_xor_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
-
-    xa_nn_broadcast_8_8(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
+
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr2,
+        p_out_shape,
+        pin2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr2;
 
-    xa_nn_elm_logicalxor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalxor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (a_is_broadcasted && !b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -124,11 +149,26 @@ Tensor& bitwise_xor_Tensor_out(
     for (int i = 0; i < a_dim; i++)
       p_inp1_shape[i] = a.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp1 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicalxor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalxor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else if (!a_is_broadcasted && b_is_broadcasted) {
     WORD8* __restrict__ ptr1 =
         (WORD8* __restrict__)kernels::allocate_temp_memory(ctx, num_elm);
@@ -149,11 +189,26 @@ Tensor& bitwise_xor_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_8_8(ptr1, p_out_shape, pinp2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_8_8,
+        ptr1,
+        p_out_shape,
+        pinp2,
+        p_inp2_shape,
+        out_dim);
 
     const WORD8* __restrict__ p_inp2 = (const WORD8* __restrict__)ptr1;
 
-    xa_nn_elm_logicalxor_boolxbool_bool(p_out, p_inp1, p_inp2, num_elm);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_logicalxor_boolxbool_bool,
+        p_out,
+        p_inp1,
+        p_inp2,
+        num_elm);
   } else {
     const WORD8* __restrict__ p_inp1 =
         (const WORD8* __restrict__)a.const_data_ptr();
diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp
index dd9bcff8a0c..e3d5c8914a4 100644
--- a/backends/cadence/hifi/operators/op_clamp.cpp
+++ b/backends/cadence/hifi/operators/op_clamp.cpp
@@ -20,6 +20,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using executorch::aten::RuntimeContext;
 using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
@@ -247,8 +249,15 @@ Tensor& clamp_Tensor_out(
         ctx, p_scratch != nullptr, MemoryAllocationFailed, out);
 
     const FLOAT32* p_brd_cond = (const FLOAT32*)p_scratch;
-    xa_nn_broadcast_32_32(
-        (WORD32*)p_brd_cond, out_shape, (WORD32*)inp_data, inp_shape, 4);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        (WORD32*)p_brd_cond,
+        out_shape,
+        (WORD32*)inp_data,
+        inp_shape,
+        4);
 
     for (int i = 0; i < 4; i++) {
       inp_shape[i] = out_shape[i];
diff --git a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
index 30ce938e24d..c091d216556 100644
--- a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
+++ b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
@@ -10,6 +10,8 @@
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 namespace impl {
 namespace HiFi {
 namespace native {
@@ -24,8 +26,8 @@ void dequantize_per_tensor_out(
     const Tensor& input,
     double scale,
     int64_t zero_point,
-    __ET_UNUSED int64_t quant_min,
-    __ET_UNUSED int64_t quant_max,
+    ET_UNUSED int64_t quant_min,
+    ET_UNUSED int64_t quant_max,
     ScalarType dtype,
     Tensor& out) {
   float* out_data = out.mutable_data_ptr<float>();
@@ -35,8 +37,15 @@ void dequantize_per_tensor_out(
     dequantize(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    xa_nn_elm_dequantize_asym8s_f32(
-        out_data, input_data, zero_point, scale, numel);
+    XT_KERNEL_CHECK(
+        ctx,
+        ,
+        xa_nn_elm_dequantize_asym8s_f32,
+        out_data,
+        input_data,
+        zero_point,
+        scale,
+        numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
     dequantize(out_data, input_data, scale, zero_point, numel);
diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp
index 65bb0fba56f..e5b31cc7731 100644
--- a/backends/cadence/hifi/operators/op_pow.cpp
+++ b/backends/cadence/hifi/operators/op_pow.cpp
@@ -9,13 +9,14 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
 #include
 #include
 
+#include <executorch/backends/cadence/common/xt_macros.h>
+
 using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
@@ -121,9 +122,25 @@ Tensor& pow_Tensor_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp2_shape[i] = b.size(i);
 
-    xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
-
-    xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
+
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        ptr2,
+        p_out_shape,
+        pin2,
+        p_inp2_shape,
+        out_dim);
 
     FLOAT32* __restrict__ p_out =
         (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
@@ -150,8 +167,15 @@ Tensor& pow_Tensor_Tensor_out(
     for (int i = 0; i < a_dim; i++)
       p_inp1_shape[i] = a.size(i);
 
-    xa_nn_broadcast_32_32(
-        (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        (WORD32*)ptr1,
+        p_out_shape,
+        (WORD32*)pin1,
+        p_inp1_shape,
+        out_dim);
 
     FLOAT32* __restrict__ p_out =
         (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
@@ -179,7 +203,15 @@ Tensor& pow_Tensor_Tensor_out(
     for (int i = 0; i < b_dim; i++)
       p_inp1_shape[i] = b.size(i);
 
-    xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim);
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_broadcast_32_32,
+        ptr1,
+        p_out_shape,
+        pin1,
+        p_inp1_shape,
+        out_dim);
 
     FLOAT32* __restrict__ p_out =
         (FLOAT32* __restrict__)out.mutable_data_ptr<float>();
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index 1f9814c4a4e..a25dfd1bcbc 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -16,7 +16,8 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         "//executorch/kernels/portable/cpu/util:elementwise_util",
         "//executorch/kernels/portable/cpu/pattern:bitwise_op",
        "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions",
-        "//executorch/kernels/portable/cpu/pattern:comparison_op"
+        "//executorch/kernels/portable/cpu/pattern:comparison_op",
+        "//executorch/backends/cadence/common:xt_macros"
     ]
     if deps == None:
         deps = []
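
Note for reviewers: below is a minimal sketch of how the relocated XT_KERNEL_CHECK macro is meant to be used by an operator. The surrounding function, tensor names, and the kernel_includes header are illustrative assumptions and are not part of this patch; the nnlib kernel xa_nn_elm_add_f32xf32_f32 and its argument order are taken from op_add.cpp above.

    #include <executorch/backends/cadence/common/xt_macros.h>
    #include <executorch/runtime/kernel/kernel_includes.h>

    using executorch::aten::Tensor;
    using executorch::runtime::KernelRuntimeContext;

    // Hypothetical float add operator; only the XT_KERNEL_CHECK usage is the point.
    Tensor& example_add_out(
        KernelRuntimeContext& ctx,
        const Tensor& a,
        const Tensor& b,
        Tensor& out) {
      const float* a_data = a.const_data_ptr<float>();
      const float* b_data = b.const_data_ptr<float>();
      float* out_data = out.mutable_data_ptr<float>();

      // Expands to: call the kernel, then ET_KERNEL_CHECK_MSG(ctx, ret == 0,
      // InvalidArgument, out, ...), i.e. on a non-zero nnlib return code the
      // macro logs the failed kernel name and arguments and returns `out` early.
      XT_KERNEL_CHECK(
          ctx, out, xa_nn_elm_add_f32xf32_f32, out_data, a_data, b_data, out.numel());

      return out;
    }

The braces added around the macro body in the new backends/cadence/common/xt_macros.h keep `ret` scoped to one expansion, so (unlike the deleted fusion_g3 version) the macro can be used more than once in the same block, as the HiFi bitwise and pow operators above now do.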