diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/dequantize_per_tensor.cpp
rename to backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 342c982a076..82fa7502dea 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -56,7 +56,7 @@ int prepare_data(
   return num_axis_dims;
 }
 
-Tensor& mean_dim_out(
+Tensor& mean_out(
     RuntimeContext& ctx,
     const Tensor& in,
     optional<ArrayRef<int64_t>> dim_list,
diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/quantize_per_tensor.cpp
rename to backends/cadence/hifi/operators/op_quantize_per_tensor.cpp
diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/op_quantized_layer_norm.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/quantized_layer_norm.cpp
rename to backends/cadence/hifi/operators/op_quantized_layer_norm.cpp
diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/op_quantized_linear_out.cpp
similarity index 100%
rename from backends/cadence/hifi/operators/quantized_linear_out.cpp
rename to backends/cadence/hifi/operators/op_quantized_linear_out.cpp
diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp
similarity index 98%
rename from backends/cadence/hifi/operators/quantized_relu_out.cpp
rename to backends/cadence/hifi/operators/op_quantized_relu_out.cpp
index d78e555ad1e..0860109f7c1 100644
--- a/backends/cadence/hifi/operators/quantized_relu_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_relu_out.cpp
@@ -45,7 +45,7 @@ void quantized_relu_(
   }
 }
 
-void quantized_relu_out(
+void quantized_relu_per_tensor_out(
     KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_zero_point,
@@ -100,4 +100,4 @@ void quantized_relu_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp
index d8c4a6d2d80..99cd6ad544e 100644
--- a/backends/cadence/hifi/operators/op_remainder.cpp
+++ b/backends/cadence/hifi/operators/op_remainder.cpp
@@ -8,6 +8,7 @@
 
 #include <cmath>
 
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
@@ -15,8 +16,6 @@
 #include <executorch/kernels/portable/cpu/util/math_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-#include "kernels.h"
-
 using executorch::aten::RuntimeContext;
 using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp
index e026afd2c92..5db8ed8425e 100644
--- a/backends/cadence/hifi/operators/op_softmax.cpp
+++ b/backends/cadence/hifi/operators/op_softmax.cpp
@@ -8,11 +8,11 @@
 
 #include <cmath>
 
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"
 
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
@@ -24,7 +24,7 @@ namespace impl {
 namespace HiFi {
 namespace native {
 
-Tensor& softmax_out(
+Tensor& _softmax_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     int64_t dim,
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index 6c671a5f24a..1c2b481410d 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -1,243 +1,70 @@
 load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
-def define_common_targets():
-    """Defines targets that should be shared between fbcode and xplat.
-
-    The directory containing this targets.bzl file should also contain both
-    TARGETS and BUCK files that call this function.
-    """
-
-    # Define build targets for all operators registered in the tables above.
 
-    runtime.cxx_library(
-        name = "quantize_per_tensor",
-        srcs = [
-            "quantize_per_tensor.cpp"
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+def define_operator(name: str, deps: list[str] | None = None) -> None:
+    op_name = "op_{}".format(name)  # target name, and stem of the source file in srcs below
+
-    runtime.cxx_library(
-        name = "dequantize_per_tensor",
-        srcs = [
-            "dequantize_per_tensor.cpp"
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    # Deps every operator target gets; appended after any caller-supplied deps.
+    common_deps = [
+        "//executorch/kernels/portable/cpu/util:all_deps",
+        "//executorch/kernels/portable/cpu/pattern:all_deps",
+        "//executorch/runtime/kernel:kernel_includes",
+        "//executorch/kernels/portable/cpu:scalar_utils",
+        "//executorch/backends/cadence/hifi/kernels:kernels",
+        "//executorch/kernels/portable/cpu/util:dtype_util",
+        "//executorch/kernels/portable/cpu/util:elementwise_util",
+        "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+    ]
+    if deps == None:  # normalize so the concatenation with common_deps below works
+        deps = []
 
     runtime.cxx_library(
-        name = "quantized_layer_norm",
-        srcs = [
-            "quantized_layer_norm.cpp"
-        ],
-        exported_headers = ["operators.h"],
+        name = op_name,
+        srcs = [op_name + ".cpp"],
         platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
         visibility = [
             "//executorch/backends/cadence/...",
             "@EXECUTORCH_CLIENTS",
         ],
-    )
-
-    runtime.cxx_library(
-        name = "quantized_linear_out",
-        srcs = [
-            "quantized_linear_out.cpp"
-        ],
+        deps = deps + common_deps,
         exported_headers = ["operators.h"],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
-
-    runtime.cxx_library(
-        name = "op_add",
-        srcs = [
-            "op_add.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions",
-            "//executorch/kernels/portable/cpu/util:dtype_util",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
-
-
-    runtime.cxx_library(
-        name = "op_mul",
-        srcs = [
-            "op_mul.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/kernels/portable/cpu/util:dtype_util",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
-
-    runtime.cxx_library(
-        name = "op_sub",
-        srcs = [
-            "op_sub.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/kernels/portable/cpu/util:dtype_util",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
     )
 
-    runtime.cxx_library(
-        name = "op_div",
-        srcs = [
-            "op_div.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/kernels/portable/cpu/util:dtype_util",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+OPERATORS = [  # operator names without the "op_" prefix; define_operator() adds it
+    "add",
+    "atan2",
+    "cat",
+    "clamp",
+    "dequantize_per_tensor",
+    "div",
+    "full",
+    "maximum",
+    "mean",
+    "minimum",
+    "mul",
+    "permute_copy",
+    "pow",
+    "quantize_per_tensor",
+    "quantized_layer_norm",
+    "quantized_linear_out",
+    "quantized_relu_out",
+    "remainder",
+    "rsqrt",
+    "sigmoid",
+    "softmax",
+    "sub",
+    "tanh",
+    "where"
+]
 
-    runtime.cxx_library(
-        name = "op_sigmoid",
-        srcs = [
-            "op_sigmoid.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/kernels/portable/cpu/util:dtype_util",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
 
-    runtime.cxx_library(
-        name = "op_tanh",
-        srcs = [
-            "op_tanh.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
 
-    
-    runtime.cxx_library(
-        name = "op_where",
-        srcs = [
-            "op_where.cpp",
-        ],
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/backends/cadence/hifi/kernels:kernels",
-            "//executorch/kernels/portable/cpu/util:elementwise_util",
-            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
-        ],
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    # Define one build target per entry in OPERATORS via define_operator().
+    for op in OPERATORS:
+        define_operator(op)
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
new file mode 100644
index 00000000000..b069035dc90
--- /dev/null
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
@@ -0,0 +1,232 @@
+/*******************************************************************************
+* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to use this Software with Cadence processor cores only and
+* not with any other processors and platforms, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+******************************************************************************/
+#include "xa_nnlib_common.h"
+
+#include <string.h>
+
+/*
+ * Transposes a WORD8 tensor: output dim i is input dim p_permute_vec[i].
+ * Currently only supports up to 5D input tensors; 1/2/3/4 D input tensors
+ * are scaled up to 5D internally. For example, 2x3 -> 1x1x1x2x3.
+ * Returns 0 on success; every argument check passes -1 as its error code.
+ */
+WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out
+                    ,const WORD32 *const p_out_shape
+                    ,const WORD8 * __restrict__ p_inp
+                    ,const WORD32 *const p_inp_shape
+                    ,const WORD32 * __restrict__ p_permute_vec
+                    ,WORD32 num_out_dims
+                    ,WORD32 num_inp_dims)
+{
+  /* NULL pointer checks */
+  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
+  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
+
+  /* Invalid input checks */
+  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
+  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
+
+  int itr = 0;
+  for(itr=0; itr < num_inp_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
+  }
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
+  }
+
+  /* Each permute value indexes p_inp_shape here and inp_stride below, so it
+   * must be a valid input dim; the output shape must match the permutation */
+  for(itr=0; itr < num_out_dims; itr++)
+  {
+    int perm_idx = p_permute_vec[itr];
+    XA_NNLIB_ARG_CHK_COND(((perm_idx < 0) || (perm_idx >= num_inp_dims)), -1);
+    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] != p_inp_shape[perm_idx]), -1);
+  }
+
+  /* Pointer alignment checks */
+  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
+  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
+
+  /* Shift all size-1 output dims towards the outer (leading) positions */
+  int eff_output_shape[5];
+  int eff_permute_vec[5];
+
+  for(int i = 0; i < num_out_dims; i++)
+  {
+    eff_output_shape[i] = p_out_shape[i];
+    eff_permute_vec[i] = p_permute_vec[i];
+  }
+  /* Swap each size-1 dim with the nearest non-1 dim before it; permute_vec follows */
+  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
+  while(one_i > 0 && non_one_i >=0){
+    while(one_i > 0 && eff_output_shape[one_i]!=1){
+      one_i--;
+    }
+    non_one_i = one_i;
+    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
+    {
+      non_one_i--;
+    }
+    if(one_i > 0 && non_one_i >=0){
+      int temp;
+      /*swap output_shape*/
+      {
+        temp = eff_output_shape[one_i];
+        eff_output_shape[one_i] = eff_output_shape[non_one_i];
+        eff_output_shape[non_one_i] = temp;
+      }
+      /*swap permute_vec*/
+      {
+        temp = eff_permute_vec[one_i];
+        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
+        eff_permute_vec[non_one_i] = temp;
+      }
+
+    }
+  }
+
+
+  /* Promoting lesser dim tensors to 5D tensors.
+   * Also updating the permute_vec and shapes as needed for optimization */
+  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
+  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
+
+  /* Check if any inner inp dimension is same in the output */
+  int last_dim_same = 1, last_n_same_dim = 0;
+  itr = num_inp_dims - 1;
+  while(itr >= 0)
+  {
+    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
+    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
+    itr--;
+  }
+  /* Fill the 5D shapes from the innermost dim; unchanged inner dims merge into slot 4 */
+  int dims_added = 5 - num_inp_dims;
+  itr = num_inp_dims - 1;
+  int same_count = last_n_same_dim;
+  int count = 4;
+  while(itr >= 0)
+  {
+    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
+    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
+    same_count--;
+    itr--;
+    count = (same_count > 0) ? count : count - 1;
+  }
+  /* Renumber the permute values for the 5D (and possibly merged) layout */
+  itr = num_inp_dims - 1;
+  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
+  count = 4;
+  while(itr >= 0)
+  {
+    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
+    same_count--;
+    itr--;
+    count--;
+  }
+
+  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
+  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
+  int inp_stride[5];
+
+  out_dim0 = p_5D_out_shape[0];
+  out_dim1 = p_5D_out_shape[1];
+  out_dim2 = p_5D_out_shape[2];
+  out_dim3 = p_5D_out_shape[3];
+  out_dim4 = p_5D_out_shape[4];
+
+  inp_dim1 = p_5D_inp_shape[1];
+  inp_dim2 = p_5D_inp_shape[2];
+  inp_dim3 = p_5D_inp_shape[3];
+  inp_dim4 = p_5D_inp_shape[4];
+
+  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
+  inp_stride[2] = inp_dim3*inp_dim4;
+  inp_stride[3] = inp_dim4;
+  inp_stride[4] = 1;
+  /* Inner dims unchanged: each innermost run is copied with one memcpy of out_dim4 bytes */
+  if(last_n_same_dim)
+  {
+    int itr0, itr1, itr2, itr3;
+    WORD8 *p_inp0 = (WORD8*)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+#pragma loop_count min=1
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+#pragma loop_count min=1
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+#pragma loop_count min=1
+          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
+          {
+            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+            memcpy(p_out, p_inp4, out_dim4);
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    int itr0, itr1, itr2, itr3, itr4;
+    WORD8 *p_inp0 = (WORD8*)p_inp;
+    for(itr0 = 0; itr0 < out_dim0; itr0++)
+    {
+      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
+      for(itr1 = 0; itr1 < out_dim1; itr1++)
+      {
+        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
+        for(itr2 = 0; itr2 < out_dim2; itr2++)
+        {
+          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
+          for(itr3 = 0; itr3 < out_dim3; itr3++)
+          {
+            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
+            for(itr4 = 0; itr4 < out_dim4; itr4++)
+            {
+              WORD8 d0 = *(p_inp4);
+              p_inp4 += inp_stride[p_5D_permute_vec[4]];
+              *p_out++ = d0;
+
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}