diff --git a/include/rppdefs.h b/include/rppdefs.h index f7cd7a711..7d49c457d 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -1045,6 +1045,7 @@ typedef struct Rpp64u* dstBatchIndex; Rpp32u* inc; Rpp32u* dstInc; + hipMemRpp32u scratchBuf; } memGPU; /*! \brief RPP HIP-HOST memory management diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 3cb49a82b..7c2d3318b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HOST backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also has toggle variant(NHWC->NCHW) support for 3D. + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HIP backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. 
+ * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 19121957b..6c327fdd6 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -2537,6 +2537,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x) return _mm_cvtss_f32(sum); } +/* Computes inverse square root */ +inline Rpp32f rpp_rsqrt_ps(Rpp32f x) +{ + __m128 X = _mm_set_ss(x); + __m128 tmp = _mm_rsqrt_ss(X); + Rpp32f y = _mm_cvtss_f32(tmp); + return y * (1.5f - x * 0.5f * y * y); +} + +/* Compute inverse square root */ +/* SSE matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul) +{ + Rpp64s i = 0; + __m128 rdivx4 = _mm_set1_ps(rdiv); + __m128 mulx4 = _mm_set1_ps(mul * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m128 epsx4 = _mm_set1_ps(eps); + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + x = _mm_add_ps(x, epsx4); + __m128 y = _mm_rsqrt_ps(x); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + __m128 mask = _mm_cmpneq_ps(x, xmm_p0); + __m128 y = _mm_rsqrt_ps(x); + y = _mm_and_ps(y, mask); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? 
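+ // scalar tail: same single Newton-Raphson refinement as the vectorized body above,
+ // y = y * (1.5f - 0.5f * x * y * y), performed inside rpp_rsqrt_ps; zero inputs map to 0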
rpp_rsqrt_ps(x) * mul : 0; + } + } +} + +/* Compute inverse square root */ +/* AVX2 matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale) +{ + Rpp32s i = 0; + __m256 rdivx8 = _mm256_set1_ps(rdiv); + __m256 mulx8 = _mm256_set1_ps(scale * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m256 epsx8 = _mm256_set1_ps(eps); + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + x = _mm256_add_ps(x, epsx8); + __m256 y = _mm256_rsqrt_ps(x); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ); + __m256 y = _mm256_rsqrt_ps(x); + y = _mm256_and_ps(y, mask); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? rpp_rsqrt_ps(x) * scale : 0; + } + } +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt index 1f47e8ee7..4338483a9 100644 --- a/src/modules/CMakeLists.txt +++ b/src/modules/CMakeLists.txt @@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP") set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add HIP specific preprocessor flags add_definitions(-DHIP_COMPILE) @@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL") list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP}) message("-- ${Green}OpenCL kernels added!${ColourReset}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add OpenCL specific preprocessor flags add_definitions(-DOCL_COMPILE) @@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU") # Add CPU specific includes set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/) set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs endif() message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}") add_compile_options("-Wno-unused-result") diff --git 
a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index 32b8b62b5..7ddc238fa 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -28,5 +28,6 @@ SOFTWARE. #include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/normalize.hpp" -#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp new file mode 100644 index 000000000..dbe746d1a --- /dev/null +++ b/src/modules/cpu/kernel/normalize.hpp @@ -0,0 +1,882 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +// Computes strides +void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + +// Recursive reduction helper function to compute difference of input with mean and squares them up +template +void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_diff_square_sum(tmp1, input, inputStride, currElements, mean); + + // reduce second half and accumulate + compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + { + Rpp32f curr = (input[i * inputStride] - mean); + auto curSq = curr * curr; + tmp += curSq; + } + + // accumulate in target value + output += tmp; + } +} + +// Recursive reduction helper function to sum up input values +template +void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_sum(tmp1, input, inputStride, currElements); + + // reduce second half and accumulate + compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + tmp += input[i * inputStride]; + + // accumulate in target value + output += tmp; + } +} + +// Computes mean for 2D inputs +void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = 1.0 / dims[1]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]); + srcPtrTemp += stride[1]; + meanPtr[i] *= normFactor; + } +} + +// Computes inverse stddev for 2D inputs +void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale) +{ + + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]); + srcPtrTemp += stride[1]; + } + rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale); +} + +// Computes mean for 3D inputs +void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = 1.0 / dims[2]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + meanPtr[index] = 0; + compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + meanPtr[index] *= normFactor; + } + srcPtrTemp += stride[2]; + } + } + else + { + Rpp32f normFactor = 1.0 / (dims[1] * dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; 
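+ // non-consecutive reduction axes (isConsecutive == false, i.e. the 3D axisMask 5 case):
+ // accumulate dims[1] strided partial sums of dims[2] elements into a single mean per outer index i,
+ // then apply normFactor = 1 / (dims[1] * dims[2]) once the inner loop finishes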
+ for(Rpp32u j = 0; j < dims[1]; j++) + { + compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + } + meanPtr[i] *= normFactor; + srcPtrTemp += stride[2]; + } + } +} + +// Computes inverse stddev for 3D inputs +void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + stdDevPtr[index] = 0; + compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]); + srcPtrRow += stride[1]; + } + srcPtrTemp += stride[2]; + } + rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale); + } + else + { + Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2])); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]); + srcPtrRow += stride[1]; + } + srcPtrTemp += stride[2]; + } + rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale); + } +} + +// Computes mean for ND inputs +template +void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis) +{ + if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeSum when last dimension is to be normalized + compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]); + else if(level == tensorDim) // Calls computeSum when only 1st axis need to be normalized + compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]); + else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes inverse stddev for ND inputs +template +void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis) +{ + if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeDiffSumSquare when last dimension is to be normalized + compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]); + else if(level == tensorDim) // Calls computeDiffSumSquare when only 1st axis need to be normalized + compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]); + else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store 
stddev + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes normalize for 3D non toggle variants +void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32s paramIdx = 0; + Rpp32s idx1 = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr; + Rpp32f *dstPtrRow = dstPtr; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + idx1 = paramIdx; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + if(k < length[2] - 1) + paramIdx += paramStride[2]; + srcPtrRowTemp++; + dstPtrRowTemp++; + } + if(j < length[1] - 1) + paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + if(i < length[0] - 1) + paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0]; + srcPtr += srcGenericDescPtr->strides[1]; + dstPtr += dstGenericDescPtr->strides[1]; + } +} + +// Computes normalize for 3D toggle variants when axis mask is set to 3 +void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp[length[2]]; + dstPtrTemp[0] = dstPtr; + for(Rpp32u i = 1; i < length[2]; i++) + dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1]; + Rpp32s paramIdx = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] = dstPtrTemp[l]; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRowTemp[l] = dstPtrRow[l]; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + paramIdx += paramStride[2]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] += dstGenericDescPtr->strides[3]; + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrTemp[l] += dstGenericDescPtr->strides[2]; + } +} + +// Computes normalize for 3D non toggle variants, optimized with AVX when axis mask set to 3 and 16 channel normalize +void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length) +{ + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (bufferLength / 16) * 16; + Rpp32u outerDim = length[0]; + + // set shift, mean and stddev + __m256 pShift = _mm256_set1_ps(shift); + __m256 pMean1 = _mm256_loadu_ps(meanPtr); + __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8); + __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr); + __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8); + + for(Rpp32u i = 0; i < outerDim; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1]; + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp); + __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8); + __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift); + __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst1); + _mm256_storeu_ps(dstPtrTemp + 8, pDst2); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + } +} + +// Computes normalize for ND non toggle variants for i8 dataype +void normalize_ND_tensor_nontoggle(Rpp32s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtr++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? 
idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for ND non toggle variants +template +void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + T1 *srcPtrTemp = srcPtr; + T2 *dstPtrTemp = dstPtr; + + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtrTemp++; + dstPtrTemp++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for 2D +void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr, + Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride) +{ + if (paramStride[1]) // Optimized with AVX when axis mask set to 2 + { + Rpp32u vectorIncrement = 8; + Rpp32u bufferLength = dims[1]; + Rpp32u alignedLength = (bufferLength / 8) * 8; + + __m256 pShift = _mm256_set1_ps(shift); + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1]; + + // set mean and stddev + Rpp32f mean = meanPtr[i]; + Rpp32f invStdDev = invStdDevPtr[i]; + __m256 pMean, pInvStdDev; + pMean = _mm256_set1_ps(mean); + pInvStdDev = _mm256_set1_ps(invStdDev); + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_loadu_ps(srcPtrTemp); + __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for(; vectorLoopCount < dims[1] ; vectorLoopCount ++) + *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift; + } + } + else + { + Rpp32s paramIdx = 0; + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp = dstPtr; + for(Rpp32u j = 0; j < dims[1]; j++) + { + *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift; + paramIdx += paramStride[0]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtr += srcDescPtr->strides[1]; + dstPtr += dstDescPtr->strides[1]; + } + } +} + +// Performs collapse axis operation wherein continuous axis that require normalization are combined together +void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis) +{ + int skipped = 0, prev = -2, k = 0; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(axis[i]) + { + int temp = i - skipped; + if(temp != prev + 1) + { + newAxis[k] = 1; + newDims[k] = length[i]; + prev = i; + k++; + } + else if(prev >= 0) + { + newDims[prev] *= length[i]; + skipped++; + } + } + else + { + newDims[k] = length[i]; + k++; + } + } + *tensorDim -= skipped; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(newAxis[i]) + *lastNormAxis = i; + } +} + +RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + // Compute maxSize as length of input tensors differ based on axisMask and tensorDims + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 
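+ // externally supplied stddev values are pre-converted to multipliers (scale / stddev),
+ // with a zero stddev mapped to a multiplier of 1.0f to avoid dividing by zero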
1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + Rpp32f *srcPtrChannel = srcPtrTemp; + + if(tensorDims == 2) // Called for audio testcase and for any other 2D case + { + Rpp32u paramStride[2]; + Rpp32u srcReductionDims[2], srcStride[2]; + if (axisMask == 3) + { + srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0] * length[1]; + paramStride[0] = paramStride[1] = 0; + } + else if (axisMask == 1) + { + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + paramStride[0] = 1; + paramStride[1] = 0; + } + else if (axisMask == 2) + { + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + paramStride[0] = 0; + paramStride[1] = 1; + } + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale); + + normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride); + } + else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel + { + Rpp32u paramStride[3]; + Rpp32u srcReductionDims[3], srcStride[3]; + Rpp32u reductionDims; + bool isConsecutive = true; + switch(axisMask) + { + case 1: // Normalize axes 0 + { + reductionDims = length[1] * length[2]; + paramStride[0] = 0; + paramStride[1] = paramStride[2] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0]; + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[2]; + break; + } + case 2: // Normalize axes 1 + { + reductionDims = length[0] * length[2]; + paramStride[1] = 0; + paramStride[0] = paramStride[2] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 3: // Normalize axes 0, 1 + { + reductionDims = length[2]; + paramStride[0] = paramStride[1] = 0; + paramStride[2] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0] * length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 4: // Normalize across 2 
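+ // axisMask is a bitmask over the tensor dimensions: bit i set means dimension i is reduced
+ // when computing mean/stddev. Mask 4 (binary 100) reduces only the innermost of the three
+ // dims, leaving one mean/stddev value per (dim0, dim1) pair.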
+ { + reductionDims = length[0] * length[1]; + paramStride[2] = 0; + paramStride[0] = paramStride[1] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 5: // Normalize across 0, 2 + { + reductionDims = length[1]; + paramStride[0] = paramStride[2] = 0; + paramStride[1] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[2]; + isConsecutive = false; + break; + } + case 6: // Normalize across 1, 2 + { + reductionDims = length[0]; + paramStride[1] = paramStride[2] = 0; + paramStride[0] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[1] * length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 7: // Normalize across 0, 1, 2 + { + reductionDims = 1; + paramStride[0] = paramStride[1] = paramStride[2] = 0; + srcReductionDims[0] = 1; + srcReductionDims[1] = 1; + srcReductionDims[2] = length[0] * length[1] * length[2]; + srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + default: + { + std::cout<<"Invalid Axis mask"<strides[i]; + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive); + + if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16)) + normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length); + else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + } + else // Handle any other ND tensor is passed to kernel + { + // Compute length of input tensors as they differ based on axisMask and tensorDims + int size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i]; + + Rpp32u totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, tensorDims * sizeof(Rpp32u)); + memset(newDims, 0, tensorDims * sizeof(Rpp32u)); + + for(Rpp32u i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(Rpp32u i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + } + + return RPP_SUCCESS; +} +template +RppStatus normalize_generic_host_tensor(T1 *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T2 *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + int size = 1; + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : length[i]; + + T1 *srcPtrTemp; + T2 *dstPtrTemp; + Rpp32f *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + T1 *srcPtrChannel = srcPtrTemp; + + int totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, sizeof(newAxis)); + memset(newDims, 0, sizeof(newDims)); + + for(int i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(int i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index c79e0a951..6923b9a3f 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -27,5 +27,6 @@ SOFTWARE. 
#include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/normalize.hpp" #endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp new file mode 100644 index 000000000..384454ca7 --- /dev/null +++ b/src/modules/hip/kernel/normalize.hpp @@ -0,0 +1,1921 @@ +#include +#include "rpp_hip_common.hpp" + +#define MAX_SHARED_MEMORY_SIZE 1024 + +// -------------------- Set 0 - normalization kernels device helpers -------------------- + +__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst) +{ + outVal = fmax(fminf(outVal, 255), 0); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst) +{ + outVal = fmax(fminf(outVal, 127), -128); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst) +{ + *dst = outVal; +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst) +{ + *dst = static_cast(outVal); +} + +// -------------------- Set 1 - normalization kernel host helpers -------------------- + +// setup function needed for 2D/3D optimized kernel variants when mean and stddev needs to be internally computed +void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, + Rpp32u axisMask, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the max param volume + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + for(uint j = 0; j < tensorDims; j++) + paramVolume *= (axisSet[j]) ? 1 : roi[j]; + maxParamVolume = std::max(maxParamVolume, paramVolume); + } +} + +// setup function needed for ND generic kernel variants +void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask, + Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the param shape and param volume based on the axis mask + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + Rpp32u *paramShape = ¶mShapeTensor[i * tensorDims]; + for(uint j = 0; j < tensorDims; j++) + { + paramShape[j] = (axisSet[j]) ? 
1 : roi[j]; + paramVolume *= paramShape[j]; + } + maxParamVolume = std::max(maxParamVolume, paramVolume); + + // calculate the param strides from the param shape + Rpp32u *paramStrides = ¶mStridesTensor[i * tensorDims]; + Rpp32u val = 1; + for(uint j = tensorDims - 1; j > 0; j--) + { + paramStrides[j] = val; + val *= paramShape[j]; + } + paramStrides[0] = val; + } +} + +// -------------------- Set 2 - normalization kernels -------------------- + +template +__global__ void normalize_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint2 maxParamVolumeAndAxisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint maxParamVolume = maxParamVolumeAndAxisMask.x; + uint axisMask = maxParamVolumeAndAxisMask.y; + uint paramIndex = id_z * maxParamVolume; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_x; + else if (axisMask == 2) + paramIndex += id_y; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
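+ // stddev supplied by the caller is inverted and scaled directly here (zero maps to 1.0f),
+ // whereas a device-computed stddev goes through the rsqrtf(stdDev * stdDev) path above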
(scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + T *dstPtr, + uint2 dstStridesDH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint axisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint paramIndex = 0; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_y * lengthX + id_x; + else if (axisMask == 2) + paramIndex += id_z * lengthX + id_x; + else if (axisMask == 4) + paramIndex += id_z * lengthY + id_y; + else if (axisMask == 3) + paramIndex += id_x; + else if (axisMask == 5) + paramIndex += id_y; + else if (axisMask == 6) + paramIndex += id_z; + + uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + T *dstPtr, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint2 maxParamVolumeAndBufferLength, + uint tensorDims, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint maxBufferLength = maxParamVolumeAndBufferLength.y; + + if (id_x >= maxBufferLength) + return; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint maxParamVolume = maxParamVolumeAndBufferLength.x; + uint srcIdx = id_z * maxBufferLength; + + uint paramIndex = id_z * maxParamVolume; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + uint dstIdx = id_z * maxBufferLength + id_x; + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +// -------------------- Set 3 - mean and stddev compute kernels device helpers -------------------- + +__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem) +{ + for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } +} + +// -------------------- Set 4 - mean compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_mean_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise mean + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_x; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNH.y; + } + meanTensor[dstIdx] = accum / static_cast(height); + } + } + // compute partial sums needed for row wise mean + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial sums need for computing mean over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using 
all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_mean_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute mean along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.y; + } + meanTensor[dstIdx] = accum / static_cast(lengthZ); + } + // compute mean along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.z; + } + meanTensor[dstIdx] = accum / static_cast(lengthY); + } + // compute mean along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i 
+= 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x; + meanTensor[dstIdx] = accum_f8.f1[0] / static_cast(lengthX); + } + // compute partial sums required for computing mean along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + partialSumRowPtr_smem[hipThreadIdx_x] = static_cast(srcPtr[srcIdx]); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial sums required for computing mean along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + 
hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute mean along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + accum += static_cast(srcPtr[srcIdx + tid_x]); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + meanTensor[dstIdx] = partialSum_smem[0] / static_cast(lengthX * lengthZ); + } + // compute partial sums required for computing mean along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_mean_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * 
tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + atomicAdd(&meanTensor[paramBase + paramIndex], static_cast(srcPtr[srcIdx])); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + atomicAdd(&sh_mem[paramIndex], static_cast(srcPtr[srcIdx])); + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 5 - stddev compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_stddev_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *stdDevTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise stdDev + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_x; + float mean = meanTensor[paramIndex]; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNH.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(height)); + } + } + // compute partial mean subtracted squared sums needed for row wise stdDev + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + 
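/* The two calls that follow are the second pass of a two-pass standard deviation:
   the mean produced by the earlier mean kernels is subtracted from each loaded value,
   the difference is squared, and the squared deviations are block-reduced so that a
   later reduction stage can take sqrt(sum / count). A minimal scalar sketch of the
   same math, using a hypothetical helper name purely for illustration (not an RPP API):

       inline float stddev_reference(const float *x, int n, float mean)
       {
           float sumSq = 0.0f;
           for (int k = 0; k < n; k++)
           {
               float d = x[k] - mean;   // pass 2: subtract the mean found in pass 1
               sumSq += d * d;          // accumulate squared deviations
           }
           return sqrtf(sumSq / static_cast<float>(n));   // population standard deviation (divide by N)
       }
*/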
rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial mean subtracted squared sums need for computing stdDev over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + float mean = meanTensor[id_z]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_stddev_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + 
hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute stddev along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthZ)); + } + // compute stddev along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.z; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthY)); + } + // compute stddev along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + + stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast(lengthX)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint paramIndex = id_z * maxParamVolume + xIndex; + float mean = meanTensor[paramIndex]; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + float val = static_cast(srcPtr[srcIdx]) - mean; + partialSumRowPtr_smem[hipThreadIdx_x] = (val * val); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; 
threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial mean subtracted squared sums required for computing stdDev along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute stddev along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + float val = (static_cast(srcPtr[srcIdx + tid_x]) - mean); + accum += (val * val); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast(lengthX * lengthZ)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ 
float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + uint paramIndex = id_z * maxParamVolume; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_stddev_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? 
((coord % paramShape[i]) * paramStrides[i]) : 0; + } + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val)); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + { + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&sh_mem[paramIndex], (val * val)); + } + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) -------------------- + +__global__ void reduce_final_result_hip(float *partialSumTensor, + uint numPartialSums, + float *meanTensor, + float *stdDevTensor, + bool isMean, + uint *roiTensor, + uint axisMask, + uint tensorDims) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint meanFactor; + if (tensorDims == 3) + { + uint lengthZ = roi[0]; + uint lengthY = roi[1]; + uint lengthX = roi[2]; + + if (axisMask == 3) + meanFactor = lengthZ * lengthY; + else if (axisMask == 6) + meanFactor = lengthY * lengthX; + else if (axisMask == 7) + meanFactor = lengthZ * lengthY * lengthX; + } + else if (tensorDims == 2) + { + uint lengthY = roi[0]; + uint lengthX = roi[1]; + + if (axisMask == 2) + meanFactor = lengthX; + else if (axisMask == 3) + meanFactor = lengthY * lengthX; + } + + __shared__ float partialSum_smem[16]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + float accum = 0.0f; + while(id_x < numPartialSums) + { + uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x; + accum += partialSumTensor[srcIdx]; + id_x += hipBlockDim_x; + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + if (isMean) + meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor; + else + stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor); + } +} + +__global__ void final_reduction_nd_hip_tensor(float *meanTensor, + float *stdDevTensor, + uint *paramShapeTensor, + uint *roiTensor, + uint tensorDims, + uint maxParamVolume, + bool isMean) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint divisionFactor = 1; + uint paramVolume = 1; + for(int i = 0; i < tensorDims; i++) + { + paramVolume *= paramShape[i]; + if (paramShape[i] == 1) + divisionFactor *= roi[i]; + } + + if 
(id_x >= paramVolume) + return; + + uint paramIndex = id_z * maxParamVolume + id_x; + if (isMean) + meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor; + else + stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor); +} + +// -------------------- Set 7 - mean and stddev compute kernels launch helpers -------------------- + +void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Y direction + case 1: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along X direction + case 2: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XY direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Z direction + case 1: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along Y direction + case 2: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along YZ direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * 
globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along X direction + case 4: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XZ direction + case 5: + { + localThreads_x = 32; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = srcGenericDescPtr->dims[2]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XY direction + case 6: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XYZ direction + case 7: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3; + globalThreads_x = static_cast (ceil((float)numValues / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +// -------------------- Set 8 - mean and stddev compute kernels executor -------------------- + +template +RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + bool isMean, + Rpp32u *roiTensor, + Rpp32u axisMask, + Rpp32u tensorDims, + Rpp32u maxParamVolume, + Rpp32u *paramShape, + Rpp32u *paramStrides, + rpp::Handle& handle) +{ + Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32u partialSumArrLength, partialSumBlocksPerSample; + + int globalThreads_x, globalThreads_y, globalThreads_z; + int localThreads_x, localThreads_y, localThreads_z; + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, 
localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + + if (axisMask == 2) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + else if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else if (tensorDims == 3) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + stdDevTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + + // perform final reduction on block wise sums for below cases + // reduce on YZ partial sums + if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XY partial sums + if (axisMask == 6) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XYZ block partial sums + else if (axisMask == 7) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else + { + // interpret the input as 1D 
tensor + globalThreads_x = srcGenericDescPtr->strides[0]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + Rpp32u batchSize = globalThreads_z; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + + Rpp32u shared_memory_size = 0; + Rpp32u block_size = 1024; + if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE) + { + if (maxParamVolume <= 32) + shared_memory_size = 32; + else if (maxParamVolume <= 64) + shared_memory_size = 64; + else if (maxParamVolume <= 128) + shared_memory_size = 128; + else if (maxParamVolume <= 256) + shared_memory_size = 256; + else if (maxParamVolume <= 512) + shared_memory_size = 512; + else + shared_memory_size = MAX_SHARED_MEMORY_SIZE; + block_size = shared_memory_size; + } + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + else + { + hipLaunchKernelGGL(compute_stddev_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + stdDevTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + hipLaunchKernelGGL(final_reduction_nd_hip_tensor, + dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + meanTensor, + stdDevTensor, + paramShape, + roiTensor, + tensorDims, + maxParamVolume, + isMean); + } + hipStreamSynchronize(handle.GetStream()); + return RPP_SUCCESS; +} + +// -------------------- Set 9 - normalization kernel executor -------------------- + +template +RppStatus hip_exec_normalize_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u batchSize = srcGenericDescPtr->dims[0]; + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + // create buffer for paramShape and paramStride needed for generic kernel + Rpp32u *paramShape, *paramStrides; + paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem; + paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims); + + // do initial preprocessing, compute maxParamVolue and fill the values for paramShape and paramStrides + Rpp32u maxParamVolume; + if (tensorDims == 2 || tensorDims == 3) + normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims, + axisMask, maxParamVolume); + else + normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask, + paramShape, paramStrides, maxParamVolume); + + bool computeMean = computeMeanStddev & 1; // if 0th bit in computeMeanStddev is set, computeMean is set to true. Otherwise it is set to false + bool computeStdDev = computeMeanStddev & 2; // if 1st bit in computeMeanStddev is set, computeStdDev is set to true. 
Otherwise it is set to false + if ((!computeMean) && (!computeStdDev)) + maxParamVolume = 0; + + // if computeMean is set, compute mean values by processing over input based on axisMask values + if (computeMean) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values + if (computeStdDev) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(normalize_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + make_uint2(maxParamVolume, axisMask), + computeStdDev); + } + else if (tensorDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(normalize_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &meanTensor[batchCount * maxParamVolume], + &stdDevTensor[batchCount * maxParamVolume], + make_float2(scale, shift), + &roiTensor[batchCount * 6], + axisMask, + computeStdDev); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = dstGenericDescPtr->strides[0]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + hipLaunchKernelGGL(normalize_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + dstPtr, + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + paramShape, + paramStrides, + make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]), + tensorDims, + computeStdDev); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp index e35e5c514..3285ee756 100644 --- a/src/modules/rppi_validate.hpp +++ 
b/src/modules/rppi_validate.hpp @@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels) } else if(layout == RpptLayout::NHWC || layout == RpptLayout::NDHWC) { - if (channels == 3) // PKD3 - { - layoutParams.channelParam = 1; - layoutParams.bufferMultiplier = 3; - } + //PKD + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = channels; } return layoutParams; } diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index 28313a88f..ef69a49bb 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -241,13 +241,109 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** normalize_ND ********************/ + +RppStatus rppt_normalize_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + Rpp32u tensorDim = srcGenericDescPtr->numDims - 1; + if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); + else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]); + + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + normalize_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + normalize_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + 
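/* Each data-type branch of this dispatcher, including this I8 path, applies the same
   element-wise transform: subtract the (provided or internally computed) mean, divide
   by the standard deviation, multiply by scale, and add shift. Bit 0 of
   computeMeanStddev requests internal mean computation and bit 1 requests internal
   stddev computation. A minimal per-element sketch, with hypothetical names and shown
   only for clarity:

       inline float normalize_element(float in, float mean, float stdDev,
                                      float scale, float shift)
       {
           return ((in - mean) / stdDev) * scale + shift;
       }

       // decoding the computeMeanStddev flag
       bool computeMean   = computeMeanStddev & 1;   // bit 0
       bool computeStdDev = computeMeanStddev & 2;   // bit 1
*/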
normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ +#ifdef GPU_SUPPORT + /******************** tensor_sum ********************/ -#ifdef HIP_COMPILE + RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, @@ -256,6 +352,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (tensorSumArrLength < srcDescPtr->n) // sum of single channel @@ -317,10 +414,12 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } /******************** tensor_min ********************/ - RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, @@ -329,6 +428,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (imageMinArrLength < srcDescPtr->n) // min of single channel @@ -378,6 +478,9 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } /******************** tensor_max ********************/ @@ -390,6 +493,7 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (imageMaxArrLength < srcDescPtr->n) // max of single channel @@ -439,5 +543,90 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } + +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if 
((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; #endif // backend +} + +#endif // GPU_SUPPORT diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 82ed65309..77052cabe 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -83,7 +83,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) @@ -105,7 +105,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_hip_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index 26017065e..a0bd42fa0 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_hip Tensor_hip.cpp) + add_executable(Tensor_misc_hip Tensor_misc_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17") target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) + target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) else() message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp new file mode 100644 index 000000000..96197f432 --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -0,0 +1,231 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "../rpp_test_suite_misc.h" + +int main(int argc, char **argv) +{ + // Handle inputs + const int MIN_ARG_COUNT = 9; + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_misc_hip