diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c439d35d..16c4251f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,30 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/). -### RPP 1.5.0 (unreleased) +### RPP 1.8.0 (unreleased) + +### Changes + +* Prerequisites - ROCm install requires only --usecase=rocm +* Use pre-allocated common scratchBufferHip everywhere in Tensor code for scratch HIP memory +* Use CHECK_RETURN_STATUS everywhere to adhere to C++17 for hip +* RPP Tensor Audio support on HOST for Spectrogram +* RPP Tensor Audio support on HOST/HIP for Slice, by modifying voxel slice kernels to now accept anchor and shape params for a more generic version +* RPP Tensor Audio support on HOST for Mel Filter Bank +* RPP Tensor Normalize ND support on HOST and HIP + +### Tested configurations + +* Linux distribution + * Ubuntu - `20.04` / `22.04` + * CentOS - `7` + * RHEL - `8`/`9` +* ROCm: rocm-core - `6.1.0.60100` +* Clang - Version `5.0.1` +* CMake - Version `3.22.3` +* IEEE 754-based half-precision floating-point library - Version `1.12.0` + +### RPP 1.5.0 ### Changes @@ -376,4 +399,4 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r ### Known issues -* `CPU` backend is not enabled \ No newline at end of file +* `CPU` backend is not enabled diff --git a/CMakeLists.txt b/CMakeLists.txt index f5659a8e6..a53e2a0a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,6 +308,7 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) target_include_directories(${PROJECT_NAME} @@ -429,15 +430,12 @@ set(CPACK_RPM_COMPONENT_INSTALL ON) set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${PROJECT_NAME}") set(CPACK_RPM_RUNTIME_PACKAGE_REQUIRES "rocm-core, ${RPP_RPM_PACKAGE_LIST}") set(CPACK_RPM_RUNTIME_PACKAGE_PROVIDES "${PROJECT_NAME}") -set(CPACK_RPM_RUNTIME_PACKAGE_OBSOLETES "${PROJECT_NAME}") set(CPACK_RPM_DEV_PACKAGE_NAME "${PROJECT_NAME}-devel") set(CPACK_RPM_DEV_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}, ${RPP_RPM_DEV_PACKAGE_LIST}") set(CPACK_RPM_DEV_PACKAGE_PROVIDES "${PROJECT_NAME}-devel") -set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "${PROJECT_NAME}-devel") set(CPACK_RPM_TEST_PACKAGE_NAME "${PROJECT_NAME}-test") set(CPACK_RPM_TEST_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}-devel") set(CPACK_RPM_TEST_PACKAGE_PROVIDES "${PROJECT_NAME}-test") -set(CPACK_RPM_TEST_PACKAGE_OBSOLETES "${PROJECT_NAME}-test") set(CPACK_RPM_PACKAGE_LICENSE "MIT" ) # RPM package specific variable for ASAN set(CPACK_RPM_ASAN_PACKAGE_NAME "${PROJECT_NAME}-asan" ) diff --git a/README.md b/README.md index c9349c394..1235faed1 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,14 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened ## Prerequisites * Linux - * **Ubuntu** - `20.04` / `22.04` - * **CentOS** - `7` - * **RedHat** - `8` / `9` - * **SLES** - `15-SP4` + * Ubuntu - `20.04` / `22.04` + * CentOS - `7` + * RedHat - `8` / `9` + * SLES - `15-SP4` * [ROCm supported hardware](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) -* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=graphics,rocm --no-32` +* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=rocm` * Clang Version `5.0.1` and above @@ -74,15 +74,15 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened sudo apt-get install half ``` -> [!NOTE] -> You must use the appropriate package manager for your operating system. - * Compiler with support for C++ Version `17` and above * OpenMP * Threads +> [!NOTE] +> You must use the appropriate package manager for your operating system. + ## Build and install instructions ### Package install diff --git a/docker/rpp-on-ubuntu20.dockerfile b/docker/rpp-on-ubuntu20.dockerfile index 568b427ab..cff7071c3 100644 --- a/docker/rpp-on-ubuntu20.dockerfile +++ b/docker/rpp-on-ubuntu20.dockerfile @@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de wget ${ROCM_INSTALLER_REPO} && \ sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \ sudo apt-get update -y && \ - sudo amdgpu-install -y --usecase=graphics,rocm + sudo amdgpu-install -y --usecase=rocm # install rpp dependencies - half.hpp RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \ unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half diff --git a/docker/rpp-on-ubuntu22.dockerfile b/docker/rpp-on-ubuntu22.dockerfile index b5571d039..dd94379ff 100644 --- a/docker/rpp-on-ubuntu22.dockerfile +++ b/docker/rpp-on-ubuntu22.dockerfile @@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de wget ${ROCM_INSTALLER_REPO} && \ sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \ sudo apt-get update -y && \ - sudo amdgpu-install -y --usecase=graphics,rocm + sudo amdgpu-install -y --usecase=rocm # install rpp dependencies - half.hpp RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \ unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half diff --git a/docs/install/install.rst b/docs/install/install.rst index 7693cc5af..cc0a4a0a1 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -22,7 +22,7 @@ Prerequisites * `ROCm supported hardware `_ -* Install ROCm with `amdgpu-install `_ with ``--usecase=graphics,rocm --no-32`` +* Install ROCm with `amdgpu-install `_ with ``--usecase=rocm`` * Clang Version `5.0.1` and above diff --git a/include/rppdefs.h b/include/rppdefs.h index c5861e0e6..28876d7f5 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -134,7 +134,7 @@ typedef enum /*! \brief Out of bound source ROI \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17, /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ - RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18, + RPP_ERROR_LAYOUT_MISMATCH = -18, /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ RPP_ERROR_INVALID_CHANNELS = -19 } RppStatus; @@ -369,10 +369,13 @@ typedef enum */ typedef enum { - NCHW, - NHWC, - NCDHW, - NDHWC + NCHW, // BatchSize-Channels-Height-Width + NHWC, // BatchSize-Height-Width-Channels + NCDHW, // BatchSize-Channels-Depth-Height-Width + NDHWC, // BatchSize-Depth-Height-Width-Channels + NHW, // BatchSize-Height-Width + NFT, // BatchSize-Frequency-Time -> Frequency Major used for Spectrogram / MelfilterBank + NTF // BatchSize-Time-Frequency -> Time Major used for Spectrogram / MelfilterBank } RpptLayout; /*! \brief RPPT Tensor 2D ROI type enum @@ -425,6 +428,15 @@ typedef enum REFLECT } RpptAudioBorderType; +/*! 
\brief RPPT Mel Scale Formula + * \ingroup group_rppdefs + */ +typedef enum +{ + SLANEY = 0, // Follows Slaney’s MATLAB Auditory Modelling Work behavior + HTK, // Follows O’Shaughnessy’s book formula, consistent with Hidden Markov Toolkit(HTK), m = 2595 * log10(1 + (f/700)) +} RpptMelScaleFormula; + /*! \brief RPPT Tensor 2D ROI LTRB struct * \ingroup group_rppdefs */ @@ -1024,6 +1036,7 @@ typedef struct Rpp64u* dstBatchIndex; Rpp32u* inc; Rpp32u* dstInc; + hipMemRpp32u scratchBuf; } memGPU; /*! \brief RPP HIP-HOST memory management diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index 1941df0da..31f3e95ef 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -60,7 +60,7 @@ extern "C" { * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32f *detectedIndexTensor, Rpp32f *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); /*! \brief To Decibels augmentation on HOST backend * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values @@ -110,6 +110,47 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, */ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +/*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend + * \details Spectrogram for 1D audio buffer + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [in] centerWindows indicates whether extracted windows should be padded so that the window function is centered at multiples of window_step + * \param [in] reflectPadding indicates the padding policy when sampling outside the bounds of the signal + * \param [in] windowFunction samples of the window function that will be multiplied to each extracted window when calculating the Short Time Fourier Transform (STFT) + * \param [in] nfft size of the FFT + * \param [in] power exponent of the magnitude of the spectrum + * \param [in] windowLength window size in number of samples + * \param [in] windowStep step between the STFT windows in number of samples + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. 
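For orientation, a quick sketch of the output geometry implied by the spectrogram parameters above: the destination is laid out either frequency-major (NFT) or time-major (NTF), and the frequency axis holds nfft/2 + 1 one-sided bins (the mel filter bank kernel later in this diff recovers nfft as (numBins - 1) * 2). The centred frame count below follows the usual STFT convention for centerWindows and is an assumption on my part, not something stated in this header.

```cpp
// Sketch only: output geometry for the spectrogram call above.
// numBins follows from the one-sided FFT; the centred frame count is an
// assumed convention (window centres at multiples of windowStep), not RPP code.
#include <cstdio>

int oneSidedBins(int nfft) { return nfft / 2 + 1; }

int frameCount(int srcLength, int windowLength, int windowStep, bool centerWindows)
{
    return centerWindows ? (srcLength / windowStep) + 1
                         : ((srcLength - windowLength) / windowStep) + 1;
}

int main()
{
    int nfft = 512, windowLength = 512, windowStep = 256, srcLength = 16000;
    int bins = oneSidedBins(nfft);                                      // 257
    int frames = frameCount(srcLength, windowLength, windowStep, true);
    std::printf("NFT dst: %d x %d (freq major), NTF dst: %d x %d (time major)\n",
                bins, frames, frames, bins);
}
```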
+ */ +RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, rppHandle_t rppHandle); + +/*! \brief Mel filter bank augmentation HOST backend + * \details Mel filter bank augmentation for audio data + * \param[in] srcPtr source tensor in HOST memory + * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) + * \param[in] maxFreq maximum frequency if not provided maxFreq = sampleRate / 2 + * \param[in] minFreq minimum frequency + * \param[in] melFormula formula used to convert frequencies from hertz to mel and from mel to hertz (SLANEY / HTK) + * \param[in] numFilter number of mel filters + * \param[in] sampleRate sampling rate of the audio + * \param[in] normalize boolean variable that determine whether to normalize weights / not + * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDims, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle); + /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data * \param[in] srcPtr source tensor in HOST memory diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index 695c3252d..8e846a41b 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -448,38 +448,42 @@ RppStatus rppt_phase_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc * \details This function performs slice augmentation on a generic 4D tensor. * Slice augmentation involves selecting a region of interest (ROI) from the source tensor * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes. 
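As a side note on the melFormula parameter introduced above, here is a minimal standalone sketch of the two scales it selects between: HTK (m = 2595 * log10(1 + f/700), equivalently 1127 * ln(1 + f/700)) and Slaney (linear below 1 kHz, logarithmic above). The function names are illustrative and not part of the RPP API; the constants match the HtkMelScale and SlaneyMelScale helpers added later in this diff.

```cpp
// Hz <-> mel conversions for the two RpptMelScaleFormula options; sketch only.
#include <cmath>
#include <cstdio>

// HTK / O'Shaughnessy: m = 2595 * log10(1 + f/700) == 1127 * ln(1 + f/700)
double hzToMelHtk(double hz)  { return 1127.0 * std::log(1.0 + hz / 700.0); }
double melToHzHtk(double mel) { return 700.0 * (std::exp(mel / 1127.0) - 1.0); }

// Slaney: linear below 1 kHz (200/3 Hz per mel), logarithmic above with a
// step of ln(6.4)/27 per mel.
double hzToMelSlaney(double hz)
{
    const double fsp = 200.0 / 3.0, minLogHz = 1000.0;
    const double minLogMel = minLogHz / fsp, stepLog = std::log(6.4) / 27.0;
    return (hz < minLogHz) ? hz / fsp : minLogMel + std::log(hz / minLogHz) / stepLog;
}

int main()
{
    std::printf("1 kHz -> %.2f mel (HTK), %.2f mel (Slaney)\n",
                hzToMelHtk(1000.0), hzToMelSlaney(1000.0));     // ~1000.0 and 15.0
    std::printf("round trip (HTK): %.2f Hz\n", melToHzHtk(hzToMelHtk(440.0)));
}
```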
- * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstGenericDescPtr destination tensor descriptor - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions) + * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions) + * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true + * \param [in] enablePadding boolean flag to specify if padding is enabled or not + * \param [in] roiTensor roi data in HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. - * \ingroup group_tensor_geometric */ -RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*! \brief Slice augmentation GPU * \details This function performs slice augmentation on a generic 4D tensor. * Slice augmentation involves selecting a region of interest (ROI) from the source tensor * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes. 
- * \param[in] srcPtr source tensor in HIP memory - * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor in HIP memory - * \param[in] dstGenericDescPtr destination tensor descriptor - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions) + * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions) + * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true + * \param [in] enablePadding boolean flag to specify if padding is enabled or not + * \param [in] roiTensor roi data in pinned/HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. - * \ingroup group_tensor_geometric */ -RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle); #endif // GPU_SUPPORT /*! \brief Crop and Patch augmentation on HOST backend for a NCHW/NHWC layout tensor @@ -539,15 +543,15 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
Support added for f32 -> f32 and u8 -> u8 dataypes. * \image html input150x150x4.gif Sample Input * \image html geometric_augmentations_flip_150x150x4.gif Sample Output - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1) * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in HOST memory, of size batchSize, with depthTensor[i] = 0/1) - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. @@ -562,15 +566,15 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
Support added for f32 -> f32 and u8 -> u8 dataypes. * \image html input150x150x4.gif Sample Input * \image html geometric_augmentations_flip_150x150x4.gif Sample Output - * \param[in] srcPtr source tensor in HIP memory - * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) - * \param[out] dstPtr destination tensor in HIP memory - * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1) * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with depthTensor[i] = 0/1) - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. 
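Recapping the reworked slice interface above: each sample now gets a per-dimension anchor (start index) and shape (length), and fillValue is written wherever enablePadding lets the requested window run past the input. A minimal 1D sketch of that behaviour follows; the no-padding fallback (stop at the input boundary) is my assumption rather than something spelled out in the header.

```cpp
// 1D sketch of anchor/shape slicing with optional padding; illustrative only,
// the RPP kernels operate per batch sample on ND tensors.
#include <cstdio>
#include <vector>

std::vector<float> slice1D(const std::vector<float> &in, int anchor, int shape,
                           bool enablePadding, float fillValue)
{
    std::vector<float> out(shape, fillValue);          // pre-fill with the pad value
    for (int i = 0; i < shape; i++)
    {
        int srcIdx = anchor + i;
        if (srcIdx >= 0 && srcIdx < static_cast<int>(in.size()))
            out[i] = in[srcIdx];
        else if (!enablePadding)
            break;                                     // assumed: no padding, stop at the boundary
    }
    return out;
}

int main()
{
    std::vector<float> samples = {1, 2, 3, 4, 5};
    for (float v : slice1D(samples, 3, 4, true, 0.0f)) // anchor 3, shape 4 -> 4 5 0 0
        std::printf("%g ", v);
    std::printf("\n");
}
```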
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 89418be78..7026b13b4 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -49,7 +49,7 @@ extern "C" { * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HOST memory - * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4) + * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -67,7 +67,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HIP memory - * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4) + * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() @@ -85,7 +85,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HOST memory - * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength = srcDescPtr->n * 4) + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc 
ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -103,7 +103,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HIP memory - * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength = srcDescPtr->n * 4) + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() @@ -121,7 +121,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HOST memory - * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4) + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -139,10 +139,10 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HIP memory - * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4) + * \param [in] 
maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. @@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HOST backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also has toggle variant(NHWC->NCHW) support for 3D. + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HIP backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. 
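Reading the parameter list of rppt_normalize_host above, the per-element operation is (in - mean) / stddev, then scaled and shifted, with bits 0 and 1 of computeMeanStddev asking the library to compute mean and stddev internally. A scalar sketch under that reading (not the RPP kernel, which reduces along axisMask over ND tensors):

```cpp
// Scalar sketch of the normalize semantics described above; illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

void normalizeSketch(std::vector<float> &x, float mean, float stddev,
                     float scale, float shift, unsigned computeMeanStddev)
{
    if (computeMeanStddev & 1u)                        // bit 0: compute mean internally
    {
        float sum = 0.f;
        for (float v : x) sum += v;
        mean = sum / x.size();
    }
    if (computeMeanStddev & 2u)                        // bit 1: compute stddev internally
    {
        float sqSum = 0.f;
        for (float v : x) sqSum += (v - mean) * (v - mean);
        stddev = std::sqrt(sqSum / x.size());
    }
    for (float &v : x)
        v = (v - mean) / stddev * scale + shift;
}

int main()
{
    std::vector<float> data = {1.f, 2.f, 3.f, 4.f};
    normalizeSketch(data, 0.f, 1.f, 1.f, 0.f, 3u);     // 3u: compute both internally
    for (float v : data) std::printf("%f ", v);
    std::printf("\n");
}
```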
+ * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Tensor mean operation on HOST backend for a NCHW/NHWC layout tensor * \details The tensor mean is a reduction operation that finds the channel-wise (R mean / G mean / B mean) and total mean for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
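The channel-wise plus total outputs described above are also why the reduction APIs earlier in this header require a destination of at least n entries for single-channel inputs and n * 4 entries for 3-channel inputs. A tiny sizing helper as a sketch; the per-image slot order is an assumption for illustration only.

```cpp
// Destination sizing for the reduction APIs above: one slot per greyscale image,
// four slots (per-channel results plus the combined value) per 3-channel image.
#include <cstdio>

unsigned reductionOutputLength(unsigned batchSize, unsigned channels)
{
    return (channels == 3) ? batchSize * 4 : batchSize;
}

int main()
{
    std::printf("%u floats for an 8-image RGB batch\n", reductionOutputLength(8, 3)); // 32
    std::printf("%u floats for greyscale\n", reductionOutputLength(8, 1));            // 8
}
```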
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). @@ -229,4 +272,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H +#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H \ No newline at end of file diff --git a/include/third_party/ffts/ffts.h b/include/third_party/ffts/ffts.h new file mode 100644 index 000000000..cc85a885b --- /dev/null +++ b/include/third_party/ffts/ffts.h @@ -0,0 +1,110 @@ +/* + + This file is part of FFTS. + + Copyright (c) 2012, Anthony M. Blake + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_H +#define FFTS_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if (defined(_WIN32) || defined(WIN32)) && defined(FFTS_SHARED) +# ifdef FFTS_BUILD +# define FFTS_API __declspec(dllexport) +# else +# define FFTS_API __declspec(dllimport) +# endif +#else +# if (__GNUC__ >= 4) || defined(HAVE_GCC_VISIBILITY) +# define FFTS_API __attribute__ ((visibility("default"))) +# else +# define FFTS_API +# endif +#endif + +/* The direction of the transform + (i.e, the sign of the exponent in the transform.) +*/ +#define FFTS_FORWARD (-1) +#define FFTS_BACKWARD (+1) + +struct _ffts_plan_t; +typedef struct _ffts_plan_t ffts_plan_t; + +/* Complex data is stored in the interleaved format + (i.e, the real and imaginary parts composing each + element of complex data are stored adjacently in memory) + + The multi-dimensional arrays passed are expected to be + stored as a single contiguous block in row-major order +*/ +FFTS_API ffts_plan_t* +ffts_init_1d(size_t N, int sign); + +FFTS_API ffts_plan_t* +ffts_init_2d(size_t N1, size_t N2, int sign); + +FFTS_API ffts_plan_t* +ffts_init_nd(int rank, size_t *Ns, int sign); + +/* For real transforms, sign == FFTS_FORWARD implies a real-to-complex + forwards tranform, and sign == FFTS_BACKWARD implies a complex-to-real + backwards transform. 
+ + The output of a real-to-complex transform is N/2+1 complex numbers, + where the redundant outputs have been omitted. +*/ +FFTS_API ffts_plan_t* +ffts_init_1d_real(size_t N, int sign); + +FFTS_API ffts_plan_t* +ffts_init_2d_real(size_t N1, size_t N2, int sign); + +FFTS_API ffts_plan_t* +ffts_init_nd_real(int rank, size_t *Ns, int sign); + +FFTS_API void +ffts_execute(ffts_plan_t *p, const void *input, void *output); + +FFTS_API void +ffts_free(ffts_plan_t *p); + +#ifdef __cplusplus +} +#endif + +#endif /* FFTS_H */ diff --git a/include/third_party/ffts/ffts_attributes.h b/include/third_party/ffts/ffts_attributes.h new file mode 100644 index 000000000..bdfd6162f --- /dev/null +++ b/include/third_party/ffts/ffts_attributes.h @@ -0,0 +1,111 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
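For anyone unfamiliar with the vendored FFTS headers above, a minimal usage sketch of a 1D real-to-complex forward transform, using only the calls declared in ffts.h and linking against the bundled libffts.a. The 32-byte buffer alignment is a precaution on my part (FFTS uses SIMD loads) and is not documented in this header.

```cpp
// Minimal FFTS usage sketch: 1D real-to-complex forward transform.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include "ffts.h"

int main()
{
    const size_t n = 16;                       // number of real input samples
    const double kPi = 3.14159265358979323846;

    float *in = nullptr, *out = nullptr;       // out holds n/2 + 1 interleaved re/im pairs
    if (posix_memalign(reinterpret_cast<void **>(&in),  32, n * sizeof(float)) ||
        posix_memalign(reinterpret_cast<void **>(&out), 32, (n / 2 + 1) * 2 * sizeof(float)))
        return 1;

    for (size_t i = 0; i < n; i++)             // one cycle of a cosine: energy lands in bin 1
        in[i] = static_cast<float>(std::cos(2.0 * kPi * i / n));

    ffts_plan_t *plan = ffts_init_1d_real(n, FFTS_FORWARD);
    if (plan)
    {
        ffts_execute(plan, in, out);
        for (size_t k = 0; k <= n / 2; k++)
            std::printf("bin %zu: %+.3f %+.3fi\n", k, out[2 * k], out[2 * k + 1]);
        ffts_free(plan);
    }
    free(in);
    free(out);
}
```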
+ +*/ + +#ifndef FFTS_ATTRIBUTES_H +#define FFTS_ATTRIBUTES_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +/* Macro definitions for various function/variable attributes */ +#ifdef __GNUC__ +#define GCC_VERSION_AT_LEAST(x,y) \ + (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y) +#else +#define GCC_VERSION_AT_LEAST(x,y) 0 +#endif + +#ifdef __GNUC__ +#define FFTS_ALIGN(x) __attribute__((aligned(x))) +#elif defined(_MSC_VER) +#define FFTS_ALIGN(x) __declspec(align(x)) +#else +#define FFTS_ALIGN(x) +#endif + +#if GCC_VERSION_AT_LEAST(3,1) +#define FFTS_ALWAYS_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define FFTS_ALWAYS_INLINE __forceinline +#else +#define FFTS_ALWAYS_INLINE inline +#endif + +#if defined(_MSC_VER) +#define FFTS_INLINE __inline +#else +#define FFTS_INLINE inline +#endif + +#if defined(__GNUC__) +#define FFTS_RESTRICT __restrict +#elif defined(_MSC_VER) +#define FFTS_RESTRICT __restrict +#else +#define FFTS_RESTRICT +#endif + +#if GCC_VERSION_AT_LEAST(4,5) +#define FFTS_ASSUME(cond) do { if (!(cond)) __builtin_unreachable(); } while (0) +#elif defined(_MSC_VER) +#define FFTS_ASSUME(cond) __assume(cond) +#else +#define FFTS_ASSUME(cond) +#endif + +#if GCC_VERSION_AT_LEAST(4,7) +#define FFTS_ASSUME_ALIGNED_16(x) __builtin_assume_aligned(x, 16) +#else +#define FFTS_ASSUME_ALIGNED_16(x) x +#endif + +#if GCC_VERSION_AT_LEAST(4,7) +#define FFTS_ASSUME_ALIGNED_32(x) __builtin_assume_aligned(x, 32) +#else +#define FFTS_ASSUME_ALIGNED_32(x) x +#endif + +#if defined(__GNUC__) +#define FFTS_LIKELY(cond) __builtin_expect(!!(cond), 1) +#else +#define FFTS_LIKELY(cond) cond +#endif + +#if defined(__GNUC__) +#define FFTS_UNLIKELY(cond) __builtin_expect(!!(cond), 0) +#else +#define FFTS_UNLIKELY(cond) cond +#endif + +#endif /* FFTS_ATTRIBUTES_H */ diff --git a/libs/third_party/ffts/libffts.a b/libs/third_party/ffts/libffts.a new file mode 100644 index 000000000..b30b11471 Binary files /dev/null and b/libs/third_party/ffts/libffts.a differ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index a6d58d5a2..5f8048a19 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -6123,41 +6123,41 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 inline void compute_variance_8_host(__m256d *p1, __m256d *pMean, __m256d *pVar) { __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]); - pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); pSub = _mm256_sub_pd(p1[1], pMean[0]); - pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); } -inline void compute_varianceChannel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +inline void compute_variance_channel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) { __m256d pSub = _mm256_sub_pd(p1[0], pMeanR[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[1], pMeanR[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[2], pMeanG[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = 
_mm256_sub_pd(p1[3], pMeanG[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(p1[4], pMeanB[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); pSub = _mm256_sub_pd(p1[5], pMeanB[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); } -inline void compute_varianceImage_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +inline void compute_variance_image_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) { __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[1], pMean[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[2], pMean[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(pMean[0], p1[3]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(p1[4], pMean[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); pSub = _mm256_sub_pd(p1[5], pMean[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); } inline void compute_vignette_48_host(__m256 *p, __m256 &pMultiplier, __m256 &pILocComponent, __m256 &pJLocComponent) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index d47193831..fd9373669 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -2611,6 +2611,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x) return _mm_cvtss_f32(sum); } +/* Computes inverse square root */ +inline Rpp32f rpp_rsqrt_ps(Rpp32f x) +{ + __m128 X = _mm_set_ss(x); + __m128 tmp = _mm_rsqrt_ss(X); + Rpp32f y = _mm_cvtss_f32(tmp); + return y * (1.5f - x * 0.5f * y * y); +} + +/* Compute inverse square root */ +/* SSE matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul) +{ + Rpp64s i = 0; + __m128 rdivx4 = _mm_set1_ps(rdiv); + __m128 mulx4 = _mm_set1_ps(mul * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m128 epsx4 = _mm_set1_ps(eps); + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + x = _mm_add_ps(x, epsx4); + __m128 y = _mm_rsqrt_ps(x); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + __m128 mask = _mm_cmpneq_ps(x, xmm_p0); + __m128 y = _mm_rsqrt_ps(x); + y = _mm_and_ps(y, mask); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } 
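+    /* The vector paths above refine the hardware rsqrt estimate with one
+       Newton-Raphson step, y1 = y0 * (3 - x * y0 * y0) / 2; the 1/2 factor is
+       pre-folded into mulx4 together with the requested output multiplier,
+       which is why the SSE/AVX results match the scalar reference to about
+       6 decimal places. The scalar tail below handles the remaining
+       (numElements % 4) values. */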
+ if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? rpp_rsqrt_ps(x) * mul : 0; + } + } +} + +/* Compute inverse square root */ +/* AVX2 matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale) +{ + Rpp32s i = 0; + __m256 rdivx8 = _mm256_set1_ps(rdiv); + __m256 mulx8 = _mm256_set1_ps(scale * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m256 epsx8 = _mm256_set1_ps(eps); + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + x = _mm256_add_ps(x, epsx8); + __m256 y = _mm256_rsqrt_ps(x); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ); + __m256 y = _mm256_rsqrt_ps(x); + y = _mm256_and_ps(y, mask); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? 
rpp_rsqrt_ps(x) * scale : 0; + } + } +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt index 1f47e8ee7..4338483a9 100644 --- a/src/modules/CMakeLists.txt +++ b/src/modules/CMakeLists.txt @@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP") set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add HIP specific preprocessor flags add_definitions(-DHIP_COMPILE) @@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL") list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP}) message("-- ${Green}OpenCL kernels added!${ColourReset}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add OpenCL specific preprocessor flags add_definitions(-DOCL_COMPILE) @@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU") # Add CPU specific includes set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/) set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs endif() message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}") add_compile_options("-Wno-unused-result") diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp index 0c6ccf211..82d43d082 100644 --- a/src/modules/cpu/host_tensor_audio_augmentations.hpp +++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp @@ -29,6 +29,8 @@ SOFTWARE. #include "kernel/to_decibels.hpp" #include "kernel/pre_emphasis_filter.hpp" #include "kernel/down_mixing.hpp" +#include "kernel/spectrogram.hpp" +#include "kernel/mel_filter_bank.hpp" #include "kernel/resample.hpp" #endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index 15a4245c5..0a7e8ff51 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/tensor_max.hpp" #include "kernel/tensor_mean.hpp" #include "kernel/tensor_stddev.hpp" +#include "kernel/normalize.hpp" -#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/mel_filter_bank.hpp b/src/modules/cpu/kernel/mel_filter_bank.hpp new file mode 100644 index 000000000..9cc6d26d2 --- /dev/null +++ b/src/modules/cpu/kernel/mel_filter_bank.hpp @@ -0,0 +1,252 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +struct BaseMelScale +{ + public: + virtual Rpp32f hz_to_mel(Rpp32f hz) = 0; + virtual Rpp32f mel_to_hz(Rpp32f mel) = 0; + virtual ~BaseMelScale() = default; +}; + +struct HtkMelScale : public BaseMelScale +{ + Rpp32f hz_to_mel(Rpp32f hz) { return 1127.0f * std::log(1.0f + (hz / 700.0f)); } + Rpp32f mel_to_hz(Rpp32f mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); } + public: + ~HtkMelScale() {}; +}; + +struct SlaneyMelScale : public BaseMelScale +{ + const Rpp32f freqLow = 0; + const Rpp32f fsp = 200.0 / 3.0; + const Rpp32f minLogHz = 1000.0; + const Rpp32f minLogMel = (minLogHz - freqLow) / fsp; + const Rpp32f stepLog = 0.068751777; // Equivalent to std::log(6.4) / 27.0; + + const Rpp32f invMinLogHz = 1.0f / 1000.0; + const Rpp32f invStepLog = 1.0f / stepLog; + const Rpp32f invFsp = 1.0f / fsp; + + Rpp32f hz_to_mel(Rpp32f hz) + { + Rpp32f mel = 0.0f; + if (hz >= minLogHz) + mel = minLogMel + std::log(hz * invMinLogHz) * invStepLog; + else + mel = (hz - freqLow) * invFsp; + + return mel; + } + + Rpp32f mel_to_hz(Rpp32f mel) + { + Rpp32f hz = 0.0f; + if (mel >= minLogMel) + hz = minLogHz * std::exp(stepLog * (mel - minLogMel)); + else + hz = freqLow + mel * fsp; + return hz; + } + public: + ~SlaneyMelScale() {}; +}; + +RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + Rpp32f maxFreqVal, // check unused + Rpp32f minFreqVal, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rpp::Handle& handle) +{ + BaseMelScale *melScalePtr; + switch(melFormula) + { + case RpptMelScaleFormula::HTK: + melScalePtr = new HtkMelScale; + break; + case RpptMelScaleFormula::SLANEY: + default: + melScalePtr = new SlaneyMelScale(); + break; + } + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u batchSize = srcDescPtr->n; + Rpp32f *scratchMem = handle.GetInitHandle()->mem.mcpu.scratchBufferHost; + + Rpp32f maxFreq = sampleRate / 2; + Rpp32f minFreq = minFreqVal; + + // Convert lower, higher frequencies to mel scale and find melStep + Rpp64f melLow = melScalePtr->hz_to_mel(minFreq); + Rpp64f melHigh = melScalePtr->hz_to_mel(maxFreq); + Rpp64f melStep = (melHigh - melLow) / (numFilter + 1); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32f *srcPtrTemp = 
srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + + // Extract nfft, number of Frames, numBins + Rpp32s nfft = (srcDimsTensor[batchCount * 2] - 1) * 2; + Rpp32s numBins = nfft / 2 + 1; + Rpp32s numFrames = srcDimsTensor[batchCount * 2 + 1]; + + // Find hzStep + Rpp64f hzStep = static_cast(sampleRate) / nfft; + Rpp64f invHzStep = 1.0 / hzStep; + + // Find fftBinStart and fftBinEnd + Rpp32s fftBinStart = std::ceil(minFreq * invHzStep); + Rpp32s fftBinEnd = std::ceil(maxFreq * invHzStep); + fftBinEnd = std::min(fftBinEnd, numBins); + + // Set/Fill normFactors, weightsDown and intervals + Rpp32f *normFactors = scratchMem + (batchCount * numFilter); + std::fill(normFactors, normFactors + numFilter, 1.f); // normFactors contain numFilter values of type float + Rpp32f *weightsDown = scratchMem + (batchSize * numFilter) + (batchCount * numBins); + memset(weightsDown, 0, sizeof(numBins * sizeof(Rpp32f))); // weightsDown contain numBins values of type float + Rpp32s *intervals = reinterpret_cast(weightsDown + (batchSize * numBins)); + std::fill(intervals, intervals + numBins, -1); // intervals contain numBins values of type integer + + Rpp32s fftBin = fftBinStart; + Rpp64f mel0 = melLow, mel1 = melLow + melStep; + Rpp64f fIter = fftBin * hzStep; + for (int interval = 0; interval < numFilter + 1; interval++, mel0 = mel1, mel1 += melStep) + { + Rpp64f f0 = melScalePtr->mel_to_hz(mel0); + Rpp64f f1 = melScalePtr->mel_to_hz(interval == numFilter ? melHigh : mel1); + Rpp64f slope = 1. / (f1 - f0); + + if (normalize && interval < numFilter) + { + Rpp64f f2 = melScalePtr->mel_to_hz(mel1 + melStep); + normFactors[interval] = 2.0 / (f2 - f0); + } + + for (; fftBin < fftBinEnd && fIter < f1; fftBin++, fIter = fftBin * hzStep) + { + weightsDown[fftBin] = (f1 - fIter) * slope; + intervals[fftBin] = interval; + } + } + + Rpp32u maxFrames = std::min(static_cast(numFrames + 8), dstDescPtr->strides.hStride); + Rpp32u maxAlignedLength = maxFrames & ~7; + Rpp32u vectorIncrement = 8; + + // Set ROI values in dst buffer to 0.0 + for(int i = 0; i < numFilter; i++) + { + Rpp32f *dstPtrRow = dstPtrTemp + i * dstDescPtr->strides.hStride; + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < maxAlignedLength; vectorLoopCount += 8) + { + _mm256_storeu_ps(dstPtrRow, avx_p0); + dstPtrRow += 8; + } + for(; vectorLoopCount < maxFrames; vectorLoopCount++) + *dstPtrRow++ = 0.0f; + } + + Rpp32u alignedLength = numFrames & ~7; + __m256 pSrc, pDst; + Rpp32f *srcRowPtr = srcPtrTemp + fftBinStart * srcDescPtr->strides.hStride; + for (int64_t fftBin = fftBinStart; fftBin < fftBinEnd; fftBin++) + { + auto filterUp = intervals[fftBin]; + auto weightUp = 1.0f - weightsDown[fftBin]; + auto filterDown = filterUp - 1; + auto weightDown = weightsDown[fftBin]; + + if (filterDown >= 0) + { + Rpp32f *dstRowPtrTemp = dstPtrTemp + filterDown * dstDescPtr->strides.hStride; + Rpp32f *srcRowPtrTemp = srcRowPtr; + + if (normalize) + weightDown *= normFactors[filterDown]; + __m256 pWeightDown = _mm256_set1_ps(weightDown); + + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + pSrc = _mm256_loadu_ps(srcRowPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeightDown); + pDst = _mm256_loadu_ps(dstRowPtrTemp); + pDst = _mm256_add_ps(pDst, pSrc); + _mm256_storeu_ps(dstRowPtrTemp, pDst); + dstRowPtrTemp += vectorIncrement; + srcRowPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < numFrames; vectorLoopCount++) + 
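+                    // scalar tail: frames left over after the 8-wide AVX loop above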
(*dstRowPtrTemp++) += weightDown * (*srcRowPtrTemp++); + } + + if (filterUp >= 0 && filterUp < numFilter) + { + Rpp32f *dstRowPtrTemp = dstPtrTemp + filterUp * dstDescPtr->strides.hStride; + Rpp32f *srcRowPtrTemp = srcRowPtr; + + if (normalize) + weightUp *= normFactors[filterUp]; + __m256 pWeightUp = _mm256_set1_ps(weightUp); + + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + pSrc = _mm256_loadu_ps(srcRowPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeightUp); + pDst = _mm256_loadu_ps(dstRowPtrTemp); + pDst = _mm256_add_ps(pDst, pSrc); + _mm256_storeu_ps(dstRowPtrTemp, pDst); + dstRowPtrTemp += vectorIncrement; + srcRowPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < numFrames; vectorLoopCount++) + (*dstRowPtrTemp++) += weightUp * (*srcRowPtrTemp++); + } + + srcRowPtr += srcDescPtr->strides.hStride; + } + } + delete melScalePtr; + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/non_silent_region_detection.hpp b/src/modules/cpu/kernel/non_silent_region_detection.hpp index 74dffb18e..39d9e6940 100644 --- a/src/modules/cpu/kernel/non_silent_region_detection.hpp +++ b/src/modules/cpu/kernel/non_silent_region_detection.hpp @@ -95,8 +95,8 @@ Rpp32f getSquare(Rpp32f &value) RppStatus non_silent_region_detection_host_tensor(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, - Rpp32f *detectedIndexTensor, - Rpp32f *detectionLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp new file mode 100644 index 000000000..dbe746d1a --- /dev/null +++ b/src/modules/cpu/kernel/normalize.hpp @@ -0,0 +1,882 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +// Computes strides +void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + +// Recursive reduction helper function to compute difference of input with mean and squares them up +template +void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_diff_square_sum(tmp1, input, inputStride, currElements, mean); + + // reduce second half and accumulate + compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + { + Rpp32f curr = (input[i * inputStride] - mean); + auto curSq = curr * curr; + tmp += curSq; + } + + // accumulate in target value + output += tmp; + } +} + +// Recursive reduction helper function to sum up input values +template +void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_sum(tmp1, input, inputStride, currElements); + + // reduce second half and accumulate + compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + tmp += input[i * inputStride]; + + // accumulate in target value + output += tmp; + } +} + +// Computes mean for 2D inputs +void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = 1.0 / dims[1]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]); + srcPtrTemp += stride[1]; + meanPtr[i] *= normFactor; + } +} + +// Computes inverse stddev for 2D inputs +void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale) +{ + + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]); + srcPtrTemp += stride[1]; + } + rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale); +} + +// Computes mean for 3D inputs +void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = 1.0 / dims[2]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + meanPtr[index] = 0; + compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + meanPtr[index] *= normFactor; + } + srcPtrTemp += stride[2]; + } + } + else + { + Rpp32f normFactor = 1.0 / (dims[1] * dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; 
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]);
+                srcPtrRow += stride[1];
+            }
+            meanPtr[i] *= normFactor;
+            srcPtrTemp += stride[2];
+        }
+    }
+}
+
+// Computes inverse stddev for 3D inputs
+void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true)
+{
+    Rpp32f *srcPtrTemp = srcPtr;
+    if(isConsecutive)
+    {
+        Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]);
+        for(Rpp32u i = 0; i < dims[0]; i++)
+        {
+            float *srcPtrRow = srcPtrTemp;
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                Rpp32u index = i * dims[1] + j;
+                stdDevPtr[index] = 0;
+                compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]);
+                srcPtrRow += stride[1];
+            }
+            srcPtrTemp += stride[2];
+        }
+        rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale);
+    }
+    else
+    {
+        Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2]));
+        for(Rpp32u i = 0; i < dims[0]; i++)
+        {
+            stdDevPtr[i] = 0;
+            Rpp32f *srcPtrRow = srcPtrTemp;
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]);
+                srcPtrRow += stride[1];
+            }
+            srcPtrTemp += stride[2];
+        }
+        rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale);
+    }
+}
+
+// Computes mean for ND inputs
+template <typename T>
+void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeSum when last dimension is to be normalized
+        compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]);
+    else if(level == tensorDim) // Calls computeSum when only 1st axis need to be normalized
+        compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]);
+    else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean
+    {
+        for(Rpp32u i = 0; i < dims[level]; i++)
+            compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+    }
+    else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized
+        compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+    else if(axis[level]) // Called when axis at present level needs to be normalized
+    {
+        for(Rpp32u i = 0; i < dims[level]; i++)
+            compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+    }
+}
+
+// Computes inverse stddev for ND inputs
+template <typename T>
+void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeDiffSumSquare when last dimension is to be normalized
+        compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]);
+    else if(level == tensorDim) // Calls computeDiffSumSquare when only 1st axis need to be normalized
+        compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]);
+    else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store
stddev + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes normalize for 3D non toggle variants +void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32s paramIdx = 0; + Rpp32s idx1 = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr; + Rpp32f *dstPtrRow = dstPtr; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + idx1 = paramIdx; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + if(k < length[2] - 1) + paramIdx += paramStride[2]; + srcPtrRowTemp++; + dstPtrRowTemp++; + } + if(j < length[1] - 1) + paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + if(i < length[0] - 1) + paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0]; + srcPtr += srcGenericDescPtr->strides[1]; + dstPtr += dstGenericDescPtr->strides[1]; + } +} + +// Computes normalize for 3D toggle variants when axis mask is set to 3 +void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp[length[2]]; + dstPtrTemp[0] = dstPtr; + for(Rpp32u i = 1; i < length[2]; i++) + dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1]; + Rpp32s paramIdx = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] = dstPtrTemp[l]; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRowTemp[l] = dstPtrRow[l]; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + paramIdx += paramStride[2]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] += dstGenericDescPtr->strides[3]; + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrTemp[l] += dstGenericDescPtr->strides[2]; + } +} + +// Computes normalize for 3D non toggle variants, optimized with AVX when axis mask set to 3 and 16 channel normalize +void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length) +{ + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (bufferLength / 16) * 16; + Rpp32u outerDim = length[0]; + + // set shift, mean and stddev + __m256 pShift = _mm256_set1_ps(shift); + __m256 pMean1 = _mm256_loadu_ps(meanPtr); + __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8); + __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr); + __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8); + + for(Rpp32u i = 0; i < outerDim; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1]; + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp); + __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8); + __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift); + __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst1); + _mm256_storeu_ps(dstPtrTemp + 8, pDst2); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + } +} + +// Computes normalize for ND non toggle variants for i8 dataype +void normalize_ND_tensor_nontoggle(Rpp32s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtr++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? 
idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for ND non toggle variants +template +void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + T1 *srcPtrTemp = srcPtr; + T2 *dstPtrTemp = dstPtr; + + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtrTemp++; + dstPtrTemp++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for 2D +void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr, + Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride) +{ + if (paramStride[1]) // Optimized with AVX when axis mask set to 2 + { + Rpp32u vectorIncrement = 8; + Rpp32u bufferLength = dims[1]; + Rpp32u alignedLength = (bufferLength / 8) * 8; + + __m256 pShift = _mm256_set1_ps(shift); + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1]; + + // set mean and stddev + Rpp32f mean = meanPtr[i]; + Rpp32f invStdDev = invStdDevPtr[i]; + __m256 pMean, pInvStdDev; + pMean = _mm256_set1_ps(mean); + pInvStdDev = _mm256_set1_ps(invStdDev); + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_loadu_ps(srcPtrTemp); + __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for(; vectorLoopCount < dims[1] ; vectorLoopCount ++) + *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift; + } + } + else + { + Rpp32s paramIdx = 0; + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp = dstPtr; + for(Rpp32u j = 0; j < dims[1]; j++) + { + *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift; + paramIdx += paramStride[0]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtr += srcDescPtr->strides[1]; + dstPtr += dstDescPtr->strides[1]; + } + } +} + +// Performs collapse axis operation wherein continuous axis that require normalization are combined together +void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis) +{ + int skipped = 0, prev = -2, k = 0; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(axis[i]) + { + int temp = i - skipped; + if(temp != prev + 1) + { + newAxis[k] = 1; + newDims[k] = length[i]; + prev = i; + k++; + } + else if(prev >= 0) + { + newDims[prev] *= length[i]; + skipped++; + } + } + else + { + newDims[k] = length[i]; + k++; + } + } + *tensorDim -= skipped; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(newAxis[i]) + *lastNormAxis = i; + } +} + +RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + // Compute maxSize as length of input tensors differ based on axisMask and tensorDims + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 
1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + Rpp32f *srcPtrChannel = srcPtrTemp; + + if(tensorDims == 2) // Called for audio testcase and for any other 2D case + { + Rpp32u paramStride[2]; + Rpp32u srcReductionDims[2], srcStride[2]; + if (axisMask == 3) + { + srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0] * length[1]; + paramStride[0] = paramStride[1] = 0; + } + else if (axisMask == 1) + { + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + paramStride[0] = 1; + paramStride[1] = 0; + } + else if (axisMask == 2) + { + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + paramStride[0] = 0; + paramStride[1] = 1; + } + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale); + + normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride); + } + else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel + { + Rpp32u paramStride[3]; + Rpp32u srcReductionDims[3], srcStride[3]; + Rpp32u reductionDims; + bool isConsecutive = true; + switch(axisMask) + { + case 1: // Normalize axes 0 + { + reductionDims = length[1] * length[2]; + paramStride[0] = 0; + paramStride[1] = paramStride[2] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0]; + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[2]; + break; + } + case 2: // Normalize axes 1 + { + reductionDims = length[0] * length[2]; + paramStride[1] = 0; + paramStride[0] = paramStride[2] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 3: // Normalize axes 0, 1 + { + reductionDims = length[2]; + paramStride[0] = paramStride[1] = 0; + paramStride[2] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0] * length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 4: // Normalize across 2 
+ { + reductionDims = length[0] * length[1]; + paramStride[2] = 0; + paramStride[0] = paramStride[1] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 5: // Normalize across 0, 2 + { + reductionDims = length[1]; + paramStride[0] = paramStride[2] = 0; + paramStride[1] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[2]; + isConsecutive = false; + break; + } + case 6: // Normalize across 1, 2 + { + reductionDims = length[0]; + paramStride[1] = paramStride[2] = 0; + paramStride[0] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[1] * length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 7: // Normalize across 0, 1, 2 + { + reductionDims = 1; + paramStride[0] = paramStride[1] = paramStride[2] = 0; + srcReductionDims[0] = 1; + srcReductionDims[1] = 1; + srcReductionDims[2] = length[0] * length[1] * length[2]; + srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + default: + { + std::cout<<"Invalid Axis mask"<strides[i]; + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive); + + if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16)) + normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length); + else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + } + else // Handle any other ND tensor is passed to kernel + { + // Compute length of input tensors as they differ based on axisMask and tensorDims + int size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i]; + + Rpp32u totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, tensorDims * sizeof(Rpp32u)); + memset(newDims, 0, tensorDims * sizeof(Rpp32u)); + + for(Rpp32u i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(Rpp32u i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + } + + return RPP_SUCCESS; +} +template +RppStatus normalize_generic_host_tensor(T1 *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T2 *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + int size = 1; + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : length[i]; + + T1 *srcPtrTemp; + T2 *dstPtrTemp; + Rpp32f *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + T1 *srcPtrChannel = srcPtrTemp; + + int totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, sizeof(newAxis)); + memset(newDims, 0, sizeof(newDims)); + + for(int i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(int i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/pre_emphasis_filter.hpp b/src/modules/cpu/kernel/pre_emphasis_filter.hpp index 1d25921ad..889cd2dec 100644 --- a/src/modules/cpu/kernel/pre_emphasis_filter.hpp +++ b/src/modules/cpu/kernel/pre_emphasis_filter.hpp @@ -50,7 +50,7 @@ RppStatus pre_emphasis_filter_host_tensor(Rpp32f *srcPtr, dstPtrTemp[0] = srcPtrTemp[0] - coeff * border; Rpp32s vectorIncrement = 8; - Rpp32s alignedLength = (bufferLength / 8) * 8; + Rpp32s alignedLength = (bufferLength / 8) * 8 - 8; __m256 pCoeff = _mm256_set1_ps(coeff); Rpp32s vectorLoopCount = 1; diff --git a/src/modules/cpu/kernel/slice.hpp b/src/modules/cpu/kernel/slice.hpp index c451b67b4..37c3097c9 100644 --- a/src/modules/cpu/kernel/slice.hpp +++ b/src/modules/cpu/kernel/slice.hpp @@ -26,184 +26,213 @@ SOFTWARE. 
#include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -RppStatus slice_f32_f32_host_tensor(Rpp32f *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp32f *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) +template +RppStatus slice_host_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T* fillValue, + bool enablePadding, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) { - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + T *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; - Rpp32f *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + T *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrTemp; - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; - Rpp32f *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // get the starting address of length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + if (numDims == 4) { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - for(int c = 0; c < layoutParams.channelParam; c++) + // order of dims + Rpp32s dimsOrder[3]; + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + } + else { - Rpp32f *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; + dimsOrder[0] = 0; // depth + dimsOrder[1] = 1; // height + dimsOrder[2] = 2; // width + } + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + 
Rpp32u copyLengthInBytes = bufferLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + // slice without fused output-layout toggle (NCDHW -> NCDHW) + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * srcGenericDescPtr->strides[3]) + (anchor[3] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) + { + T *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) + { + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NDHWC -> NDHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + T *srcPtrDepth = srcPtrChannel; + T *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) { - Rpp32f *srcPtrRow, *dstPtrRow; + T *srcPtrRow, *dstPtrRow; srcPtrRow = srcPtrDepth; dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + for(int j = 0; j < maxHeight; j++) { memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; } - - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + else if (numDims == 3) { - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp32f *srcPtrDepth = srcPtrChannel; - Rpp32f *dstPtrDepth = dstPtrChannel; - - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // order of dims + Rpp32s dimsOrder[2]; + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp32f *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) - { - memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += 
srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; - } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + } + else + { + dimsOrder[0] = 0; // height + dimsOrder[1] = 1; // width } - } - } - - return RPP_SUCCESS; -} - -RppStatus slice_u8_u8_host_tensor(Rpp8u *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp8u *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) -{ - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; - Rpp32u numThreads = handle.GetNumThreads(); - - omp_set_dynamic(0); -#pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) - { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); - Rpp8u *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + Rpp32u copyLengthInBytes = bufferLength * sizeof(T); - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; - Rpp8u *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - for(int c = 0; c < layoutParams.channelParam; c++) + // slice without fused output-layout toggle (NCHW -> NCHW) + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp8u *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * 
sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; } - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } - } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp8u *srcPtrDepth = srcPtrChannel; - Rpp8u *dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NHWC -> NHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * layoutParams.bufferMultiplier); + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; } } + else if (numDims == 2) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + anchor[1]; + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + Rpp32u copyLengthInBytes = maxWidth * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; + } + } + else if (numDims == 1) + { + srcPtrChannel = srcPtrTemp + anchor[0]; + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + Rpp32u copyLengthInBytes = maxLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + if (needPadding && enablePadding) + std::fill(dstPtrTemp, dstPtrTemp + dstGenericDescPtr->strides[0] - 1, *fillValue); + memcpy(dstPtrChannel, srcPtrChannel, copyLengthInBytes); + } } return RPP_SUCCESS; diff --git a/src/modules/cpu/kernel/spectrogram.hpp b/src/modules/cpu/kernel/spectrogram.hpp new file mode 100644 index 000000000..2489d2180 --- /dev/null +++ b/src/modules/cpu/kernel/spectrogram.hpp @@ -0,0 +1,245 @@ +/* +MIT 
License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "third_party/ffts/ffts.h" +#include "third_party/ffts/ffts_attributes.h" +#include + +bool is_pow2(Rpp64s n) { return (n & (n-1)) == 0; } +inline bool can_use_real_impl(Rpp64s n) { return is_pow2(n); } +inline Rpp64s size_in_buf(Rpp64s n) { return can_use_real_impl(n) ? n : 2 * n; } +inline Rpp64s size_out_buf(Rpp64s n) { return can_use_real_impl(n) ? n + 2 : 2 * n; } + +// Compute hanning window +inline void hann_window(Rpp32f *output, Rpp32s windowSize) +{ + Rpp64f a = (2.0 * M_PI) / windowSize; + for (Rpp32s t = 0; t < windowSize; t++) + { + Rpp64f phase = a * (t + 0.5); + output[t] = (0.5 * (1.0 - std::cos(phase))); + } +} + +// Compute number of spectrogram windows +inline Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows) +{ + if (!centerWindows) + length -= windowLength; + return ((length / windowStep) + 1); +} + +// Compute reflect start idx to pad +inline Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc) +{ + if (maxLoc - minLoc < 2) + return maxLoc - 1; + for (;;) + { + if (loc < minLoc) + loc = 2 * minLoc - loc; + else if (loc >= maxLoc) + loc = 2 * maxLoc - 2 - loc; + else + break; + } + return loc; +} + +RppStatus spectrogram_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcLengthTensor, + bool centerWindows, + bool reflectPadding, + Rpp32f *windowFunction, + Rpp32s nfft, + Rpp32s power, + Rpp32s windowLength, + Rpp32s windowStep, + rpp::Handle& handle) +{ + Rpp32s windowCenterOffset = 0; + bool vertical = (dstDescPtr->layout == RpptLayout::NFT); + if (centerWindows) windowCenterOffset = windowLength / 2; + if (nfft == 0) nfft = windowLength; + const Rpp32s numBins = nfft / 2 + 1; + const Rpp32f mulFactor = (2.0 * M_PI) / nfft; + const Rpp32u hStride = dstDescPtr->strides.hStride; + const Rpp32s alignedNfftLength = nfft & ~7; + const Rpp32s alignedNbinsLength = numBins & ~7; + const Rpp32s alignedWindowLength = windowLength & ~7; + bool useRealImpl = can_use_real_impl(nfft); + const auto fftInSize = size_in_buf(nfft); + const auto fftOutSize = size_out_buf(nfft); + + Rpp32f *windowFn = static_cast(calloc(windowLength, sizeof(Rpp32f))); + + // Generate hanning window + if (windowFunction == NULL) + hann_window(windowFn, windowLength); + else + memcpy(windowFn, windowFunction, windowLength * sizeof(Rpp32f)); + Rpp32u numThreads = 
handle.GetNumThreads(); + + // Get windows output + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (Rpp32s batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + Rpp32s bufferLength = srcLengthTensor[batchCount]; + Rpp32s numWindows = get_num_windows(bufferLength, windowLength, windowStep, centerWindows); + Rpp32f windowOutput[numWindows * nfft]; + std::fill_n(windowOutput, numWindows * nfft, 0); + for (Rpp32s w = 0; w < numWindows; w++) + { + Rpp32s windowStart = w * windowStep - windowCenterOffset; + Rpp32f *windowOutputTemp = windowOutput + (w * nfft); + // Pad when either windowStart less than zero or length greater than input srclength + if (windowStart < 0 || (windowStart + windowLength) > bufferLength) + { + for (Rpp32s t = 0; t < windowLength; t++) + { + Rpp32s inIdx = windowStart + t; + if (reflectPadding) + { + inIdx = get_idx_reflect(inIdx, 0, bufferLength); + *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx]; + } + else + { + if (inIdx >= 0 && inIdx < bufferLength) + *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx]; + else + *windowOutputTemp++ = 0; + } + } + } + else + { + Rpp32f *srcPtrWindowTemp = srcPtrTemp + windowStart; + Rpp32f *windowFnTemp = windowFn; + Rpp32s t = 0; + for (; t < alignedWindowLength; t += 8) + { + __m256 pSrc, pWindowFn; + pSrc = _mm256_loadu_ps(srcPtrWindowTemp); + pWindowFn = _mm256_loadu_ps(windowFnTemp); + pSrc = _mm256_mul_ps(pSrc, pWindowFn); + _mm256_storeu_ps(windowOutputTemp, pSrc); + srcPtrWindowTemp += 8; + windowFnTemp += 8; + windowOutputTemp += 8; + } + for (; t < windowLength; t++) + *windowOutputTemp++ = (*windowFnTemp++) * (*srcPtrWindowTemp++); + } + } + + // Generate FFT output + ffts_plan_t *p; + if(useRealImpl) + p = ffts_init_1d_real(nfft, FFTS_FORWARD); + else + p = ffts_init_1d(nfft, FFTS_FORWARD); + + if (!p) + { + printf("FFT Plan is unsupported. Exiting the code\n"); + exit(0); + } + + // Set temporary buffers to 0 + Rpp32f FFTS_ALIGN(32) *fftInBuf = static_cast(_mm_malloc(fftInSize * sizeof(Rpp32f), 32)); // ffts requires 32-byte aligned memory + Rpp32f FFTS_ALIGN(32) *fftOutBuf = static_cast(_mm_malloc(fftOutSize * sizeof(Rpp32f), 32)); // ffts requires 32-byte aligned memory + + for (Rpp32s w = 0; w < numWindows; w++) + { + Rpp32f *dstPtrBinTemp = dstPtrTemp + (w * hStride); + Rpp32f *windowOutputTemp = windowOutput + (w * nfft); + for(int k = 0; k < fftInSize; k++) + fftInBuf[k] = 0.0f; + + for(int k = 0; k < fftOutSize; k++) + fftOutBuf[k] = 0.0f; + + Rpp32s inWindowStart = windowLength < nfft ? 
(nfft - windowLength) / 2 : 0; + // Copy the window input to fftInBuf + if (useRealImpl) + { + for (int i = 0; i < windowLength; i++) + fftInBuf[inWindowStart + i] = windowOutputTemp[i]; + } + else + { + for (int i = 0; i < windowLength; i++) + { + Rpp32s off = 2 * (inWindowStart + i); + fftInBuf[off] = windowOutputTemp[i]; + fftInBuf[off + 1] = 0.0f; + } + } + + ffts_execute(p, fftInBuf, fftOutBuf); + auto *complexFft = reinterpret_cast *>(fftOutBuf); + Rpp32s outIdx = w; + if (vertical) + { + if (power == 1) + { + for (int i = 0; i < numBins; i++, outIdx += hStride) + dstPtrTemp[outIdx] = std::abs(complexFft[i]); + } + else + { + for (int i = 0; i < numBins; i++, outIdx += hStride) + dstPtrTemp[outIdx] = std::norm(complexFft[i]); + } + } + else + { + if (power == 1) + { + for (int i = 0; i < numBins; i++) + *dstPtrBinTemp++ = std::abs(complexFft[i]); + } + else + { + for (int i = 0; i < numBins; i++) + *dstPtrBinTemp++ = std::norm(complexFft[i]); + } + } + } + ffts_free(p); + _mm_free(fftInBuf); + _mm_free(fftOutBuf); + } + if(windowFn) + free(windowFn); + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/tensor_stddev.hpp b/src/modules/cpu/kernel/tensor_stddev.hpp index 3013d9e54..2f64e93ab 100644 --- a/src/modules/cpu/kernel/tensor_stddev.hpp +++ b/src/modules/cpu/kernel/tensor_stddev.hpp @@ -147,8 +147,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_u8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -239,8 +239,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_u8pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif @@ -408,8 +408,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -500,8 +500,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - 
compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif @@ -682,8 +682,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr, __m256d p[6]; rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -778,8 +778,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr, __m256d p[6]; rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } @@ -949,8 +949,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_i8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -1041,8 +1041,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_i8pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index c0a27806e..a0f50ee7e 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -29,5 +29,6 @@ SOFTWARE. 
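Every normalize kernel variant introduced in normalize.hpp below applies the same per-element transform: subtract the (supplied or internally computed) mean, multiply by scale divided by stddev, and add shift. As a reading aid, here is a minimal host-side sketch of that transform; normalize_element is a hypothetical name, not a helper from this patch:

// Mirrors the kernels' guard invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f,
// which falls back to a unit multiplier when the standard deviation is zero.
static inline float normalize_element(float in, float mean, float stdDev, float scale, float shift)
{
    float invStdDev = (stdDev != 0.0f) ? (scale / stdDev) : 1.0f;
    return (in - mean) * invStdDev + shift;   // same math as fmaf(in - mean, invStdDev, shift)
}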
#include "kernel/tensor_max.hpp" #include "kernel/tensor_mean.hpp" #include "kernel/tensor_stddev.hpp" +#include "kernel/normalize.hpp" #endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp new file mode 100644 index 000000000..384454ca7 --- /dev/null +++ b/src/modules/hip/kernel/normalize.hpp @@ -0,0 +1,1921 @@ +#include +#include "rpp_hip_common.hpp" + +#define MAX_SHARED_MEMORY_SIZE 1024 + +// -------------------- Set 0 - normalization kernels device helpers -------------------- + +__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst) +{ + outVal = fmax(fminf(outVal, 255), 0); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst) +{ + outVal = fmax(fminf(outVal, 127), -128); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst) +{ + *dst = outVal; +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst) +{ + *dst = static_cast(outVal); +} + +// -------------------- Set 1 - normalization kernel host helpers -------------------- + +// setup function needed for 2D/3D optimized kernel variants when mean and stddev needs to be internally computed +void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, + Rpp32u axisMask, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the max param volume + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + for(uint j = 0; j < tensorDims; j++) + paramVolume *= (axisSet[j]) ? 1 : roi[j]; + maxParamVolume = std::max(maxParamVolume, paramVolume); + } +} + +// setup function needed for ND generic kernel variants +void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask, + Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the param shape and param volume based on the axis mask + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + Rpp32u *paramShape = ¶mShapeTensor[i * tensorDims]; + for(uint j = 0; j < tensorDims; j++) + { + paramShape[j] = (axisSet[j]) ? 
1 : roi[j]; + paramVolume *= paramShape[j]; + } + maxParamVolume = std::max(maxParamVolume, paramVolume); + + // calculate the param strides from the param shape + Rpp32u *paramStrides = ¶mStridesTensor[i * tensorDims]; + Rpp32u val = 1; + for(uint j = tensorDims - 1; j > 0; j--) + { + paramStrides[j] = val; + val *= paramShape[j]; + } + paramStrides[0] = val; + } +} + +// -------------------- Set 2 - normalization kernels -------------------- + +template +__global__ void normalize_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint2 maxParamVolumeAndAxisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint maxParamVolume = maxParamVolumeAndAxisMask.x; + uint axisMask = maxParamVolumeAndAxisMask.y; + uint paramIndex = id_z * maxParamVolume; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_x; + else if (axisMask == 2) + paramIndex += id_y; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + T *dstPtr, + uint2 dstStridesDH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint axisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint paramIndex = 0; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_y * lengthX + id_x; + else if (axisMask == 2) + paramIndex += id_z * lengthX + id_x; + else if (axisMask == 4) + paramIndex += id_z * lengthY + id_y; + else if (axisMask == 3) + paramIndex += id_x; + else if (axisMask == 5) + paramIndex += id_y; + else if (axisMask == 6) + paramIndex += id_z; + + uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + T *dstPtr, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint2 maxParamVolumeAndBufferLength, + uint tensorDims, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint maxBufferLength = maxParamVolumeAndBufferLength.y; + + if (id_x >= maxBufferLength) + return; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint maxParamVolume = maxParamVolumeAndBufferLength.x; + uint srcIdx = id_z * maxBufferLength; + + uint paramIndex = id_z * maxParamVolume; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + uint dstIdx = id_z * maxBufferLength + id_x; + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +// -------------------- Set 3 - mean and stddev compute kernels device helpers -------------------- + +__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem) +{ + for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } +} + +// -------------------- Set 4 - mean compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_mean_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise mean + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_x; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNH.y; + } + meanTensor[dstIdx] = accum / static_cast(height); + } + } + // compute partial sums needed for row wise mean + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial sums need for computing mean over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using 
all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_mean_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute mean along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.y; + } + meanTensor[dstIdx] = accum / static_cast(lengthZ); + } + // compute mean along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.z; + } + meanTensor[dstIdx] = accum / static_cast(lengthY); + } + // compute mean along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i 
+= 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x; + meanTensor[dstIdx] = accum_f8.f1[0] / static_cast(lengthX); + } + // compute partial sums required for computing mean along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + partialSumRowPtr_smem[hipThreadIdx_x] = static_cast(srcPtr[srcIdx]); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial sums required for computing mean along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + 
hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute mean along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + accum += static_cast(srcPtr[srcIdx + tid_x]); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + meanTensor[dstIdx] = partialSum_smem[0] / static_cast(lengthX * lengthZ); + } + // compute partial sums required for computing mean along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_mean_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * 
tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + atomicAdd(&meanTensor[paramBase + paramIndex], static_cast(srcPtr[srcIdx])); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + atomicAdd(&sh_mem[paramIndex], static_cast(srcPtr[srcIdx])); + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 5 - stddev compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_stddev_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *stdDevTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise stdDev + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_x; + float mean = meanTensor[paramIndex]; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNH.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(height)); + } + } + // compute partial mean subtracted squared sums needed for row wise stdDev + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + 
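// Reference sketch (hypothetical helper, not code from this patch): the mean and stddev kernels in this
// file all accumulate a per-thread partial value, here (x - mean)^2, and then fold the partial sums with
// reduction_sum_x_hip, which halves the number of active lanes each step. A scalar equivalent of that fold:
static float tree_reduce_sum(float *buf, unsigned n)                 // n must be a power of two, e.g. the block size
{
    for (unsigned active = n / 2; active >= 1; active /= 2)          // same loop shape as reduction_sum_x_hip
        for (unsigned i = 0; i < active; i++)
            buf[i] += buf[i + active];                               // buf plays the role of partialSum_smem
    return buf[0];                                                   // the block-level sum ends up in element 0
}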
rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial mean subtracted squared sums need for computing stdDev over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + float mean = meanTensor[id_z]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_stddev_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + 
hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute stddev along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthZ)); + } + // compute stddev along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.z; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthY)); + } + // compute stddev along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + + stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast(lengthX)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint paramIndex = id_z * maxParamVolume + xIndex; + float mean = meanTensor[paramIndex]; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + float val = static_cast(srcPtr[srcIdx]) - mean; + partialSumRowPtr_smem[hipThreadIdx_x] = (val * val); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; 
threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial mean subtracted squared sums required for computing stdDev along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute stddev along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + float val = (static_cast(srcPtr[srcIdx + tid_x]) - mean); + accum += (val * val); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast(lengthX * lengthZ)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ 
float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + uint paramIndex = id_z * maxParamVolume; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_stddev_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? 
((coord % paramShape[i]) * paramStrides[i]) : 0; + } + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val)); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + { + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&sh_mem[paramIndex], (val * val)); + } + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) -------------------- + +__global__ void reduce_final_result_hip(float *partialSumTensor, + uint numPartialSums, + float *meanTensor, + float *stdDevTensor, + bool isMean, + uint *roiTensor, + uint axisMask, + uint tensorDims) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint meanFactor; + if (tensorDims == 3) + { + uint lengthZ = roi[0]; + uint lengthY = roi[1]; + uint lengthX = roi[2]; + + if (axisMask == 3) + meanFactor = lengthZ * lengthY; + else if (axisMask == 6) + meanFactor = lengthY * lengthX; + else if (axisMask == 7) + meanFactor = lengthZ * lengthY * lengthX; + } + else if (tensorDims == 2) + { + uint lengthY = roi[0]; + uint lengthX = roi[1]; + + if (axisMask == 2) + meanFactor = lengthX; + else if (axisMask == 3) + meanFactor = lengthY * lengthX; + } + + __shared__ float partialSum_smem[16]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + float accum = 0.0f; + while(id_x < numPartialSums) + { + uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x; + accum += partialSumTensor[srcIdx]; + id_x += hipBlockDim_x; + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + if (isMean) + meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor; + else + stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor); + } +} + +__global__ void final_reduction_nd_hip_tensor(float *meanTensor, + float *stdDevTensor, + uint *paramShapeTensor, + uint *roiTensor, + uint tensorDims, + uint maxParamVolume, + bool isMean) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint divisionFactor = 1; + uint paramVolume = 1; + for(int i = 0; i < tensorDims; i++) + { + paramVolume *= paramShape[i]; + if (paramShape[i] == 1) + divisionFactor *= roi[i]; + } + + if 
(id_x >= paramVolume) + return; + + uint paramIndex = id_z * maxParamVolume + id_x; + if (isMean) + meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor; + else + stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor); +} + +// -------------------- Set 7 - mean and stddev compute kernels launch helpers -------------------- + +void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Y direction + case 1: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along X direction + case 2: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XY direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Z direction + case 1: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along Y direction + case 2: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along YZ direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * 
globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along X direction + case 4: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XZ direction + case 5: + { + localThreads_x = 32; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = srcGenericDescPtr->dims[2]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XY direction + case 6: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XYZ direction + case 7: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3; + globalThreads_x = static_cast (ceil((float)numValues / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +// -------------------- Set 8 - mean and stddev compute kernels executor -------------------- + +template +RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + bool isMean, + Rpp32u *roiTensor, + Rpp32u axisMask, + Rpp32u tensorDims, + Rpp32u maxParamVolume, + Rpp32u *paramShape, + Rpp32u *paramStrides, + rpp::Handle& handle) +{ + Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32u partialSumArrLength, partialSumBlocksPerSample; + + int globalThreads_x, globalThreads_y, globalThreads_z; + int localThreads_x, localThreads_y, localThreads_z; + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, 
localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + + if (axisMask == 2) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + else if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else if (tensorDims == 3) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + stdDevTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + + // perform final reduction on block wise sums for below cases + // reduce on YZ partial sums + if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XY partial sums + if (axisMask == 6) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XYZ block partial sums + else if (axisMask == 7) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else + { + // interpret the input as 1D 
tensor + globalThreads_x = srcGenericDescPtr->strides[0]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + Rpp32u batchSize = globalThreads_z; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + + Rpp32u shared_memory_size = 0; + Rpp32u block_size = 1024; + if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE) + { + if (maxParamVolume <= 32) + shared_memory_size = 32; + else if (maxParamVolume <= 64) + shared_memory_size = 64; + else if (maxParamVolume <= 128) + shared_memory_size = 128; + else if (maxParamVolume <= 256) + shared_memory_size = 256; + else if (maxParamVolume <= 512) + shared_memory_size = 512; + else + shared_memory_size = MAX_SHARED_MEMORY_SIZE; + block_size = shared_memory_size; + } + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + else + { + hipLaunchKernelGGL(compute_stddev_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + stdDevTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + hipLaunchKernelGGL(final_reduction_nd_hip_tensor, + dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + meanTensor, + stdDevTensor, + paramShape, + roiTensor, + tensorDims, + maxParamVolume, + isMean); + } + hipStreamSynchronize(handle.GetStream()); + return RPP_SUCCESS; +} + +// -------------------- Set 9 - normalization kernel executor -------------------- + +template +RppStatus hip_exec_normalize_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u batchSize = srcGenericDescPtr->dims[0]; + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + // create buffer for paramShape and paramStride needed for generic kernel + Rpp32u *paramShape, *paramStrides; + paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem; + paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims); + + // do initial preprocessing, compute maxParamVolue and fill the values for paramShape and paramStrides + Rpp32u maxParamVolume; + if (tensorDims == 2 || tensorDims == 3) + normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims, + axisMask, maxParamVolume); + else + normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask, + paramShape, paramStrides, maxParamVolume); + + bool computeMean = computeMeanStddev & 1; // if 0th bit in computeMeanStddev is set, computeMean is set to true. Otherwise it is set to false + bool computeStdDev = computeMeanStddev & 2; // if 1st bit in computeMeanStddev is set, computeStdDev is set to true. 
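// Hedged illustration from the editor, not patch content: per the decode just above,
// computeMeanStddev packs two booleans into one Rpp8u. A caller-side helper might look
// like this (the helper name is illustrative only):
static inline Rpp8u make_compute_mean_stddev_flag(bool computeMeanInside, bool computeStdDevInside)
{
    return static_cast<Rpp8u>((computeMeanInside ? 1 : 0) |    // bit 0 -> RPP computes the mean
                              (computeStdDevInside ? 2 : 0));  // bit 1 -> RPP computes the stddev
}
// e.g. a value of 3 asks RPP to derive both statistics from the input, while 0 uses the
// caller-supplied meanTensor/stdDevTensor unchanged.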
Otherwise it is set to false + if ((!computeMean) && (!computeStdDev)) + maxParamVolume = 0; + + // if computeMean is set, compute mean values by processing over input based on axisMask values + if (computeMean) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values + if (computeStdDev) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(normalize_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + make_uint2(maxParamVolume, axisMask), + computeStdDev); + } + else if (tensorDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(normalize_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &meanTensor[batchCount * maxParamVolume], + &stdDevTensor[batchCount * maxParamVolume], + make_float2(scale, shift), + &roiTensor[batchCount * 6], + axisMask, + computeStdDev); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = dstGenericDescPtr->strides[0]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + hipLaunchKernelGGL(normalize_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + dstPtr, + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + paramShape, + paramStrides, + make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]), + tensorDims, + computeStdDev); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/slice.hpp b/src/modules/hip/kernel/slice.hpp index 8deb52bbb..c1b7a6c41 100644 --- a/src/modules/hip/kernel/slice.hpp +++ 
b/src/modules/hip/kernel/slice.hpp @@ -2,24 +2,79 @@ #include #include "rpp_hip_common.hpp" +template +__global__ void fill_value_ncdhw_hip_tensor(T *dstPtr, + uint3 dstStridesCDH, + int channels, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + d_float8 val_f8; + val_f8.f4[0] = (float4)(*fillValue); + val_f8.f4[1] = val_f8.f4[0]; + for(int c = 0; c < channels; c++) + { + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + dstIdx += dstStridesCDH.x; + } +} + + +template +__global__ void fill_value_ndhwc_hip_tensor(T *dstPtr, + uint2 dstStridesDH, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + d_float24 val_f24; + val_f24.f4[0] = (float4)(*fillValue); + val_f24.f4[1] = val_f24.f4[0]; + val_f24.f4[2] = val_f24.f4[0]; + val_f24.f4[3] = val_f24.f4[0]; + val_f24.f4[4] = val_f24.f4[0]; + val_f24.f4[5] = val_f24.f4[0]; + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + + template __global__ void slice_ncdhw_hip_tensor(T *srcPtr, uint3 srcStridesCDH, T *dstPtr, uint3 dstStridesCDH, int channels, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x); + uint srcIdx = (id_z * srcStridesCDH.y) + (id_y * srcStridesCDH.z) + id_x; uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; d_float8 val_f8; @@ -32,77 +87,439 @@ __global__ void slice_ncdhw_hip_tensor(T *srcPtr, } } + template __global__ void slice_ndhwc_hip_tensor(T *srcPtr, uint2 srcStridesDH, T *dstPtr, uint2 dstStridesDH, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x 
>= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x) * 3; - uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + uint srcIdx = (id_z * srcStridesDH.x) + (id_y * srcStridesDH.y) + (id_x * 3); + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + (id_x * 3); d_float24 val_f24; rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); } +template +RppStatus hip_exec_fill_value_tensor(T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + Rpp32u *roiTensor, + rpp::Handle& handle, + Rpp32u numDims) +{ + if (numDims == 4) + { + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + + // checking if padding is required + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + + // if needPadding is set, launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + 
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + + // check if padding is needed + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, shape[1], shape[2]), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = 
roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + + // check if padding is needed + bool needPadding = (((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, shape[0], shape[1]), + fillValue); + } + } + } + else if (numDims == 1) + { + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + + // check if padding is needed + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, 1, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, shape[0]), + fillValue); + } + } + } + + return RPP_SUCCESS; +} + template RppStatus hip_exec_slice_tensor(T *srcPtr, RpptGenericDescPtr srcGenericDescPtr, T *dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + bool enablePadding, + Rpp32u *roiTensor, rpp::Handle& handle) { - if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + /* if enabledPadding is set to true, launch kernel to fill the output buffers with fill value specified. 
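// Hedged worked example from the editor, not patch content, of the per-sample arithmetic used
// throughout the slice/fill executors: each sample's roiTensor entry holds numDims begin values
// followed by numDims lengths, the copied extent per dimension is clamped to what the ROI can
// supply, and padding is only required when anchor + shape overruns the ROI length:
//   Rpp32u validExtent = std::min(shape[d], length[d] - anchor[d]);
//   bool   needPadding = (anchor[d] + shape[d]) > length[d];
// For a 2D HW sample with length {200, 300}, anchor {150, 0}, shape {100, 300}:
//   validH = min(100, 200 - 150) = 50  -> rows 150..199 of the source are copied
//   validW = min(300, 300 - 0) = 300   -> the full width is copied
//   needPadding holds for H            -> the last 50 output rows receive the fill value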
+ This will be only done if shapeTensor[d] > roiTensor[d] where d is the dimension*/ + if (enablePadding) + { + hip_exec_fill_value_tensor(dstPtr, + dstGenericDescPtr, + anchorTensor, + shapeTensor, + fillValue, + roiTensor, + handle, + numDims); + hipStreamSynchronize(handle.GetStream()); + } + + if(numDims == 4) { - int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2] * srcGenericDescPtr->strides[3] + anchor[3]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + 
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth)); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], 0, srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(1, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth)); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = 
&shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[2] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), - dstGenericDescPtr->dims[1], - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, maxHeight, maxWidth)); } } - else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + else if (numDims == 1) { - int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) - + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, 1, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, 1), + dstPtrTemp, + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, maxLength)); } } diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp index e35e5c514..3285ee756 100644 --- a/src/modules/rppi_validate.hpp +++ b/src/modules/rppi_validate.hpp @@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels) } else if(layout == RpptLayout::NHWC || 
layout == RpptLayout::NDHWC) { - if (channels == 3) // PKD3 - { - layoutParams.channelParam = 1; - layoutParams.bufferMultiplier = 3; - } + //PKD + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = channels; } return layoutParams; } diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index e2c9b18f7..0267985e5 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -32,8 +32,8 @@ SOFTWARE. RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, - Rpp32f *detectedIndexTensor, - Rpp32f *detectionLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, @@ -155,6 +155,88 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, } } +/******************** spectrogram ********************/ + +RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcLengthTensor, + bool centerWindows, + bool reflectPadding, + Rpp32f *windowFunction, + Rpp32s nfft, + Rpp32s power, + Rpp32s windowLength, + Rpp32s windowStep, + rppHandle_t rppHandle) +{ + if ((dstDescPtr->layout != RpptLayout::NFT) && (dstDescPtr->layout != RpptLayout::NTF)) return RPP_ERROR_INVALID_DST_LAYOUT; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + spectrogram_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcLengthTensor, + centerWindows, + reflectPadding, + windowFunction, + nfft, + power, + windowLength, + windowStep, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} + +/******************** mel_filter_bank ********************/ + +RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s* srcDimsTensor, + Rpp32f maxFreq, + Rpp32f minFreq, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rppHandle_t rppHandle) +{ + if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT; + if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + mel_filter_bank_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + maxFreq, + minFreq, + melFormula, + numFilter, + sampleRate, + normalize, + rpp::deref(rppHandle)); + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} + /******************** resample ********************/ RppStatus rppt_resample_host(RppPtr_t srcPtr, diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index 45a0d5221..d758aa676 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1050,43 +1050,54 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + RppPtr_t fillValue, + bool enablePadding, + Rpp32u *roiTensor, rppHandle_t rppHandle) { + if ((srcGenericDescPtr->dataType != 
RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; + if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH; + RppLayoutParams layoutParams; if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); - - if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; - if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + else if ((srcGenericDescPtr->layout == RpptLayout::NCHW) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { - slice_f32_f32_host_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), - srcGenericDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), - dstGenericDescPtr, - roiGenericPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + slice_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); } else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) { - slice_u8_u8_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, - srcGenericDescPtr, - static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, - dstGenericDescPtr, - roiGenericPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + slice_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); } return RPP_SUCCESS; @@ -1848,24 +1859,29 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, + Rpp32s *anchorTensor, + 
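// Hedged usage sketch from the editor, not patch content, for the reworked slice entry points;
// descriptor and handle setup are assumed to happen elsewhere and all buffer names are
// illustrative. For a single 2D F32 sample of size 128x256:
//   Rpp32s anchor[2] = {10, 20};            // per-dim slice start
//   Rpp32s shape[2]  = {100, 150};          // per-dim slice extent
//   Rpp32u roi[4]    = {0, 0, 128, 256};    // numDims begins followed by numDims lengths
//   Rpp32f fill      = 0.0f;
//   rppt_slice_host(srcPtr, srcGenericDescPtr, dstPtr, dstGenericDescPtr,
//                   anchor, shape, &fill, true /*enablePadding*/, roi, handle);
// rppt_slice_gpu appears to take the same parameter list, with HIP-accessible src/dst buffers.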
Rpp32s *shapeTensor, + RppPtr_t fillValue, + bool enablePadding, + Rpp32u *roiTensor, rppHandle_t rppHandle) { #ifdef HIP_COMPILE - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { - hip_exec_slice_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + hip_exec_slice_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), srcGenericDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), dstGenericDescPtr, - roiGenericPtrSrc, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, rpp::deref(rppHandle)); } else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) @@ -1874,7 +1890,11 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, srcGenericDescPtr, static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, dstGenericDescPtr, - roiGenericPtrSrc, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, rpp::deref(rppHandle)); } diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f14e73a1a..dd151ec7d 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -241,6 +241,100 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** normalize_ND ********************/ + +RppStatus rppt_normalize_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + Rpp32u tensorDim = srcGenericDescPtr->numDims - 1; + if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); + else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]); + + if ((srcGenericDescPtr->dataType == RpptDataType::U8) 
&& (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + normalize_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + normalize_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} /******************** tensor_mean ********************/ @@ -402,6 +496,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, /********************************************************************************************************************/ #ifdef GPU_SUPPORT + /******************** tensor_sum ********************/ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, @@ -480,7 +575,6 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, } /******************** tensor_min ********************/ - RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, @@ -609,6 +703,87 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, #endif // backend } +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + 
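// Hedged usage sketch from the editor, not patch content, for the new normalize entry points;
// descriptors, device buffers and the handle are assumed to exist, and meanTensor/stdDevTensor
// only need caller-filled values for whichever computeMeanStddev bits are left unset:
//   Rpp32u axisMask = 3;           // reduce over axes 0 and 1 of each sample
//   Rpp8u  computeMeanStddev = 3;  // bit 0: compute mean, bit 1: compute stddev
//   rppt_normalize_gpu(d_input, srcGenericDescPtr, d_output, dstGenericDescPtr,
//                      axisMask, d_mean, d_stdDev, computeMeanStddev,
//                      1.0f /*scale*/, 0.0f /*shift*/, roiTensor, handle);
// rppt_normalize_host mirrors the same parameter list for host buffers.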
dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** tensor_mean ********************/ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, @@ -761,5 +936,5 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; #endif // backend } -#endif // GPU_SUPPORT +#endif // GPU_SUPPORT \ No newline at end of file diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 82ed65309..77052cabe 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -83,7 +83,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) @@ -105,7 +105,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_hip_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index 26017065e..a0bd42fa0 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_hip Tensor_hip.cpp) + add_executable(Tensor_misc_hip Tensor_misc_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17") target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) + target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) else() message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 6008f33d3..fca310947 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ 
b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -334,11 +334,20 @@ int main(int argc, char **argv) bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91 else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); + CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); if(testCase == 91) CHECK_RETURN_STATUS(hipHostMalloc(&mean, reductionFuncResultArrLength * bitDepthByteSize)); } + // create generic descriptor and params in case of slice + RpptGenericDesc descriptor3D; + RpptGenericDescPtr descriptorPtr3D = &descriptor3D; + Rpp32s *anchorTensor = NULL, *shapeTensor = NULL; + Rpp32u *roiTensor = NULL; + if(testCase == 92) + set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize); + // Allocate hip memory for src/dst CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize)); CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize)); @@ -349,6 +358,7 @@ int main(int argc, char **argv) if(testCase == 82) CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI))); + // create cropRoi and patchRoi in case of crop_and_patch RpptROI *cropRoi, *patchRoi; if(testCase == 33) { @@ -1168,6 +1178,28 @@ int main(int argc, char **argv) break; } + case 92: + { + testCaseName = "slice"; + Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims + if(anchorTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * numDims * sizeof(Rpp32s))); + if(shapeTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * numDims * sizeof(Rpp32s))); + if(roiTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * numDims * 2 * sizeof(Rpp32u))); + bool enablePadding = false; + auto fillValue = 0; + init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor); + + startWallTime = omp_get_wtime(); + if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout) + rppt_slice_gpu(d_input, descriptorPtr3D, d_output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; @@ -1262,6 +1294,42 @@ int main(int argc, char **argv) refFile.close(); } + // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values + // for output display and comparision purposes + if (testCase == 92) + { + if (dstDescPtr->layout == RpptLayout::NCHW) + { + if (dstDescPtr->c == 3) + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 3; + dstImgSizes[i].height = shapeTensor[idx1 + 1]; + dstImgSizes[i].width = shapeTensor[idx1 + 2]; + } + } + else + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 2; + dstImgSizes[i].height = shapeTensor[idx1]; + dstImgSizes[i].width = shapeTensor[idx1 + 1]; + } + } + } + else if (dstDescPtr->layout == RpptLayout::NHWC) + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 3; + dstImgSizes[i].height = shapeTensor[idx1]; + dstImgSizes[i].width = shapeTensor[idx1 + 1]; + } + } + } + /*Compare the output of the function with golden outputs only if 1.QA Flag is set 2.input bit depth 0 (Input U8 && Output U8) @@ -1328,6 +1396,12 @@ int main(int argc, char **argv) if(testCase == 91) CHECK_RETURN_STATUS(hipHostFree(mean)); } + if(anchorTensor != NULL) + 
CHECK_RETURN_STATUS(hipHostFree(anchorTensor)); + if(shapeTensor != NULL) + CHECK_RETURN_STATUS(hipHostFree(shapeTensor)); + if(roiTensor != NULL) + CHECK_RETURN_STATUS(hipHostFree(roiTensor)); free(input); free(input_second); free(output); diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp new file mode 100644 index 000000000..96197f432 --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -0,0 +1,231 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "../rpp_test_suite_misc.h" + +int main(int argc, char **argv) +{ + // Handle inputs + const int MIN_ARG_COUNT = 9; + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_misc_hip