diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7be3d2fd4..916a0a0ad 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -# Documentation files -docs/* @saadrahim @LisaDelaney -*.md @saadrahim @LisaDelaney -*.rst @saadrahim @LisaDelaney -# Header directory -library/include/* @saadrahim @LisaDelaney @kiritigowda @rrawther # Source code @kiritigowda @rrawther +# Documentation files +docs/* @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation +# Header directory +library/include/* @ROCm/rocm-documentation @kiritigowda @rrawther diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 663d3c085..0d7834e2b 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -47,7 +47,7 @@ ci: { def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['gfx908'], ubuntu20:['gfx906'], centos8:['gfx908']])] + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['gfx90a'], ubuntu22:['gfx1101'], sles15sp1:['gfx908'], rhel8:['gfx1030'], rhel9:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each diff --git a/CMakeLists.txt b/CMakeLists.txt index 34ce6fcac..224125b36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -346,6 +346,7 @@ install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h # install Test install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_suite/ DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/rpp-performancetests DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) # set license information set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 0c9b63672..8ecbd3663 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==0.33.0 +rocm-docs-core[api_reference]==0.35.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index f7bc7e2c1..ea1c7619a 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -112,7 +112,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==0.33.0 +rocm-docs-core[api-reference]==0.35.0 # via # -r requirements.in # rocm-docs-core diff --git a/include/rppdefs.h b/include/rppdefs.h index 2beafbc0c..b0baf7d34 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -116,8 +116,8 @@ typedef enum RPP_ERROR_NOT_ENOUGH_MEMORY = -16, /*! \brief Out of bound source ROI \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17, - /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ - RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18 + /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_CHANNELS = -18 } RppStatus; /*! 
\brief RPP rppStatus_t type enums diff --git a/include/rppi_arithmetic_operations.h b/include/rppi_arithmetic_operations.h index 0fb79dbf6..17aef722d 100644 --- a/include/rppi_arithmetic_operations.h +++ b/include/rppi_arithmetic_operations.h @@ -320,4 +320,4 @@ RppStatus rppi_tensor_multiply_u8_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RppPtr } #endif -#endif \ No newline at end of file +#endif diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index 0a247f886..51705eefc 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -30,7 +30,7 @@ SOFTWARE. * \brief RPPT Tensor Arithmetic operation Functions. * * \defgroup group_tensor_arithmetic Operations: AMD RPP Tensor Arithmetic Operations - * \brief Tensor Color Augmentations. + * \brief Tensor Arithmetic Operations. */ #include "rpp.h" @@ -39,53 +39,221 @@ SOFTWARE. extern "C" { #endif -/*! \brief Fmadd augmentation HOST +/*! + * \file + * \brief RPPT Tensor Operations - Arithmetic Operations. + * \defgroup group_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations. + * \brief RPPT Tensor Operations - Arithmetic Operations. + */ + +/*! \addtogroup group_rppt_tensor_arithmetic_operations + * @{ + */ + +/*! \brief Fused multiply add scalar augmentation on HOST backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HOST memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Host-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); - -/*! \brief Fmadd augmentation GPU +#ifdef GPU_SUPPORT +/*! \brief Fused multiply add scalar augmentation on HIP backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. 
- * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HIP memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Hip-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Add scalar augmentation on HOST backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values for used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT -RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +/*! \brief Add scalar augmentation on HIP backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 dataype. 
+ * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values for used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Subtract scalar augmentation on HOST backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values for used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Subtract scalar augmentation on HIP backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values for used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. 
+ */ +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Multiply scalar augmentation on HOST backend + * \details This function performs the multiplication operation on a batch of 4D tensors. + * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] mulTensor multiplier values for used for multiplication (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Multiply scalar augmentation on HIP backend + * \details This function performs the multiplication operation on a batch of 4D tensors. + * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] mulTensor multiplier values for used for multiplication (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Magnitude computation on HOST backend for a NCHW/NHWC layout tensor + * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
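[Editor's note] The scalar arithmetic entry points above (rppt_fused_multiply_add_scalar, rppt_add_scalar, rppt_subtract_scalar and rppt_multiply_scalar, in HOST and HIP variants) share one calling pattern. Below is a minimal HOST-side sketch for the fused multiply-add variant, for orientation only: the tensor sizes and scalar values are invented, the generic-descriptor field names (numDims, offsetInBytes, dataType, dims, strides) follow rppdefs.h and should be checked against your RPP version, and the exact rppCreateWithBatchSize()/rppDestroyHost() signatures may differ between releases. The add/subtract/multiply variants take a single per-image scalar tensor instead of the mul/add pair.

#include "rpp.h"
#include <vector>

void fmadd_scalar_host_example()
{
    // Illustrative 5D NCDHW f32 batch: 2 volumes of 1 x 4 x 8 x 8 voxels.
    Rpp32u batchSize = 2, channels = 1, depth = 4, height = 8, width = 8;
    Rpp32u bufferSize = batchSize * channels * depth * height * width;

    RpptGenericDesc desc;
    desc.numDims = 5;                                // N, C, D, H, W
    desc.offsetInBytes = 0;
    desc.dataType = RpptDataType::F32;
    desc.layout = RpptLayout::NCDHW;
    desc.dims[0] = batchSize; desc.dims[1] = channels; desc.dims[2] = depth;
    desc.dims[3] = height;    desc.dims[4] = width;
    desc.strides[4] = 1;                             // packed NCDHW strides
    desc.strides[3] = width;
    desc.strides[2] = width * height;
    desc.strides[1] = width * height * depth;
    desc.strides[0] = width * height * depth * channels;
    RpptGenericDesc srcDesc = desc, dstDesc = desc;  // same shape and layout for src and dst

    std::vector<Rpp32f> src(bufferSize, 1.0f), dst(bufferSize);
    std::vector<Rpp32f> mul(batchSize, 2.0f), add(batchSize, 0.5f);   // one scalar pair per volume

    // Full-volume ROI per image, in XYZWHD form (same initialization style as the host kernels).
    RpptROI3D fullRoi = {0, 0, 0, (Rpp32s)width, (Rpp32s)height, (Rpp32s)depth};
    std::vector<RpptROI3D> roi(batchSize, fullRoi);

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, batchSize);      // newer RPP versions may also take a thread count

    RppStatus status = rppt_fused_multiply_add_scalar_host(src.data(), &srcDesc,
                                                           dst.data(), &dstDesc,
                                                           mul.data(), add.data(),
                                                           roi.data(), RpptRoi3DType::XYZWHD,
                                                           handle);
    (void)status;                                    // expected: dst[i] = src[i] * 2.0f + 0.5f
    rppDestroyHost(handle);                          // teardown; verify the exact name in rpp.h
}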
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html magnitude_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Magnitude computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr.
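[Editor's note] For the rppt_magnitude_host/rppt_magnitude_gpu pair documented here, a minimal HOST-side sketch follows, for illustration only: the image sizes are invented, the RpptDesc and RpptStrides member names follow rppdefs.h and should be verified for your RPP version, and the handle is assumed to be created as in the earlier example.

#include "rpp.h"
#include <vector>

// Sketch only: handle is assumed to be an RPP HOST handle created with rppCreateWithBatchSize().
void magnitude_host_example(rppHandle_t handle)
{
    Rpp32u n = 2, h = 150, w = 150, c = 3;           // illustrative NHWC U8 batch

    RpptDesc srcDesc, dstDesc;
    srcDesc.numDims = 4;
    srcDesc.offsetInBytes = 0;
    srcDesc.dataType = RpptDataType::U8;
    srcDesc.layout = RpptLayout::NHWC;
    srcDesc.n = n; srcDesc.h = h; srcDesc.w = w; srcDesc.c = c;
    srcDesc.strides.nStride = h * w * c;             // packed NHWC strides
    srcDesc.strides.hStride = w * c;
    srcDesc.strides.wStride = c;
    srcDesc.strides.cStride = 1;
    dstDesc = srcDesc;                               // same type and layout as src, per the restrictions above

    std::vector<Rpp8u> src1(n * h * w * c), src2(n * h * w * c), dst(n * h * w * c);

    RpptROI fullRoi;
    fullRoi.xywhROI = {{0, 0}, (Rpp32s)w, (Rpp32s)h};   // full-image ROI in XYWH form
    std::vector<RpptROI> roi(n, fullRoi);

    rppt_magnitude_host(src1.data(), src2.data(), &srcDesc,
                        dst.data(), &dstDesc,
                        roi.data(), RpptRoiType::XYWH, handle);
    // Each output pixel is the magnitude of the corresponding src1/src2 pair,
    // kept at the same bit depth as the inputs.
}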
+ * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html magnitude_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! @} + */ + #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H +#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H \ No newline at end of file diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index 138b3baa8..31bb34eff 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -95,7 +95,22 @@ RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ */ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, Rpp32f *coeffTensor, RpptAudioBorderType borderType, rppHandle_t rppHandle); +/*! \brief Down Mixing augmentation on HOST backend +* \details Down Mixing augmentation for audio data +* \param[in] srcPtr source tensor in HOST memory +* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param[out] dstPtr destination tensor in HOST memory +* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \return A \ref RppStatus enumeration. +* \retval RPP_SUCCESS Successful completion. +* \retval RPP_ERROR* Unsuccessful completion. 
+*/ +RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); + #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h index deabd885d..99909cb42 100644 --- a/include/rppt_tensor_color_augmentations.h +++ b/include/rppt_tensor_color_augmentations.h @@ -417,6 +417,48 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RppPtr_t lutPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Color Temperature augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The color temperature augmentation does a image temperature adjustment operation, taking a pixel adjustment value as argument for each image in a batch of RGB(3 channel) with an NHWC/NCHW tensor layout.
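[Editor's note] For rppt_down_mixing_host, declared above in rppt_tensor_audio_augmentations.h, a short HOST-side sketch follows. It is illustrative only: the clip lengths and channel counts are invented, and srcPtr, dstPtr, srcDescPtr, dstDescPtr and handle are assumed to describe an F32 audio batch prepared the same way as for the other audio augmentations in that header.

// Sketch only: srcPtr/dstPtr point to padded F32 audio buffers for a batch of 2 clips,
// described by srcDescPtr/dstDescPtr (numDims = 3, dataType = F32); handle is an RPP HOST handle.
void down_mixing_host_example(Rpp32f *srcPtr, RpptDescPtr srcDescPtr,
                              Rpp32f *dstPtr, RpptDescPtr dstDescPtr,
                              rppHandle_t handle)
{
    // One (length, channels) pair per clip in the batch: batchSize * 2 values.
    Rpp32s srcDims[4] = {44100, 2,      // clip 0: 44100 samples, stereo
                         22050, 2};     // clip 1: 22050 samples, stereo
    bool normalizeWeights = false;      // see the normalizeWeights description above

    rppt_down_mixing_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                          srcDims, normalizeWeights, handle);
}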
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The color temperature augmentation does a image temperature adjustment operation, taking a pixel adjustment value as argument for each image in a batch of RGB(3 channel) with an NHWC/NCHW tensor layout.
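[Editor's note] A short HOST-side sketch for rppt_color_temperature_host, declared above, purely for illustration: the adjustment values are arbitrary (within the documented -100 to 100 range), and the buffers, descriptors and handle are assumed to describe a 3-channel U8 NHWC batch set up as in the other color augmentations. Note that, per the declarations, the HIP variant takes Rpp32s adjustment values while the HOST variant takes Rpp8s.

#include "rpp.h"
#include <vector>

// Sketch only: src/dst and their descriptors are assumed to describe a 3-channel U8 NHWC batch.
void color_temperature_host_example(Rpp8u *srcPtr, RpptDescPtr srcDescPtr,
                                    Rpp8u *dstPtr, RpptDescPtr dstDescPtr,
                                    rppHandle_t handle)
{
    Rpp32u batchSize = srcDescPtr->n;
    std::vector<Rpp8s> adjustment(batchSize, 0);
    adjustment[0] = 70;                  // positive values raise R and lower B (warmer), per the SIMD helpers below
    if (batchSize > 1)
        adjustment[1] = -70;             // negative values shift toward cooler tones

    RpptROI fullRoi;
    fullRoi.xywhROI = {{0, 0}, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
    std::vector<RpptROI> roi(batchSize, fullRoi);

    rppt_color_temperature_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                adjustment.data(), roi.data(), RpptRoiType::XYWH, handle);
}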
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 181b1c565..3cb49a82b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -24,6 +24,7 @@ SOFTWARE. #ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H #define RPPT_TENSOR_STATISTICAL_OPERATIONS_H + #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus @@ -77,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HOST memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HIP memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, Rpp32u imageMinArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HOST memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
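[Editor's note] The new rppt_tensor_min/rppt_tensor_max reductions return their results through a caller-provided array rather than a destination tensor. A HOST-side sketch follows, illustrative only: it assumes a U8 batch whose descriptor, ROI array and handle are set up as in the earlier examples, and the per-image output ordering (R, G, B, overall for 3-channel inputs) is inferred from the channel-wise reduction helpers added in rpp_cpu_common.hpp.

#include "rpp.h"
#include <vector>

// Sketch only: srcPtr/srcDescPtr describe a U8 batch, roi holds one ROI per image.
void tensor_min_max_host_example(Rpp8u *srcPtr, RpptDescPtr srcDescPtr,
                                 RpptROIPtr roi, rppHandle_t handle)
{
    Rpp32u batchSize = srcDescPtr->n;
    // Per the restrictions above: n values for single-channel inputs, n * 4 for 3-channel inputs.
    Rpp32u outLength = (srcDescPtr->c == 1) ? batchSize : batchSize * 4;

    std::vector<Rpp8u> minArr(outLength), maxArr(outLength);   // same bit depth as the source tensor

    rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), outLength,
                         roi, RpptRoiType::XYWH, handle);
    rppt_tensor_max_host(srcPtr, srcDescPtr, maxArr.data(), outLength,
                         roi, RpptRoiType::XYWH, handle);
    // For c == 3, entries 4*i .. 4*i+3 are expected to hold the R, G, B and overall
    // extrema of image i; for c == 1 each image contributes a single value.
}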
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HIP memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 67c34de70..1e748cc86 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -2431,6 +2431,24 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP // Compute Functions for RPP Tensor API +inline void compute_multiply_16_host(__m256 *p, __m256 *pMulParam) +{ + p[0] = _mm256_mul_ps(p[0], pMulParam[0]); // multiply adjustment + p[1] = _mm256_mul_ps(p[1], pMulParam[0]); // multiply adjustment +} + +inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam) +{ + p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]); // subtract adjustment + p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]); // subtract adjustment +} + +inline void compute_add_16_host(__m256 *p, __m256 *pAddParam) +{ + p[0] = _mm256_add_ps(p[0], pAddParam[0]); // add adjustment + p[1] = _mm256_add_ps(p[1], pAddParam[0]); // add adjustment +} + inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams) { p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]); @@ -3032,6 +3050,22 @@ inline void compute_color_cast_12_host(__m128 *p, __m128 pMul, __m128 *pAdd) p[2] = _mm_fmadd_ps(_mm_sub_ps(p[2], pAdd[2]), pMul, pAdd[2]); // color_cast adjustment Rs } +inline void compute_color_temperature_48_host(__m256 *p, __m256 pAdj) +{ + p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs + p[1] = _mm256_add_ps(p[1], pAdj); // color_temperature adjustment Rs + // no color_temperature adjustment Gs + p[4] = _mm256_sub_ps(p[4], pAdj); // color_temperature adjustment Bs + p[5] = _mm256_sub_ps(p[5], pAdj); // color_temperature adjustment Bs +} + +inline void compute_color_temperature_24_host(__m256 *p, __m256 pAdj) +{ + p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs + // no color_temperature adjustment Gs + p[2] = _mm256_sub_ps(p[2], pAdj); // color_temperature adjustment Bs +} + inline void compute_xywh_from_ltrb_host(RpptROIPtr roiPtrInput, RpptROIPtr roiPtrImage) { 
roiPtrImage->xywhROI.xy.x = roiPtrInput->ltrbROI.lt.x; @@ -5962,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4 } -#endif //RPP_CPU_COMMON_H \ No newline at end of file +inline void reduce_min_32_host(__m256i *pMin, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm256_castsi256_si128(pMin[0]); + px[1] = _mm256_extracti128_si256(pMin[0], 1); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi8(zero, px[0]); + px[0] = _mm_unpackhi_epi8(zero, px[0]); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi16(zero, px[0]); + px[0] = _mm_unpackhi_epi16(zero, px[0]); + px[0] = _mm_min_epu16(px[0], px[1]); + px[1] = _mm_unpacklo_epi32(zero, px[0]); + px[0] = _mm_unpackhi_epi32(zero, px[0]); + px[0] = _mm_min_epu32(px[0], px[1]); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_32_host(__m256i *pMax, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = 
_mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_float8_host(__m256 *p1, __m256 *pMin) +{ + pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min +} + +inline void reduce_min_float8_host(__m256 *pMin, __m128 *result) +{ + __m128 px; + px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1)); + px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB) +{ + pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min + pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min + pMinB[0] = 
_mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min +} + +inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // TO CHANGE +{ + __m128 px[2]; + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1)); + px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1)); + px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1)); + px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void compute_max_float8_host(__m256 *p1, __m256 *pMax) +{ + pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global min +} + +inline void reduce_max_float8_host(__m256 *pMax, __m128 *result) +{ + __m128 px; + px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1)); + px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB) +{ + pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global min + pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global min + pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global min +} + +inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result) +{ + __m128 px[2]; + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1)); + px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1)); + px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1)); + px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void reduce_min_i32_host(__m256i *pMin, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1)); + px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i 
*result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_i32_host(__m256i *pMax, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(zero, px[0]), _mm_unpackhi_epi8(zero, px[0])); + px[0] = _mm_max_epi16(_mm_unpacklo_epi16(zero, px[0]), _mm_unpackhi_epi16(zero, px[0])); + px[0] = _mm_max_epi32(_mm_unpacklo_epi32(zero, px[0]), _mm_unpackhi_epi32(zero, px[0])); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), 
_mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +#endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 84c898b90..d03ec0e79 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -75,7 +75,7 @@ typedef union #define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name) -const __m128 xmm_p0 = _mm_set1_ps(0.0f); +const __m128 xmm_p0 = _mm_setzero_ps(); const __m128 xmm_p1 = _mm_set1_ps(1.0f); const __m128 xmm_p2 = _mm_set1_ps(2.0f); const __m128 xmm_pm2 = _mm_set1_ps(-2.0f); @@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray) printf("\n"); for (int ct = 0; ct < 32; ct++) { - printf("%d ", printArray[ct]); + printf("%d ", (unsigned char)printArray[ct]); } } @@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p) p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */ } +inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p) +{ + p[0] = _mm256_loadu_si256((__m256i *)srcPtrR); + p[1] = _mm256_loadu_si256((__m256i *)srcPtrG); + p[2] = _mm256_loadu_si256((__m256i *)srcPtrB); +} + +inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p) +{ + p[0] = _mm256_load_si256((__m256i *)srcPtrR); + p[1] = _mm256_load_si256((__m256i *)srcPtrG); + p[2] = _mm256_load_si256((__m256i *)srcPtrB); +} + inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p) { __m128 p128[8]; @@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p) _mm256_storeu_pd(dstPtr, p[0]); } +inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p) +{ + _mm_storeu_si128((__m128i *)dstPtr, p[0]); +} + +inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p) +{ + _mm_store_si128((__m128i *)dstPtr, p[0]); +} + inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p) { __m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); @@ -2438,6 +2462,29 @@ static inline __m128 log_ps(__m128 x) return x; } +inline Rpp32f rpp_hsum_ps(__m128 x) +{ + __m128 shuf = _mm_movehdup_ps(x); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(x, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low 
half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} + +inline Rpp32f rpp_hsum_ps(__m256 x) +{ + __m128 p0 = _mm256_extractf128_ps(x, 1); // Contains x7, x6, x5, x4 + __m128 p1 = _mm256_castps256_ps128(x); // Contains x3, x2, x1, x0 + __m128 sum = _mm_add_ps(p0, p1); // Contains x3 + x7, x2 + x6, x1 + x5, x0 + x4 + p0 = sum; // Contains -, -, x1 + x5, x0 + x4 + p1 = _mm_movehl_ps(sum, sum); // Contains -, -, x3 + x7, x2 + x6 + sum = _mm_add_ps(p0, p1); // Contains -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 + p0 = sum; // Contains -, -, -, x0 + x2 + x4 + x6 + p1 = _mm_shuffle_ps(sum, sum, 0x1); // Contains -, -, -, x1 + x3 + x5 + x7 + sum = _mm_add_ss(p0, p1); // Contains -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + return _mm_cvtss_f32(sum); +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index a7412aa2d..d9c0ce02d 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k } } +// Retrieve Min and Max given a datatype + +inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); } + /******************** DEVICE FUNCTIONS ********************/ // -------------------- Set 0 - Range checks and Range adjustment -------------------- @@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr, // /******************** DEVICE MATH HELPER FUNCTIONS ********************/ +// float8 min + +__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + +// float8 max + +__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + // d_float16 floor __device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16) diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index 96553489d..b98145be0 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -26,5 +26,9 @@ SOFTWARE. 
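[Editor's note] The two rpp_hsum_ps overloads added to rpp_cpu_simd.hpp above reduce an SSE or AVX register to a single horizontal sum. A small standalone sanity check, for illustration only; it assumes the helpers are visible, for example by including rpp_cpu_simd.hpp.

#include <immintrin.h>
#include <cassert>
// #include "rpp_cpu_simd.hpp"   // brings in the rpp_hsum_ps overloads shown above

void hsum_sanity_check()
{
    float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
    __m128 v4 = _mm_loadu_ps(data);                  // lanes holding 1..4
    __m256 v8 = _mm256_loadu_ps(data);               // lanes holding 1..8
    assert(rpp_hsum_ps(v4) == 10.0f);                // 1 + 2 + 3 + 4
    assert(rpp_hsum_ps(v8) == 36.0f);                // 1 + 2 + ... + 8
}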
#define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" +#include "kernel/multiply_scalar.hpp" +#include "kernel/magnitude.hpp" -#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp index 7737b38c3..e2edb1afc 100644 --- a/src/modules/cpu/host_tensor_audio_augmentations.hpp +++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp @@ -28,5 +28,6 @@ SOFTWARE. #include "kernel/non_silent_region_detection.hpp" #include "kernel/to_decibels.hpp" #include "kernel/pre_emphasis_filter.hpp" +#include "kernel/down_mixing.hpp" #endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/host_tensor_color_augmentations.hpp b/src/modules/cpu/host_tensor_color_augmentations.hpp index 19e0b471c..aba3b8158 100644 --- a/src/modules/cpu/host_tensor_color_augmentations.hpp +++ b/src/modules/cpu/host_tensor_color_augmentations.hpp @@ -34,5 +34,6 @@ SOFTWARE. #include "kernel/exposure.hpp" #include "kernel/contrast.hpp" #include "kernel/lut.hpp" +#include "kernel/color_temperature.hpp" #endif // HOST_TENSOR_COLOR_AUGMENTATIONS_HPP diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index dae3e6236..32b8b62b5 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -26,5 +26,7 @@ SOFTWARE. #define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" #endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/add_scalar.hpp b/src/modules/cpu/kernel/add_scalar.hpp new file mode 100644 index 000000000..d0179d4e1 --- /dev/null +++ b/src/modules/cpu/kernel/add_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
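Reviewer note on src/modules/cpu/kernel/add_scalar.hpp (added below): inside the validated 3D ROI the kernel adds the per-image scalar addTensor[batchCount] to every element, processing 16 floats per AVX2 iteration and finishing each row with a scalar tail. A rough single-row equivalent of that inner loop, as an illustrative sketch only (it ignores the rpp_simd_load/rpp_simd_store wrappers and all descriptor/stride handling; the helper name is made up):

    #include <immintrin.h>

    // Sketch: add one scalar to a row of 'width' floats, 8 at a time, then a scalar tail.
    static inline void add_scalar_row_sketch(const float *src, float *dst, int width, float addParam)
    {
        __m256 pAdd = _mm256_set1_ps(addParam);
        int x = 0;
        for (; x + 8 <= width; x += 8)
            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_loadu_ps(src + x), pAdd));
        for (; x < width; x++)
            dst[x] = src[x] + addParam;    // same math as the vector path
    }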
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f addParam = addTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pAddParam = _mm256_set1_ps(addParam); + + // Add without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Add without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * 
srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp new file mode 100644 index 000000000..1358ac800 --- /dev/null +++ b/src/modules/cpu/kernel/color_temperature.hpp @@ -0,0 +1,1035 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
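Reviewer note on src/modules/cpu/kernel/color_temperature.hpp (added below): the adjustment is a per-pixel channel offset, R + adjustment and B - adjustment with G left untouched, applied 48 u8 (or 24 f32) values per AVX2 iteration through compute_color_temperature_48/24_host; the f32/f16 variants pre-scale the adjustment by ONE_OVER_255 so it matches normalized pixel values. A scalar sketch of the per-pixel u8 math, for illustration only (helper and clamp names are made up):

    // Sketch: color-temperature shift for one u8 RGB pixel, clamped to [0, 255] like RPPPIXELCHECK.
    static inline void color_temperature_pixel_sketch(const unsigned char in[3], unsigned char out[3], int adjustment)
    {
        auto clamp255 = [](int v) { return (v < 0) ? 0 : ((v > 255) ? 255 : v); };
        out[0] = static_cast<unsigned char>(clamp255(in[0] + adjustment));    // warm/cool the red channel
        out[1] = in[1];                                                       // green is unchanged
        out[2] = static_cast<unsigned char>(clamp255(in[2] - adjustment));    // opposite shift on blue
    }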
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount]; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + 
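// For this row: the AVX2 loop below pulls 16 pixels from each of the R, G and B planes per iteration,
+                // applies R + adj / G unchanged / B - adj in f32, and stores them interleaved (pkd3);
+                // the scalar tail then repeats the same per-pixel math for the leftover columns.
+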
srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG); + dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 48; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); 
// simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG); + *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255; + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 24; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = 
RPPPIXELCHECKF32(srcPtrTemp[1]); + *dstPtrTempB++ = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = RPPPIXELCHECKF32(*srcPtrTempG); + dstPtrTemp[2] = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 24; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = RPPPIXELCHECKF32(srcPtrTemp[1]); + dstPtrTemp[2] = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + 
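// 24 f32 values per AVX2 iteration = 8 pixels from each of the R, G and B planes;
+            // adjustmentValue was pre-scaled by ONE_OVER_255 above, so the same offset applies to normalized float pixels.
+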
Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = RPPPIXELCHECKF32(*srcPtrTempG); + *dstPtrTempB++ = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += srcDescPtr->strides.hStride; + dstPtrRowG += srcDescPtr->strides.hStride; + dstPtrRowB += srcDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255; + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + Rpp32u vectorIncrement = 24; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + Rpp16f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + 
dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores + + for(int cnt = 0; cnt < 8; cnt++) + { + dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + + srcPtrTemp += 24; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + Rpp32f dstPtrTemp_ps[25]; + + for(int cnt = 0; cnt < 8; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores + + for(int cnt = 0; cnt < 24; cnt++) + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG); + dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + 
srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + Rpp32f srcPtrTemp_ps[24], dstPtrTemp_ps[25]; + + for(int cnt = 0; cnt < 24; cnt++) + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores + + for(int cnt = 0; cnt < 24; cnt++) + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + + srcPtrTemp += 24; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + + for(int cnt = 0; cnt < 8; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores + + for(int cnt = 0; cnt < 8; cnt++) + { + dstPtrTempR[cnt] = 
(Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG); + *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += srcDescPtr->strides.hStride; + dstPtrRowG += srcDescPtr->strides.hStride; + dstPtrRowB += srcDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount]; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - 
adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG); + dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 48; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, 
*dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG); + *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/down_mixing.hpp b/src/modules/cpu/kernel/down_mixing.hpp new file mode 100644 index 000000000..9cefc64a2 --- /dev/null +++ b/src/modules/cpu/kernel/down_mixing.hpp @@ -0,0 +1,122 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
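Reviewer note on src/modules/cpu/kernel/down_mixing.hpp (added below): single-channel input is passed through with memcpy; for multi-channel input each mono output sample is the weighted sum of all channels, using equal weights of 1/channels (optionally re-normalized), accumulated 4 (SSE) or 8 (AVX2) channels at a time and reduced with the new rpp_hsum_ps helpers. A scalar sketch of the per-sample math, illustrative only (the function name is made up):

    // Sketch: down-mix one interleaved frame of 'channels' samples to mono with equal weights.
    static inline float downmix_frame_sketch(const float *frame, int channels)
    {
        float weight = 1.0f / channels;
        float acc = 0.0f;
        for (int c = 0; c < channels; c++)
            acc += frame[c] * weight;    // what the SIMD accumulate + rpp_hsum_ps reduction computes
        return acc;
    }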
+*/ + +#include "rppdefs.h" +#include + +RppStatus down_mixing_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32s samples = srcDimsTensor[batchCount * 2]; + Rpp32s channels = srcDimsTensor[batchCount * 2 + 1]; + bool flagAVX = 0; + + if(channels == 1) + { + // No need of downmixing, do a direct memcpy + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(samples * sizeof(Rpp32f))); + } + else + { + Rpp32f *weights = handle.GetInitHandle()->mem.mcpu.tempFloatmem + batchCount * channels; + std::fill(weights, weights + channels, 1.f / channels); + + if(normalizeWeights) + { + // Compute sum of the weights + Rpp32f sum = 0.0; + for(int i = 0; i < channels; i++) + sum += weights[i]; + + // Normalize the weights + Rpp32f invSum = 1.0 / sum; + for(int i = 0; i < channels; i++) + weights[i] *= invSum; + } + + Rpp32s channelIncrement = 4; + Rpp32s alignedChannels = (channels / 4) * 4; + if(channels > 7) + { + flagAVX = 1; + channelIncrement = 8; + alignedChannels = (channels / 8) * 8; + } + + // use weights to downmix to mono + for(int64_t dstIdx = 0; dstIdx < samples; dstIdx++) + { + Rpp32s channelLoopCount = 0; + // if number of channels are greater than or equal to 8, use AVX implementation + if(flagAVX) + { + __m256 pDst = avx_p0; + for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement) + { + __m256 pSrc, pWeights; + pWeights = _mm256_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3], + weights[channelLoopCount + 4], weights[channelLoopCount + 5], weights[channelLoopCount + 6], weights[channelLoopCount + 7]); + pSrc = _mm256_loadu_ps(srcPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeights); + pDst = _mm256_add_ps(pDst, pSrc); + srcPtrTemp += channelIncrement; + } + dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst); + } + else + { + __m128 pDst = xmm_p0; + for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement) + { + __m128 pSrc, pWeights; + pWeights = _mm_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3]); + pSrc = _mm_loadu_ps(srcPtrTemp); + pSrc = _mm_mul_ps(pSrc, pWeights); + pDst = _mm_add_ps(pDst, pSrc); + srcPtrTemp += channelIncrement; + } + dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst); + } + for(; channelLoopCount < channels; channelLoopCount++) + dstPtrTemp[dstIdx] += ((*srcPtrTemp++) * weights[channelLoopCount]); + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/magnitude.hpp b/src/modules/cpu/kernel/magnitude.hpp new file mode 100644 index 000000000..6eaf4f236 --- /dev/null +++ b/src/modules/cpu/kernel/magnitude.hpp @@ -0,0 +1,1001 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus magnitude_u8_u8_host_tensor(Rpp8u *srcPtr1, + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[6]; + + 
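// For each 48-pixel packed block: convert both sources to f32 planes, compute
+                    // sqrt(src1^2 + src2^2) element-wise with fmadd + sqrt, and store the result back as u8 planes.
+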
rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                    p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                    p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2])));    // magnitude computation
+                    p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3])));    // magnitude computation
+                    p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4])));    // magnitude computation
+                    p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5])));    // magnitude computation
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(srcPtr1Temp[0]);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(srcPtr1Temp[1]);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(srcPtr1Temp[2]);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(srcPtr2Temp[0]);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(srcPtr2Temp[1]);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(srcPtr2Temp[2]);
+                    *dstPtrTempR++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    *dstPtrTempG++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    *dstPtrTempB++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    __m256 p1[6], p2[6];
+
+                    rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                    p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                    p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2])));    // magnitude computation
+                    p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3])));    // magnitude computation
+                    p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4])));    // magnitude computation
+                    p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5])));    // magnitude computation
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(*srcPtr1TempR);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(*srcPtr1TempG);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(*srcPtr1TempB);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(*srcPtr2TempR);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(*srcPtr2TempG);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(*srcPtr2TempB);
+                    dstPtrTemp[0] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    dstPtrTemp[1] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    dstPtrTemp[2] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+#if __AVX2__
+            alignedLength = bufferLength & ~15;
+#endif
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+#if __AVX2__
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        __m256 p1[2], p2[2];
+
+                        rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1Temp, p1);    // simd loads
+                        rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2);    // simd loads
+                        p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                        p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                        rpp_simd_store(rpp_store16_f32_to_u8_avx, dstPtrTemp, p1);    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+#endif
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        Rpp32f srcPtr1TempValue = static_cast<Rpp32f>(*srcPtr1Temp);
+                        Rpp32f srcPtr2TempValue = static_cast<Rpp32f>(*srcPtr2Temp);
+                        *dstPtrTemp++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)))));
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+RppStatus magnitude_f32_f32_host_tensor(Rpp32f *srcPtr1,
+                                        Rpp32f *srcPtr2,
+                                        RpptDescPtr srcDescPtr,
+                                        Rpp32f *dstPtr,
+                                        RpptDescPtr dstDescPtr,
+                                        RpptROIPtr roiTensorPtrSrc,
+                                        RpptRoiType roiType,
+                                        RppLayoutParams layoutParams,
+                                        rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+        Rpp32u alignedLength = (bufferLength / 24) * 24;
+        Rpp32u vectorIncrement = 24;
+        Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+        // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    __m256 p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0],
p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0]))); + *dstPtrTempG++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1]))); + *dstPtrTempB++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2]))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * 
*srcPtr2TempR))); + dstPtrTemp[1] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG))); + dstPtrTemp[2] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp))); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus magnitude_f16_f16_host_tensor(Rpp16f *srcPtr1, + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * 
srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0])))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1])))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2])))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG 
= srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1TempR[cnt]); + srcPtr1Temp_ps[cnt + 8] = static_cast(srcPtr1TempG[cnt]); + srcPtr1Temp_ps[cnt + 16] = static_cast(srcPtr1TempB[cnt]); + + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2TempR[cnt]); + srcPtr2Temp_ps[cnt + 8] = static_cast(srcPtr2TempG[cnt]); + srcPtr2Temp_ps[cnt + 16] = static_cast(srcPtr2TempB[cnt]); + } + + __m256 p1[4], p2[4]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * *srcPtr2TempR)))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG)))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB)))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = 
dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp)))); + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus magnitude_i8_i8_host_tensor(Rpp8s *srcPtr1, + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + 
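+                // In this i8 path, the scalar tail shifts each pixel into the unsigned range
+                // (value + 128) before squaring, then subtracts 128 again ahead of the RPPPIXELCHECKI8
+                // clamp; the AVX2 i8 load/store helpers are expected to apply the same offset internally.
+                // Illustrative scalar form for one element:
+                //   Rpp32f a = src1[i] + 128.0f, b = src2[i] + 128.0f;
+                //   dst[i] = (Rpp8s)std::round(RPPPIXELCHECKI8(std::sqrt(a * a + b * b) - 128));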
srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation + p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation + p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp32f srcPtr1TempValue0 = static_cast(srcPtr1Temp[0] + 128); + Rpp32f srcPtr1TempValue1 = static_cast(srcPtr1Temp[1] + 128); + Rpp32f srcPtr1TempValue2 = static_cast(srcPtr1Temp[2] + 128); + Rpp32f srcPtr2TempValue0 = static_cast(srcPtr2Temp[0] + 128); + Rpp32f srcPtr2TempValue1 = static_cast(srcPtr2Temp[1] + 128); + Rpp32f srcPtr2TempValue2 = static_cast(srcPtr2Temp[2] + 128); + *dstPtrTempR++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128))); + *dstPtrTempG++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128))); + *dstPtrTempB++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = 
srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation + p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation + p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp32f srcPtr1TempValue0 = static_cast(*srcPtr1TempR + 128); + Rpp32f srcPtr1TempValue1 = static_cast(*srcPtr1TempG + 128); + Rpp32f srcPtr1TempValue2 = static_cast(*srcPtr1TempB + 128); + Rpp32f srcPtr2TempValue0 = static_cast(*srcPtr2TempR + 128); + Rpp32f srcPtr2TempValue1 = static_cast(*srcPtr2TempG + 128); + Rpp32f srcPtr2TempValue2 = static_cast(*srcPtr2TempB + 128); + dstPtrTemp[0] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128))); + dstPtrTemp[1] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128))); + dstPtrTemp[2] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~15; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 
p1[2], p2[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + rpp_simd_store(rpp_store16_f32_to_i8_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp32f srcPtr1TempValue = static_cast(*srcPtr1Temp + 128); + Rpp32f srcPtr2TempValue = static_cast(*srcPtr2Temp + 128); + *dstPtrTemp++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)) - 128))); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/multiply_scalar.hpp b/src/modules/cpu/kernel/multiply_scalar.hpp new file mode 100644 index 000000000..a27782bcc --- /dev/null +++ b/src/modules/cpu/kernel/multiply_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus multiply_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f mulParam = mulTensor[batchCount]; + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; +#if __AVX2__ + Rpp32u vectorIncrement = 16; + __m256 pMulParam = _mm256_set1_ps(mulParam); + Rpp32u alignedLength = bufferLength & ~(vectorIncrement - 1); +#endif + // multiply without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_multiply_16_host(p, &pMulParam); // multiply adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + *dstPtrTemp++ = (*srcPtrTemp++ * mulParam); + + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // multiply without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = srcPtrImage + 
(roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_multiply_16_host(p, &pMulParam); // multiply adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + *dstPtrTemp++ = (*srcPtrTemp++ * mulParam); + + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/subtract_scalar.hpp b/src/modules/cpu/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..a40e6219f --- /dev/null +++ b/src/modules/cpu/kernel/subtract_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus subtract_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f subtractParam = subtractTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pSubtractParam = _mm256_set1_ps(subtractParam); + + // Subtract without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Subtract without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = 
srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp new file mode 100644 index 000000000..0380f4ef6 --- /dev/null +++ b/src/modules/cpu/kernel/tensor_max.hpp @@ -0,0 +1,847 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u max = 0; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_setzero_si256(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_32_host(&pMax, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_setzero_si256(); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < 
bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_setzero_si128(); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == 
RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + 
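+            // The running maxima (maxR/maxG/maxB and the pMax* vectors below) start at zero, so this
+            // reduction implicitly assumes non-negative f32 input such as normalized image data; each
+            // AVX2 iteration folds 8 packed RGB triplets into the per-channel maxima.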
+#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max((Rpp32f)*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + 
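+            // reduce_max_float8_host is expected to collapse the 8-lane running maximum into the low
+            // lanes of a 128-bit result; the store below spills it to resultAvx so it can be merged
+            // with the scalar-tail maximum before writing maxArr[batchCount].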
rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = (Rpp16f)max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max((Rpp32f)*srcPtrTempR++, maxR); + maxG = std::max((Rpp32f)*srcPtrTempG++, maxG); + maxB = std::max((Rpp32f)*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); + +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + 
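+                        // Update the per-channel running maxima (pMaxR/pMaxG/pMaxB) with the eight
+                        // R/G/B values just converted from f16 and loaded in planar form.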
compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max((Rpp32f)srcPtrTemp[0], maxR); + maxG = std::max((Rpp32f)srcPtrTemp[1], maxG); + maxB = std::max((Rpp32f)srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s max = INT8_MIN; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_set1_epi8(INT8_MIN); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epi8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i32_host(&pMax, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRowR, *srcPtrRowG, 
*srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_set1_epi8(INT8_MIN); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_set1_epi8(INT8_MIN); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp new file mode 100644 index 000000000..15b9b77ba --- /dev/null +++ 
b/src/modules/cpu/kernel/tensor_min.hpp @@ -0,0 +1,845 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u min = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)255); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epu8(p1, pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_32_host(&pMin, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == 
RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)255); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)255); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_min_48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + + __m128i result; + reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr 
srcDescPtr, + Rpp32f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += 
srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && 
(srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((Rpp32f)*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = (Rpp16f) min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min((Rpp32f)*srcPtrTempR++, minR); + minG = std::min((Rpp32f)*srcPtrTempG++, minG); + minB = std::min((Rpp32f)*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + 
minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min((Rpp32f)srcPtrTemp[0], minR); + minG = std::min((Rpp32f)srcPtrTemp[1], minG); + minB = std::min((Rpp32f)srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s min = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)127); +#endif 
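+            // Row-wise scan of the ROI: the AVX2 loop compares 32 signed 8-bit values per iteration
+            // against the running vector minimum (initialised to 127), and the scalar tail loop
+            // handles any pixels left over beyond the vector-width multiple (alignedLength).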
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epi8(p1, pMin); //compare and store min of 32 values into global min + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((*srcPtrTemp++), min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128i result; + reduce_min_i32_host(&pMin, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)127); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_i96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)127); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, 
srcPtrTemp, p); + compute_min_i48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 55fbb7832..0345171fc 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -26,5 +26,9 @@ SOFTWARE. #define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" +#include "kernel/multiply_scalar.hpp" +#include "kernel/magnitude.hpp" #endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP diff --git a/src/modules/hip/hip_tensor_color_augmentations.hpp b/src/modules/hip/hip_tensor_color_augmentations.hpp index 873f06b97..c5610dbcb 100644 --- a/src/modules/hip/hip_tensor_color_augmentations.hpp +++ b/src/modules/hip/hip_tensor_color_augmentations.hpp @@ -33,5 +33,6 @@ SOFTWARE. #include "kernel/exposure.hpp" #include "kernel/contrast.hpp" #include "kernel/lut.hpp" +#include "kernel/color_temperature.hpp" #endif // HIP_TENSOR_COLOR_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index 328a232a1..c79e0a951 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -23,8 +23,9 @@ SOFTWARE. 
*/ #ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP -#define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" -#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/add_scalar.hpp b/src/modules/hip/kernel/add_scalar.hpp new file mode 100644 index 000000000..709337c9d --- /dev/null +++ b/src/modules/hip/kernel/add_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void add_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_add8_const(&val_f8, &val_f8, static_cast(addParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void add_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_add24_const(&val_f24, &val_f24, static_cast(addParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_add_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *addTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int 
batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/color_temperature.hpp b/src/modules/hip/kernel/color_temperature.hpp new file mode 100644 index 000000000..ad8adc32a --- /dev/null +++ b/src/modules/hip/kernel/color_temperature.hpp @@ -0,0 +1,223 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ void color_temperature_hip_compute(T *srcPtr, d_float24 *pix_f24, float4 *adjustmentValue_f4) +{ + float4 adjustment_f4; + if constexpr ((std::is_same::value) || (std::is_same::value)) + { + adjustment_f4 = *adjustmentValue_f4 * (float4) ONE_OVER_255; + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4); + } + else if constexpr (std::is_same::value) + { + adjustment_f4 = *adjustmentValue_f4; + rpp_hip_math_add24_const(pix_f24, pix_f24, (float4)128); + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4); + rpp_hip_pixel_check_0to255(pix_f24); + rpp_hip_math_subtract24_const(pix_f24, pix_f24, (float4)128); + } + else + { + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], *adjustmentValue_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], *adjustmentValue_f4); + } +} + +template +__global__ void color_temperature_pkd_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = 
hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24); +} + +template +__global__ void color_temperature_pln_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24); +} + +template +__global__ void color_temperature_pkd3_pln3_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24); +} + +template +__global__ void color_temperature_pln3_pkd3_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + 
roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24); +} + +template +RppStatus hip_exec_color_temperature_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(color_temperature_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(color_temperature_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(color_temperature_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(color_temperature_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + 
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/magnitude.hpp b/src/modules/hip/kernel/magnitude.hpp new file mode 100644 index 000000000..902d27bde --- /dev/null +++ b/src/modules/hip/kernel/magnitude.hpp @@ -0,0 +1,244 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ void magnitude_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr (std::is_same::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); + rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + } + + d_float8 src1Sq_f8, src2Sq_f8, sum_f8; + rpp_hip_math_multiply8(src1_f8, src1_f8, &src1Sq_f8); + rpp_hip_math_multiply8(src2_f8, src2_f8, &src2Sq_f8); + rpp_hip_math_add8(&src1Sq_f8, &src2Sq_f8, &sum_f8); + rpp_hip_math_sqrt8(&sum_f8, dst_f8); + + if constexpr (std::is_same::value) + { + dst_f8->f4[0] = rpp_hip_pixel_check_0to255(dst_f8->f4[0]) - (float4)128; + dst_f8->f4[1] = rpp_hip_pixel_check_0to255(dst_f8->f4[1]) - (float4)128; + } +} + +template +__global__ void magnitude_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void magnitude_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += 
dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void magnitude_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void magnitude_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_magnitude_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int 
globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(magnitude_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(magnitude_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(magnitude_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(magnitude_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/multiply_scalar.hpp b/src/modules/hip/kernel/multiply_scalar.hpp new file mode 100644 index 000000000..e0816576a --- /dev/null +++ b/src/modules/hip/kernel/multiply_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void multiply_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float mulParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= 
roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_multiply8_const(&val_f8, &val_f8, static_cast(mulParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void multiply_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float mulParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_multiply24_const(&val_f24, &val_f24, static_cast(mulParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_multiply_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *mulTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(multiply_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + mulTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int 
globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(multiply_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + mulTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/subtract_scalar.hpp b/src/modules/hip/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..7ee128709 --- /dev/null +++ b/src/modules/hip/kernel/subtract_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void subtract_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_subtract8_const(&val_f8, &val_f8, static_cast(subtractParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void subtract_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_subtract24_const(&val_f24, &val_f24, static_cast(subtractParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus 
hip_exec_subtract_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *subtractTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp new file mode 100644 index 000000000..b47fce024 --- /dev/null +++ b/src/modules/hip/kernel/tensor_max.hpp @@ -0,0 +1,400 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + 
partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local mmemory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]); + partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]); + partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMax_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMax_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMax_smem[0]; + dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0])); + } +} + +template +__global__ void tensor_max_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_max_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + 
hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + uint srcIdx = (id_z * srcStridesNH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + 
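// Reference sketch (illustrative only; block_max_sketch and its launch are hypothetical names,
// not RPP APIs): the kernels in this file follow a two-stage reduction pattern, where Stage 1
// reduces each tile to one partial result per block in shared memory (LDS) and Stage 2 reduces
// those per-block partials to one value per image. The same shared-memory tree reduction,
// stripped of RPP's vectorized loads and ROI handling, assuming a 1D launch with 256 threads
// per block and n >= 1:
//
//     #include <hip/hip_runtime.h>
//
//     __global__ void block_max_sketch(const float *in, float *out, int n)
//     {
//         __shared__ float partial[256];
//         int tid = hipThreadIdx_x;
//         float v = in[0];                                  // seed with a valid element, as the kernels above do
//         for (int i = tid; i < n; i += hipBlockDim_x)      // each thread scans a strided slice of the buffer
//             v = fmaxf(v, in[i]);
//         partial[tid] = v;
//         __syncthreads();
//         for (int stride = 128; stride >= 1; stride /= 2)  // tree reduction: halve the active threads each step
//         {
//             if (tid < stride)
//                 partial[tid] = fmaxf(partial[tid], partial[tid + stride]);
//             __syncthreads();
//         }
//         if (tid == 0)
//             out[hipBlockIdx_x] = partial[0];              // one partial max per block; a second pass combines these
//     }
//
// A launch of this sketch would look like hipLaunchKernelGGL(block_max_sketch, dim3(numBlocks),
// dim3(256), 0, stream, d_in, d_partials, n), mirroring how the Set 2 executors below launch
// the Stage 1 and Stage 2 kernels.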
+template +__global__ void tensor_max_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + 
hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + + rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_max(T *srcPtr, + RpptDescPtr srcDescPtr, + U *maxArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float minimum = bitDepthMinMax_f2.x; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + 
handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp new file mode 100644 index 000000000..a883c4f3b --- /dev/null +++ b/src/modules/hip/kernel/tensor_min.hpp @@ -0,0 +1,410 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > 
xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]); + partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]); + partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMin_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMin_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMin_smem[0]; + dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0])); + } +} + +template +__global__ void tensor_min_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_min_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMin_smem[16][16]; 
// 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + + uint srcIdx = (id_z * srcStridesNH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3; + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + 
hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + 
minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + minArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_min(T *srcPtr, + RpptDescPtr srcDescPtr, + U *minArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle &handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float maximum = bitDepthMinMax_f2.y; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 
LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index daf0479ee..8f88ba90f 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -73,6 +73,188 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && 
(srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + add_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + addTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + subtract_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + subtractTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** multiply_scalar ********************/ + +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return 
RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + multiply_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + mulTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** magnitude ********************/ + +RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + magnitude_u8_u8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + magnitude_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + magnitude_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + magnitude_i8_i8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -113,4 +295,163 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr 
srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_add_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + addTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_subtract_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + subtractTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** multiply_scalar ********************/ + +RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_multiply_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + 
srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + mulTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** magnitude ********************/ + +RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + #ifdef HIP_COMPILE + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_magnitude_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_magnitude_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_magnitude_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_magnitude_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index 23b52bc44..d78b8890a 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -126,3 +126,31 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; } } + +/******************** down_mixing ********************/ + +RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rppHandle_t rppHandle) +{ + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + down_mixing_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + normalizeWeights, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp index be61b6da1..3023973fc 100644 --- a/src/modules/rppt_tensor_color_augmentations.cpp +++ b/src/modules/rppt_tensor_color_augmentations.cpp @@ 
-411,7 +411,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, { if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); @@ -671,6 +671,72 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** color_temperature ********************/ + +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c != 3) + { + return RPP_ERROR_INVALID_CHANNELS; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + color_temperature_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + color_temperature_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + color_temperature_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + color_temperature_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -887,7 +953,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, #ifdef HIP_COMPILE if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } Rpp32u paramIndex = 0; @@ -958,7 +1024,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, #ifdef HIP_COMPILE if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } Rpp32u paramIndex = 0; @@ -1204,4 +1270,71 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** color_temperature ********************/ + +RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcDescPtr->c != 3) + { + return 
RPP_ERROR_INVALID_CHANNELS; + } + + Rpp32u paramIndex = 0; + copy_param_int(adjustmentValueTensor, rpp::deref(rppHandle), paramIndex++); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_color_temperature_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_color_temperature_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_color_temperature_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_color_temperature_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index da1036256..fff62d085 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1036,11 +1036,11 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { @@ -1823,11 +1823,11 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, rppHandle_t rppHandle) { #ifdef HIP_COMPILE - if ((srcGenericDescPtr->layout != 
RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f17028e5e..28313a88f 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (minArrLength < srcDescPtr->n) // 1 min for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_min_u8_u8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_min_f16_f16_host((Rpp16f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_min_f32_f32_host((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_min_i8_i8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (maxArrLength < srcDescPtr->n) // 1 min for each image + return 
RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (maxArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_max_u8_u8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_max_f16_f16_host((Rpp16f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_max_f32_f32_host((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_max_i8_i8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ @@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, return RPP_SUCCESS; } + +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMinArr, + Rpp32u imageMinArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMinArrLength < srcDescPtr->n) // min of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_min(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_min((half*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_min((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_min(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMaxArr, + Rpp32u imageMaxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMaxArrLength < srcDescPtr->n) 
// max of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_max(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_max((half*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_max((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_max(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} #endif // backend diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp index 250ceadfc..e298ebd99 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp @@ -1356,8 +1356,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp index fbffdbe68..dc8679e5d 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp @@ -1357,8 +1357,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp index ed1e7751b..271ed3d1c 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp @@ -1459,8 +1459,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 04831ddf4..7bd46b39e 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -65,12 +65,12 @@ int main(int 
argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); - bool reductionTypeCase = (testCase == 87); unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; @@ -104,7 +104,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 86) + if(testCase == 36 || testCase == 31 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -323,35 +323,20 @@ int main(int argc, char **argv) double wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - - // Initialize buffers for any reductionType functions + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - - if(reductionTypeCase) + if (reductionTypeCase) { - if(dstDescPtr->dataType == RpptDataType::U8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s))); + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + bitDepthByteSize = (testCase == 87) ? 
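+            // Case 87 (tensor_sum) accumulates over whole images, so 8u/8s inputs need 64-bit result slots
+            // to avoid overflow, while cases 88/89 (tensor_min/tensor_max) return values that stay within the
+            // source bit depth, so one byte per result is enough for U8/I8 inputs: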
sizeof(Rpp64u) : sizeof(Rpp8u); + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); } - //Allocate hip memory for src/dst + // Allocate hip memory for src/dst CHECK(hipMalloc(&d_input, inputBufferSize)); CHECK(hipMalloc(&d_output, outputBufferSize)); if(dualInputCase) @@ -827,6 +812,22 @@ int main(int argc, char **argv) break; } + case 45: + { + testCaseName = "color_temperature"; + + Rpp32s adjustment[batchSize]; + for (i = 0; i < batchSize; i++) + adjustment[i] = 70; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_color_temperature_gpu(d_input, srcDescPtr, d_output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 49: { testCaseName = "box_filter"; @@ -859,6 +860,18 @@ int main(int argc, char **argv) break; } + case 61: + { + testCaseName = "magnitude"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_magnitude_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 63: { testCaseName = "phase"; @@ -1028,6 +1041,30 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1055,33 +1092,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1089,7 +1134,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { @@ -1175,4 +1225,4 @@ int main(int argc, char **argv) CHECK(hipFree(d_input_second)); CHECK(hipFree(d_output)); return 0; -} \ No newline at end of file +} diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp index f4741ad78..e8dc4e365 100644 --- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp @@ -55,8 +55,6 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } - - if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -268,6 +266,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_add_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 4: { testCaseName = "flip_voxel"; @@ -292,6 +322,22 @@ int main(int argc, char * argv[]) break; } + case 5: + { + testCaseName = "multiply_scalar"; + Rpp32f mulTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + mulTensor[i] = 80; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_multiply_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, mulTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index 6150ad97c..2e8054332 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -153,7 +153,7 @@ def get_log_file_list(preserveOutput): # Functionality group finder def func_group_finder(case_number): - if case_number < 5 or case_number == 13 or case_number == 36: + if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 45: return "color_augmentations" elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84: return "effects_augmentations" @@ -165,6 +165,8 @@ def func_group_finder(case_number): return "filter_augmentations" elif case_number < 40: return "geometric_augmentations" + elif case_number == 61: + return "arithmetic_operations" elif case_number < 87: return "data_exchange_operations" elif case_number < 88: @@ -313,11 +315,11 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") - parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") - parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) - parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? 
- (YES/NO)', required = False) + parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)") + parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)") + parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)") + parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False) + parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)") parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests") @@ -332,8 +334,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 90) or (args.case_end < 0 or args.case_end > 90)): + print("Starting case# and Ending case# must be in the 0:90 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -347,7 +349,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 90: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -374,8 +376,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 90: + print("The case# must be in the 0:90 range!") exit(0) return args @@ -456,8 +458,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -474,8 +476,8 @@ def rpp_test_suite_parser_and_validator(): else: if (testType == 1 and profilingOption == "NO"): for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -489,8 +491,8 @@ def rpp_test_suite_parser_and_validator(): NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -627,7 +629,9 @@ def rpp_test_suite_parser_and_validator(): "effects_augmentations", "filter_augmentations", "geometric_augmentations", - "morphological_operations" + "morphological_operations", + "arithmetic_operations", + "statistical_operations" ] for log_file in log_file_list: # Opening log file @@ -692,7 +696,7 @@ def rpp_test_suite_parser_and_validator(): f.close() # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '54', '63', '70', '80', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: @@ -717,4 +721,4 @@ def rpp_test_suite_parser_and_validator(): resultsInfo += "\n - Total augmentations with golden output QA test support = " + str(len(supportedCaseList) - len(nonQACaseList)) resultsInfo += "\n - Total augmentations without golden ouput QA test support (due to randomization involved) = " + str(len(nonQACaseList)) f.write(resultsInfo) - print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") \ No newline at end of file + print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") diff --git a/utilities/test_suite/HIP/runTests_voxel.py b/utilities/test_suite/HIP/runTests_voxel.py index 2f007ecaa..b6648affb 100644 --- a/utilities/test_suite/HIP/runTests_voxel.py +++ b/utilities/test_suite/HIP/runTests_voxel.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 4 +caseMax = 5 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -258,8 +258,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing range starting case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") - parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing range ending case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_start", type = int, default = caseMin, 
help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False) @@ -309,8 +309,8 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print("The case# must be in the 0:1 range!") - exit(0) + print("The case# must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + exit(0) # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if args.qa_mode: @@ -470,7 +470,7 @@ def rpp_test_suite_parser_and_validator(): print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv") # print the results of qa tests -supportedCaseList = ['0', '1', '4'] +supportedCaseList = ['0', '1', '2', '3', '4', '5'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/HOST/CMakeLists.txt b/utilities/test_suite/HOST/CMakeLists.txt index 6adc461b3..b7abf5d77 100644 --- a/utilities/test_suite/HOST/CMakeLists.txt +++ b/utilities/test_suite/HOST/CMakeLists.txt @@ -82,8 +82,15 @@ if (OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_host Tensor_host.cpp) + add_executable(BatchPD_host_pkd3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp) + add_executable(BatchPD_host_pln1 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp) + add_executable(BatchPD_host_pln3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") target_link_libraries(Tensor_host ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) else() message("-- ${Red}Error: OpenCV must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() @@ -102,7 +109,7 @@ else() endif() if(NOT libsnd_LIBS) - message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_voxel_host successfully!${ColourReset}") + message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_host successfully!${ColourReset}") else() message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index 1e416ed52..b698a2def 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -65,14 +65,15 @@ int main(int argc, char **argv) int batchSize = atoi(argv[14]); bool additionalParamCase 
= (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); + unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; - bool reductionTypeCase = (testCase == 87); int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])}; string scriptPath = argv[19]; @@ -102,7 +103,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 86) + if(testCase == 31 || testCase == 36 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -140,6 +141,11 @@ int main(int argc, char **argv) std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; exit(0); } + else if(testCase == 82 && batchSize < 2) + { + std::cerr<<"\n RICAP only works with BatchSize > 1"; + exit(0); + } // Get function name string funcName = augmentationMap[testCase]; @@ -310,6 +316,24 @@ int main(int argc, char **argv) input_second = static_cast(calloc(inputBufferSize, 1)); output = static_cast(calloc(outputBufferSize, 1)); + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) + void *reductionFuncResultArr; + Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; + if (reductionTypeCase) + { + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + { + bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + { + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + } + // Set the number of threads to be used by OpenMP pragma for RPP batch processing on host. 
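+    // Readback sketch for the new reduction cases (assumes U8 inputs and case 88/89; the per-image layout of
+    // "each channel, then all channels" is inferred from the buffer sizing comments above, not from the kernel spec):
+    //     Rpp8u *resultPtr = static_cast<Rpp8u *>(reductionFuncResultArr);
+    //     // image b of a 3-channel batch occupies resultPtr[b * 4] .. resultPtr[b * 4 + 3]
+    //     // (three per-channel results followed by the combined result); a 1-channel batch
+    //     // stores one value per image, so reductionFuncResultArrLength is trimmed to srcDescPtr->n below.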
// If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size Rpp32u numThreads = 0; @@ -321,33 +345,6 @@ int main(int argc, char **argv) double cpuTime, wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - - // Initialize buffers for any reductionType functions - void *reductionFuncResultArr; - Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - if(reductionTypeCase) - { - if(dstDescPtr->dataType == RpptDataType::U8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64s))); - } - // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) @@ -818,6 +815,36 @@ int main(int argc, char **argv) break; } + case 45: + { + testCaseName = "color_temperature"; + + Rpp8s adjustment[batchSize]; + for (i = 0; i < batchSize; i++) + adjustment[i] = 70; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_color_temperature_host(input, srcDescPtr, output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 61: + { + testCaseName = "magnitude"; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_magnitude_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 63: { testCaseName = "phase"; @@ -1032,6 +1059,40 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1064,33 +1125,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 
channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1098,7 +1167,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { @@ -1181,4 +1255,4 @@ int main(int argc, char **argv) if(reductionTypeCase) free(reductionFuncResultArr); return 0; -} \ No newline at end of file +} diff --git a/utilities/test_suite/HOST/Tensor_host_audio.cpp b/utilities/test_suite/HOST/Tensor_host_audio.cpp index 139e7e97e..fe6fa1246 100644 --- a/utilities/test_suite/HOST/Tensor_host_audio.cpp +++ b/utilities/test_suite/HOST/Tensor_host_audio.cpp @@ -197,6 +197,25 @@ int 
main(int argc, char **argv) break; } + case 3: + { + testCaseName = "down_mixing"; + bool normalizeWeights = false; + Rpp32s srcDimsTensor[batchSize * 2]; + + for (int i = 0, j = 0; i < batchSize; i++, j += 2) + { + srcDimsTensor[j] = srcLengthTensor[i]; + srcDimsTensor[j + 1] = channelsTensor[i]; + dstDims[i].height = srcLengthTensor[i]; + dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_down_mixing_host(inputf32, srcDescPtr, outputf32, dstDescPtr, srcDimsTensor, normalizeWeights, handle); + + break; + } default: { missingFuncFlag = 1; @@ -263,4 +282,4 @@ int main(int argc, char **argv) free(inputf32); free(outputf32); return 0; -} +} \ No newline at end of file diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp index 15cdbedd3..ebaaaf639 100644 --- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp +++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp @@ -55,7 +55,10 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } +<<<<<<< HEAD +======= +>>>>>>> abishek_rpp/develop if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -252,6 +255,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if(inputBitDepth == 2) + rppt_add_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 4: { testCaseName = "flip_voxel"; @@ -267,10 +302,28 @@ int main(int argc, char * argv[]) } startWallTime = omp_get_wtime(); - if(inputBitDepth == 0) + if (inputBitDepth == 0) rppt_flip_voxel_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, horizontalTensor, verticalTensor, depthTensor, roiGenericSrcPtr, roiTypeSrc, handle); - else + else if(inputBitDepth == 2) rppt_flip_voxel_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, horizontalTensor, verticalTensor, depthTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 5: + { + testCaseName = "multiply_scalar"; + Rpp32f mulTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + mulTensor[i] = 80; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_multiply_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, mulTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; break; } diff --git a/utilities/test_suite/HOST/runAudioTests.py b/utilities/test_suite/HOST/runAudioTests.py index c05a7a011..70ec00026 100644 --- a/utilities/test_suite/HOST/runAudioTests.py +++ b/utilities/test_suite/HOST/runAudioTests.py @@ -37,7 +37,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 2 +caseMax = 3 # Checks if the folder path is empty, or is it a root folder, or if it exists, and remove its contents def validate_and_remove_files(path): @@ -235,13 +235,31 @@ def rpp_test_suite_parser_and_validator(): exit(0) for case in caseList: + if "--input_path" not in sys.argv: + if case == "3": + srcPath = scriptPath + "/../TEST_AUDIO_FILES/three_sample_multi_channel_src1" + else: + srcPath = inFilePath + if int(case) < 0 or int(case) > 3: + print(f"Invalid case number {case}. Case number must be 0-3 range!") + continue + run_unit_test(srcPath, case, numRuns, testType, batchSize, outFilePath) else: for case in caseList: + if "--input_path" not in sys.argv: + if case == "3": + srcPath = scriptPath + "/../TEST_AUDIO_FILES/three_sample_multi_channel_src1" + else: + srcPath = inFilePath + if int(case) < 0 or int(case) > 3: + print(f"Invalid case number {case}. 
Case number must be 0-3 range!") + continue + run_performance_test(loggingFolder, srcPath, case, numRuns, testType, batchSize, outFilePath) # print the results of qa tests -supportedCaseList = ['0', '1', '2'] +supportedCaseList = ['0', '1', '2', '3'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if testType == 0: diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index bd938e218..b08c4d5e8 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -28,6 +28,7 @@ import sys import datetime import shutil +import pandas as pd # Set the timestamp timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -37,6 +38,7 @@ inFilePath2 = scriptPath + "/../TEST_IMAGES/three_images_mixed_src2" ricapInFilePath = scriptPath + "/../TEST_IMAGES/three_images_150x150_src1" qaInputFile = scriptPath + "/../TEST_IMAGES/three_images_mixed_src1" +perfQaInputFile = scriptPath + "/../TEST_IMAGES/eight_images_mixed_src1" outFolderPath = os.getcwd() buildFolderPath = os.getcwd() @@ -113,12 +115,14 @@ def get_log_file_list(preserveOutput): # Functionality group finder def func_group_finder(case_number): - if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 31: + if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 31 or case_number == 45: return "color_augmentations" elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84: return "effects_augmentations" elif case_number < 40: return "geometric_augmentations" + elif case_number == 61: + return "arithmetic_operations" elif case_number < 87: return "data_exchange_operations" elif case_number < 88: @@ -126,7 +130,7 @@ def func_group_finder(case_number): else: return "miscellaneous" - # Generate a directory name based on certain parameters +# Generate a directory name based on certain parameters def directory_name_generator(qaMode, affinity, layoutType, case, path): if qaMode == 0: functionality_group = func_group_finder(int(case)) @@ -155,7 +159,11 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo print("--------------------------------") print("Running a New Functionality...") print("--------------------------------") - for bitDepth in range(7): + if qaMode: + maxBitdepth = 1 + else: + maxBitdepth = 7 + for bitDepth in range(maxBitdepth): print("\n\n\nRunning New Bit Depth...\n-------------------------\n\n") for outputFormatToggle in range(2): @@ -183,6 +191,16 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): + if qaMode == 1: + with open("{}/BatchPD_host_{}_raw_performance_log.txt".format(loggingFolder, log_file_layout), "a") as log_file: + process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + log_file_layout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + while True: + output = process.stdout.readline() + if not output and process.poll() is not None: + break + print(output.strip()) + log_file.write(output) + with 
open("{}/Tensor_host_{}_raw_performance_log.txt".format(loggingFolder, log_file_layout), "a") as log_file: print(f"./Tensor_host {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ") process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec @@ -198,8 +216,11 @@ def run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dst print("--------------------------------") print("Running a New Functionality...") print("--------------------------------") - - for bitDepth in range(7): + if qaMode: + maxBitdepth = 1 + else: + maxBitdepth = 7 + for bitDepth in range(maxBitdepth): print("\n\n\nRunning New Bit Depth...\n-------------------------\n\n") for outputFormatToggle in range(2): @@ -223,8 +244,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") + parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:89)") + parser.add_argument("--case_end", type = int, default = 89, help = "Testing range ending case # - (0:89)") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -239,10 +260,11 @@ def rpp_test_suite_parser_and_validator(): validate_path(args.input_path1) validate_path(args.input_path2) validate_path(qaInputFile) + validate_path(perfQaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)): + print("Starting case# and Ending case# must be in the 0:89 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -256,7 +278,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 89: print("Invalid input! 
Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -280,8 +302,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 89: + print("The case# must be in the 0:89 range!") exit(0) return args @@ -300,10 +322,14 @@ def rpp_test_suite_parser_and_validator(): batchSize = args.batch_size roiList = ['0', '0', '0', '0'] if args.roi is None else args.roi -if qaMode and batchSize != 3: +if qaMode and testType == 0 and batchSize != 3: print("QA mode can only run with a batch size of 3.") exit(0) +if qaMode and testType == 1 and batchSize != 8: + print("Performance QA mode can only run with a batch size of 8.") + exit(0) + # set the output folders and number of runs based on type of test (unit test / performance test) if(testType == 0): if qaMode: @@ -355,8 +381,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -371,19 +397,22 @@ def rpp_test_suite_parser_and_validator(): create_layout_directories(dstPath, layoutDict) else: for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue + # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs + if qaMode == 1 and case != "82": + srcPath1 = inFilePath1 + srcPath2 = inFilePath2 if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: - srcPath1 = ricapInFilePath - srcPath2 = ricapInFilePath + srcPath1 = ricapInFilePath + srcPath2 = ricapInFilePath for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) - run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '54', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: @@ -415,7 +444,146 @@ def rpp_test_suite_parser_and_validator(): if testType == 0 and qaMode == 0: create_layout_directories(dstPath, layoutDict) # Performance tests -elif (testType == 1): +elif (testType == 1 and qaMode == 1): + columns = ['BatchPD_Augmentation_Type', 'Tensor_Augmentation_Type', 'Performance Speedup (%)', 'Test_Result'] + tensorAugVariations = [] + batchPDAugVariations = [] + achievedPerf = [] + status = [] + df = pd.DataFrame(columns=columns) + tensorLogFileList = get_log_file_list(preserveOutput) + batchpdLogFileList = [sub.replace("Tensor_host", "BatchPD_host") for sub in tensorLogFileList] # will be needed only in qa mode + + stats = [] + tensorVal = [] + batchpdVal = [] + functions = [] + functionsBatchPD = [] + funcCount = 0 + performanceNoise = 10 + perfQASupportCaseList = ["resize", "color_twist", "phase"] + for i in range(3): + tensorLogFile = tensorLogFileList[i] + batchpdLogFile = batchpdLogFileList[i] + # Opening log file + try: + tensorFile = open(tensorLogFile,"r") + except IOError: + print("Skipping file -> "+ tensorLogFile) + continue + + # Opening log file + try: + batchpdFile = open(batchpdLogFile,"r") + except IOError: + print("Skipping file -> "+ batchpdLogFile) + continue + + prevLine = "" + # Loop over each line + for line in tensorFile: + if "max,min,avg wall times in ms/batch" in line and "u8_Tensor" in prevLine: + layoutCheck = "PKD3_toPKD3" in prevLine or "PLN3_toPLN3" in prevLine or "PLN1_toPLN1" in prevLine + interpolationCheck = "interpolationType" not in prevLine or "interpolationTypeBilinear" in prevLine + if layoutCheck and interpolationCheck: + splitWordStart = "Running " + splitWordEnd = " " + str(numRuns) + prevLine = prevLine.partition(splitWordStart)[2].partition(splitWordEnd)[0] + splitWordStart = "max,min,avg wall times in ms/batch = " + splitWordEnd = "\n" + if prevLine not in functions: + functions.append(prevLine) + stats = line.partition(splitWordStart)[2].partition(splitWordEnd)[0].split(",") + tensorVal.append(float(stats[2])) + funcCount += 1 + + if line != "\n": + prevLine = line + + # Closing log file + tensorFile.close() + + stats = [] + prevLine = "" + for line in batchpdFile: + if "max,min,avg" in line and "u8_BatchPD" in prevLine: + if "PKD3_toPKD3" in prevLine or 
"PLN3_toPLN3" in prevLine or "PLN1_toPLN1" in prevLine: + splitWordStart = "Running " + splitWordEnd = " " + str(numRuns) + prevLine = prevLine.partition(splitWordStart)[2].partition(splitWordEnd)[0] + splitWordStart = "max,min,avg" + splitWordEnd = "\n" + if prevLine not in functionsBatchPD: + functionsBatchPD.append(prevLine) + stats = line.partition(splitWordStart)[2].partition(splitWordEnd)[0].split(",") + batchpdVal.append(float(stats[2]) * float(1000.0)) + + if line != "\n": + prevLine = line + + # Closing log file + batchpdFile.close() + + print("---------------------------------- Results of QA Test - Tensor_host ----------------------------------\n") + qaFilePath = os.path.join(outFilePath, "QA_results.txt") + excelFilePath = os.path.join(outFilePath, "performance_qa_results.xlsx") + f = open(qaFilePath, 'w') + numLines = 0 + numPassed = 0 + removalList = ["_HOST", "_toPKD3", "_toPLN3", "_toPLN1"] + for i in range(len(functions)): + perfImprovement = int(((batchpdVal[i] - tensorVal[i]) / batchpdVal[i]) * 100) + numLines += 1 + funcName = functions[i] + caseName = funcName.split("_u8_")[0] + for string in removalList: + funcName = funcName.replace(string, "") + if caseName not in perfQASupportCaseList: + print("Error! QA mode is not yet available for variant: " + funcName) + continue + achievedPerf.append(perfImprovement) + tensorAugVariations.append(funcName) + if perfImprovement > -performanceNoise: + numPassed += 1 + status.append("PASSED") + print(funcName + ": PASSED") + else: + status.append("FAILED") + print(funcName + ": FAILED") + + resultsInfo = "\n\nFinal Results of Tests:" + resultsInfo += "\n - Total test cases including all subvariants REQUESTED = " + str(numLines) + resultsInfo += "\n - Total test cases including all subvariants PASSED = " + str(numPassed) + f.write(resultsInfo) + batchPDAugVariations = [s.replace('Tensor', 'BatchPD') for s in tensorAugVariations] + df['Tensor_Augmentation_Type'] = tensorAugVariations + df['BatchPD_Augmentation_Type'] = batchPDAugVariations + df['Performance Speedup (%)'] = achievedPerf + df['Test_Result'] = status + # Calculate the number of cases passed and failed + passedCases = df['Test_Result'].eq('PASSED').sum() + failedCases = df['Test_Result'].eq('FAILED').sum() + + summary_row = {'BatchPD_Augmentation_Type': pd.NA, + 'Tensor_Augmentation_Type': pd.NA, + 'Performance Speedup (%)': pd.NA, + 'Test_Result': f'Final Results of Tests: Passed: {passedCases}, Failed: {failedCases}'} + + print("\n", df.to_markdown()) + + # Append the summary row to the DataFrame + # Convert the dictionary to a DataFrame + summary_row = pd.DataFrame([summary_row]) + df = pd.concat([df, summary_row], ignore_index=True) + + df.to_excel(excelFilePath, index=False) + print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") + print("\nIMPORTANT NOTE:") + print("- The following performance comparison shows Performance Speedup percentages between times measured on previous generation RPP-BatchPD APIs against current generation RPP-Tensor APIs.") + print(f"- All APIs have been improved for performance ranging from {0}% (almost same) to {100}% faster.") + print("- Random observations of negative speedups might always occur due to current test machine temperature/load variances or other CPU/GPU state-dependent conditions.") + print("\n-------------------------------------------------------------------\n") +elif (testType == 1 and qaMode == 0): 
log_file_list = get_log_file_list(preserveOutput) functionality_group_list = [ @@ -423,6 +591,8 @@ "data_exchange_operations", "effects_augmentations", "geometric_augmentations", + "arithmetic_operations", + "statistical_operations", ] for log_file in log_file_list: @@ -485,4 +655,4 @@ print("No variants under this category") # Closing log file - f.close() \ No newline at end of file + f.close() diff --git a/utilities/test_suite/HOST/runTests_voxel.py b/utilities/test_suite/HOST/runTests_voxel.py index 97b6eb3d7..93318d8c7 100644 --- a/utilities/test_suite/HOST/runTests_voxel.py +++ b/utilities/test_suite/HOST/runTests_voxel.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 4 +caseMax = 5 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -185,8 +185,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing range starting case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") - parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing range ending case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing end case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? 
Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -232,8 +232,8 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print("The case# must be in the 0:1 range!") - exit(0) + print("The case# must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + exit(0) # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if args.qa_mode: @@ -321,7 +321,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize) # print the results of qa tests -supportedCaseList = ['0', '1', '4'] +supportedCaseList = ['0', '1', '2', '3', '4', '5'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md index cc4c662f0..067bedb1d 100644 --- a/utilities/test_suite/README.md +++ b/utilities/test_suite/README.md @@ -80,7 +80,12 @@ This repository contains three test suites for the AMD ROCm Performance Primitiv sudo make -j$nproc install ``` -## RPP Image Test Suite +* Openpyxl + ``` + pip install openpyxl + ``` + +## RPP Image Test Suite The image test suite can be executed under 2 backend scenarios - (HOST/HIP): - HOST backend - (On a CPU with HOST backend) - HIP backend - (On a GPU with HIP backend) @@ -89,8 +94,8 @@ The image test suite accepts the following command line arguments: - input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1 - input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2 -- case_start: The starting case number for the test range (0-87). Default is 0 -- case_end: The ending case number for the test range (0-87). Default is 87 +- case_start: The starting case number for the test range (0-89). Default is 0 +- case_end: The ending case number for the test range (0-89). Default is 89 - test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0 - case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type - profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend @@ -112,23 +117,27 @@ The test suite can be run with the following command: python runTests.py --input_path1 --input_path2 --case_start --case_end --test_type --profiling ``` -### Modes of operation (RPP Image Test Suite) -- QA mode - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. +## Modes of operation (RPP Image Test Suite) +- QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. Note: QA mode is not supported for case 84 due to run-to-run variation of outputs. 
``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 1 --batch_size 3 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3 +``` +- QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement +``` python +python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 --num_runs 100 ``` - Unit test mode - Unit tests allowing users to pass a path to a folder containing images, to execute the desired functionality and variant once, report RPP execution wall time, save and view output images Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 0 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0 ``` - Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants. Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 1 +python runTests.py --case_start 0 --case_end 89 --test_type 1 ``` To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below diff --git a/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin new file mode 100644 index 000000000..696f0daa5 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin new file mode 100644 index 000000000..fbf86994b Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin deleted file mode 100644 index dacf51e6e..000000000 Binary files a/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin and /dev/null differ diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin new file mode 100644 index 000000000..cb7c8bb84 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin new file mode 100644 index 000000000..628d3785b Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin 
b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin new file mode 100644 index 000000000..aae9ff96c Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin new file mode 100644 index 000000000..9b9328536 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/TEST_AUDIO_FILES/single_sample_multi_channel_src1/sample.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav similarity index 100% rename from utilities/test_suite/TEST_AUDIO_FILES/single_sample_multi_channel_src1/sample.wav rename to utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav new file mode 100644 index 000000000..4847f78cd Binary files /dev/null and b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav differ diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav new file mode 100644 index 000000000..a506e1762 Binary files /dev/null and b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav differ diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1024x768.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1024x768.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1024x768.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1024x768.jpg diff --git a/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg new file mode 100644 index 000000000..8995dbbb6 Binary files /dev/null and b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg differ diff --git a/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg new file mode 100644 index 000000000..a283d2472 Binary files /dev/null and b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg differ diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1920x1080.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1920x1080.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1920x1080.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1920x1080.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img224x224.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img224x224.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img224x224.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img224x224.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img256x256.jpg 
b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img256x256.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img256x256.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img256x256.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img300x300.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img300x300.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img300x300.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img300x300.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img3840x2160.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img3840x2160.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img3840x2160.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img3840x2160.jpg diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h index 2ac174042..ec962270a 100644 --- a/utilities/test_suite/rpp_test_suite_audio.h +++ b/utilities/test_suite/rpp_test_suite_audio.h @@ -39,6 +39,7 @@ std::map audioAugmentationMap = {0, "non_silent_region_detection"}, {1, "to_decibels"}, {2, "pre_emphasis_filter"}, + {3, "down_mixing"}, }; // Golden outputs for Non Silent Region Detection @@ -137,7 +138,7 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst // read data from golden outputs Rpp64u oBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride; Rpp32f *refOutput = static_cast(malloc(oBufferSize * sizeof(float))); - string outFile = scriptPath + testCase + "/" + testCase + ".bin"; + string outFile = scriptPath + "/../REFERENCE_OUTPUTS_AUDIO/" + testCase + "/" + testCase + ".bin"; std::fstream fin(outFile, std::ios::in | std::ios::binary); if(fin.is_open()) { diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index c227567b5..58fee0c5d 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -86,8 +86,10 @@ std::map augmentationMap = {37, "crop"}, {38, "crop_mirror_normalize"}, {39, "resize_crop_mirror"}, + {45, "color_temperature"}, {49, "box_filter"}, {54, "gaussian_filter"}, + {61, "magnitude"}, {63, "phase"}, {70, "copy"}, {80, "resize_mirror_normalize"}, @@ -97,7 +99,30 @@ std::map augmentationMap = {84, "spatter"}, {85, "swap_channels"}, {86, "color_to_greyscale"}, - {87, "tensor_sum"} + {87, "tensor_sum"}, + {88, "tensor_min"}, + {89, "tensor_max"}, +}; + +// Golden outputs for Tensor min Kernel +std::map> TensorMinReferenceOutputs = +{ + {1, {1, 1, 7}}, + {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}} +}; + +// Golden outputs for Tensor max Kernel +std::map> TensorMaxReferenceOutputs = +{ + {1, {239, 245, 255}}, + {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}} +}; + +// Golden outputs for Tensor sum Kernel +std::map> TensorSumReferenceOutputs = +{ + {1, {334225, 813471, 2631125}}, + {3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}} }; template @@ -1050,8 +1075,6 @@ inline void compare_output(T* output, string funcName, RpptDescPtr srcDescPtr, R binFile += "_noiseType" + noiseTypeName; } refFile = scriptPath + "/../REFERENCE_OUTPUT/" + funcName + "/"+ binFile + ".bin"; - string line,word; - int index = 0; int fileMatch = 0; Rpp8u *binaryContent = (Rpp8u 
*)malloc(binOutputSize * sizeof(Rpp8u)); @@ -1088,17 +1111,14 @@ inline void compare_output(T* output, string funcName, RpptDescPtr srcDescPtr, R free(binaryContent); } -inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPtr srcDescPtr, int testCase, string dst, string scriptPath) +// compares reduction type functions outputs +template +inline void compare_reduction_output(T* output, string funcName, RpptDescPtr srcDescPtr, int testCase, string dst, string scriptPath) { string func = funcName; - string refFile = ""; - int pln1RefStride = srcDescPtr->n * 4; - Rpp64u binaryOutputSize = srcDescPtr->n * 5; - string dataType[4] = {"_u8_", "_f16_", "_f32_", "_i8_"}; func += dataType[srcDescPtr->dataType]; - std::string binFile = func + "Tensor"; if(srcDescPtr->layout == RpptLayout::NHWC) func += "Tensor_PKD3"; @@ -1110,21 +1130,29 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt func += "Tensor_PLN1"; } - refFile = scriptPath + "/../REFERENCE_OUTPUT/" + funcName + "/"+ binFile + ".bin"; - - string line,word; - int index = 0; int fileMatch = 0; int matched_values = 0; - Rpp64u *binaryContent = (Rpp64u *)malloc(binaryOutputSize * sizeof(Rpp64u)); - read_bin_file(refFile, binaryContent); + + T *refOutput; + refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T)); + int numChannels = (srcDescPtr->c == 1) ? 1 : 3; + int numOutputs = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4; + std::vector ref; + if(testCase == 88) + ref = TensorMinReferenceOutputs[numChannels]; + else if(testCase == 89) + ref = TensorMaxReferenceOutputs[numChannels]; + else if(testCase == 87) + ref = TensorSumReferenceOutputs[numChannels]; + + for (int i = 0; i < numOutputs; i++) + refOutput[i] = (T)ref[i]; if(srcDescPtr->c == 1) { - binaryContent += pln1RefStride; for(int i = 0; i < srcDescPtr->n; i++) { - int diff = output[i] - binaryContent[i]; + int diff = abs(static_cast(output[i] - refOutput[i])); if(diff <= CUTOFF) fileMatch++; } @@ -1136,7 +1164,7 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt matched_values = 0; for(int j = 0; j < 4; j++) { - int diff = output[(i * 4) + j] - binaryContent[(i * 4) + j]; + int diff = abs(static_cast(output[(i * 4) + j] - refOutput[(i * 4) + j])); if(diff <= CUTOFF) matched_values++; } @@ -1144,6 +1172,7 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt fileMatch++; } } + free(refOutput); std::cout << std::endl << "Results for " << func << " :" << std::endl; std::string status = func + ": "; @@ -1166,7 +1195,14 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt qaResults << status << std::endl; qaResults.close(); } - free(binaryContent); +} + +// print array of any bit depth for specified length +template +inline void print_array(T *src, Rpp32u length, Rpp32u precision) +{ + for (int i = 0; i < length; i++) + std::cout << " " << std::fixed << std::setprecision(precision) << static_cast(src[i]) << " "; } // Used to randomly swap values present in array of size n @@ -1255,4 +1291,4 @@ void inline init_ricap(int width, int height, int batchSize, Rpp32u *permutation roiPtrInputCropRegion[1].xywhROI = {randrange(0, part0Width - 8), randrange(0, height - part0Height), width - part0Width, part0Height}; roiPtrInputCropRegion[2].xywhROI = {randrange(0, width - part0Width - 8), randrange(0, part0Height), part0Width, height - part0Height}; roiPtrInputCropRegion[3].xywhROI = {randrange(0, part0Width - 8), randrange(0, 
part0Height), width - part0Width, height - part0Height}; -} +} \ No newline at end of file diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h index 8ed011ad6..d72a959d7 100644 --- a/utilities/test_suite/rpp_test_suite_voxel.h +++ b/utilities/test_suite/rpp_test_suite_voxel.h @@ -61,7 +61,10 @@ std::map augmentationMap = { {0, "fused_multiply_add_scalar"}, {1, "slice"}, + {2, "add_scalar"}, + {3, "subtract_scalar"}, {4, "flip_voxel"}, + {5, "multiply_scalar"} }; void replicate_last_file_to_fill_batch(const string& lastFilePath, vector& filePathVector, vector& fileNamesVector, const string& lastFileName, int noOfFiles, int batchCount)
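
The performance QA mode added to utilities/test_suite/HOST/runTests.py above (test_type 1 with qa_mode 1) parses the average wall time from each Tensor_host log and its matching BatchPD_host log, then passes a variant when the measured slowdown stays within the 10% noise margin (performanceNoise). The snippet below is a minimal, illustrative sketch of that pass/fail rule only; the function and variable names are not the script's own.

``` python
# Simplified sketch of the speedup check applied by the performance QA mode in
# utilities/test_suite/HOST/runTests.py (names here are illustrative only).
PERFORMANCE_NOISE = 10  # tolerated slowdown, in percent, before a variant is failed

def speedup_status(batchpd_avg_ms: float, tensor_avg_ms: float):
    # Positive speedup means the Tensor API is faster than the legacy BatchPD API
    speedup_percent = int(((batchpd_avg_ms - tensor_avg_ms) / batchpd_avg_ms) * 100)
    return speedup_percent, "PASSED" if speedup_percent > -PERFORMANCE_NOISE else "FAILED"

# Example: BatchPD at 2.0 ms/batch vs Tensor at 1.5 ms/batch -> (25, 'PASSED')
print(speedup_status(2.0, 1.5))
```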