diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index b61af9dde..3cb49a82b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -78,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
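+ * An illustrative usage sketch (not part of this header) for a U8, 3-channel batch, assuming srcPtr, srcDescPtr, roiTensorPtrSrc and handle are already initialized:
+ * \code{.cpp}
+ * // For c == 3 the kernel writes 4 values per image: R min, G min, B min, overall min
+ * std::vector<Rpp8u> minArr(srcDescPtr->n * 4);
+ * rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), minArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * Rpp8u overallMinImage0 = minArr[3];    // minArr[0..2] hold the R/G/B minima of image 0
+ * \endcode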
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - minArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HOST memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
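+ * An illustrative allocation sketch (not part of this header), assuming a U8 source with c == 3 and device-side srcPtr/ROI buffers (d_srcPtr, d_roiTensorPtrSrc):
+ * \code{.cpp}
+ * Rpp8u *d_imageMinArr;
+ * Rpp32u imageMinArrLength = srcDescPtr->n * 4;    // 4 values per image: R min, G min, B min, overall min
+ * hipMalloc((void **)&d_imageMinArr, imageMinArrLength * sizeof(Rpp8u));
+ * rppt_tensor_min_gpu(d_srcPtr, srcDescPtr, d_imageMinArr, imageMinArrLength, d_roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * \endcode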
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - imageMinArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] imageMinArr destination array in HIP memory + * \param [in] imageMinArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then imageMinArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then imageMinArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, Rpp32u imageMinArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
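+ * An illustrative sketch (not part of this header) pairing this call with rppt_tensor_min_host to get the per-image dynamic range of a greyscale U8 batch (c == 1), assuming buffers and handle are already set up:
+ * \code{.cpp}
+ * std::vector<Rpp8u> minArr(srcDescPtr->n), maxArr(srcDescPtr->n);    // one value per image when c == 1
+ * rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), minArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * rppt_tensor_max_host(srcPtr, srcDescPtr, maxArr.data(), maxArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * int rangeImage0 = maxArr[0] - minArr[0];    // e.g. usable for contrast stretching
+ * \endcode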
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - maxArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HOST memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
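+ * An illustrative sketch (not part of this header) for reading the device-side results back after the call completes, assuming d_imageMaxArr/imageMaxArrLength were set up with hipMalloc for a U8 source:
+ * \code{.cpp}
+ * std::vector<Rpp8u> hostMax(imageMaxArrLength);
+ * hipMemcpy(hostMax.data(), d_imageMaxArr, imageMaxArrLength * sizeof(Rpp8u), hipMemcpyDeviceToHost);
+ * \endcode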
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HIP memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 86391108d..1e748cc86 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -5996,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4 } -#endif //RPP_CPU_COMMON_H \ No newline at end of file +inline void reduce_min_32_host(__m256i *pMin, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm256_castsi256_si128(pMin[0]); + px[1] = _mm256_extracti128_si256(pMin[0], 1); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi8(zero, px[0]); + px[0] = _mm_unpackhi_epi8(zero, px[0]); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi16(zero, px[0]); + px[0] = _mm_unpackhi_epi16(zero, px[0]); + px[0] = _mm_min_epu16(px[0], px[1]); + px[1] = _mm_unpacklo_epi32(zero, px[0]); + px[0] = _mm_unpackhi_epi32(zero, px[0]); + px[0] = _mm_min_epu32(px[0], px[1]); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = 
_mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_32_host(__m256i *pMax, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values 
into global max + pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_float8_host(__m256 *p1, __m256 *pMin) +{ + pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min +} + +inline void reduce_min_float8_host(__m256 *pMin, __m128 *result) +{ + __m128 px; + px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1)); + px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB) +{ + pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min + pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min + pMinB[0] = _mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min +} + +inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // TO CHANGE +{ + __m128 px[2]; + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1)); + px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1)); + px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1)); + px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void compute_max_float8_host(__m256 *p1, __m256 *pMax) +{ + pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global min +} + +inline void reduce_max_float8_host(__m256 *pMax, __m128 *result) +{ + __m128 px; + px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1)); + px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB) +{ + pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global min + pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global min + pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global min +} + +inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result) +{ + __m128 px[2]; + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1)); + 
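+    // The remaining steps fold the R and G partial maxima into the low 128-bit lane (two candidates per channel) and the B partial maxima into the high lane; the caller finishes with scalar std::max over the stored result.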
px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1)); + px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1)); + px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void reduce_min_i32_host(__m256i *pMin, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1)); + px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_i32_host(__m256i *pMax, __m128i *result) +{ + __m128i px[2]; + 
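+    // Repeated unpack-with-zero followed by signed max folds the 32 bytes down to two candidate bytes; the final shuffle places them in bytes 0 and 1 of the result.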
__m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(zero, px[0]), _mm_unpackhi_epi8(zero, px[0])); + px[0] = _mm_max_epi16(_mm_unpacklo_epi16(zero, px[0]), _mm_unpackhi_epi16(zero, px[0])); + px[0] = _mm_max_epi32(_mm_unpacklo_epi32(zero, px[0]), _mm_unpackhi_epi32(zero, px[0])); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +#endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index ff30de027..d03ec0e79 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -75,7 +75,7 @@ typedef union #define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name) -const __m128 xmm_p0 = _mm_set1_ps(0.0f); +const __m128 xmm_p0 = _mm_setzero_ps(); const __m128 xmm_p1 = _mm_set1_ps(1.0f); const __m128 xmm_p2 = _mm_set1_ps(2.0f); const __m128 xmm_pm2 = _mm_set1_ps(-2.0f); @@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray) printf("\n"); for (int ct = 0; ct < 32; ct++) { - printf("%d ", printArray[ct]); + 
printf("%d ", (unsigned char)printArray[ct]); } } @@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p) p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */ } +inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p) +{ + p[0] = _mm256_loadu_si256((__m256i *)srcPtrR); + p[1] = _mm256_loadu_si256((__m256i *)srcPtrG); + p[2] = _mm256_loadu_si256((__m256i *)srcPtrB); +} + +inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p) +{ + p[0] = _mm256_load_si256((__m256i *)srcPtrR); + p[1] = _mm256_load_si256((__m256i *)srcPtrG); + p[2] = _mm256_load_si256((__m256i *)srcPtrB); +} + inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p) { __m128 p128[8]; @@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p) _mm256_storeu_pd(dstPtr, p[0]); } +inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p) +{ + _mm_storeu_si128((__m128i *)dstPtr, p[0]); +} + +inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p) +{ + _mm_store_si128((__m128i *)dstPtr, p[0]); +} + inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p) { __m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index a7412aa2d..d9c0ce02d 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k } } +// Retrieve Min and Max given a datatype + +inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); } + /******************** DEVICE FUNCTIONS ********************/ // -------------------- Set 0 - Range checks and Range adjustment -------------------- @@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr, // /******************** DEVICE MATH HELPER FUNCTIONS ********************/ +// float8 min + +__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + +// float8 max + +__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + // d_float16 floor __device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16) diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index dae3e6236..32b8b62b5 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp 
@@ -26,5 +26,7 @@ SOFTWARE. #define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" #endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp new file mode 100644 index 000000000..0380f4ef6 --- /dev/null +++ b/src/modules/cpu/kernel/tensor_max.hpp @@ -0,0 +1,847 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u max = 0; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_setzero_si256(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < 
bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_32_host(&pMax, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_setzero_si256(); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_setzero_si128(); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + 
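+                // Fold the three 16-byte accumulators; the per-channel maxima land in bytes 0-2 of the result and are merged with the scalar-loop remainders below.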
reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + 
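+                    // Load 8 floats from each of the R, G and B planes and keep running per-channel vector maxima.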
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = 
srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max((Rpp32f)*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = (Rpp16f)max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max((Rpp32f)*srcPtrTempR++, maxR); + maxG = std::max((Rpp32f)*srcPtrTempG++, maxG); + maxB = std::max((Rpp32f)*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += 
srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); + +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max((Rpp32f)srcPtrTemp[0], maxR); + maxG = std::max((Rpp32f)srcPtrTemp[1], maxG); + maxB = std::max((Rpp32f)srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u 
vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s max = INT8_MIN; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_set1_epi8(INT8_MIN); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epi8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i32_host(&pMax, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_set1_epi8(INT8_MIN); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s maxC = INT8_MIN, maxR = 
INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_set1_epi8(INT8_MIN); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp new file mode 100644 index 000000000..15b9b77ba --- /dev/null +++ b/src/modules/cpu/kernel/tensor_min.hpp @@ -0,0 +1,845 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u min = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)255); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epu8(p1, pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_32_host(&pMin, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)255); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = 
std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)255); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_min_48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + + __m128i result; + reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + 
vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 
pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((Rpp32f)*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = 
std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = (Rpp16f) min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min((Rpp32f)*srcPtrTempR++, minR); + minG = std::min((Rpp32f)*srcPtrTempG++, minG); + minB = std::min((Rpp32f)*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + 
srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min((Rpp32f)srcPtrTemp[0], minR); + minG = std::min((Rpp32f)srcPtrTemp[1], minG); + minB = std::min((Rpp32f)srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s min = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)127); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epi8(p1, pMin); //compare and store min of 32 values into global min + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((*srcPtrTemp++), min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128i result; + reduce_min_i32_host(&pMin, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + 
srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)127); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_i96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)127); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_min_i48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index 328a232a1..c79e0a951 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -23,8 +23,9 @@ SOFTWARE. 
*/ #ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP -#define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" -#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp new file mode 100644 index 000000000..b47fce024 --- /dev/null +++ b/src/modules/hip/kernel/tensor_max.hpp @@ -0,0 +1,400 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local mmemory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]); + partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]); + partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMax_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMax_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMax_smem[0]; + dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0])); + } +} + +template +__global__ void tensor_max_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + 
partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_max_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + uint srcIdx = (id_z * srcStridesNH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if 
(hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after 
max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + + rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + 
hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_max(T *srcPtr, + RpptDescPtr srcDescPtr, + U *maxArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float minimum = bitDepthMinMax_f2.x; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git 
a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp new file mode 100644 index 000000000..a883c4f3b --- /dev/null +++ b/src/modules/hip/kernel/tensor_min.hpp @@ -0,0 +1,410 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]); + partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]); + partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMin_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMin_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMin_smem[0]; + dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0])); + } +} + +template +__global__ void tensor_min_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = 
(xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_min_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + + uint srcIdx = (id_z * srcStridesNH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3; + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = 
fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y 
dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + minArr[(hipBlockIdx_z * hipGridDim_y + 
hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_min(T *srcPtr, + RpptDescPtr srcDescPtr, + U *minArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle &handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float maximum = bitDepthMinMax_f2.y; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff 
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f17028e5e..28313a88f 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (minArrLength < srcDescPtr->n) // 1 min for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_min_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_min_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp16f*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_min_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_min_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +}
 + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (maxArrLength < srcDescPtr->n) // 1 max for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (maxArrLength < srcDescPtr->n * 4) // max of each channel, and max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_max_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_max_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp16f*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_max_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_max_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ @@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, return RPP_SUCCESS; } +
 +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMinArr, + Rpp32u imageMinArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMinArrLength < srcDescPtr->n) // min of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_min(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_min((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<half*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_min((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_min(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +}
 + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMaxArr, + Rpp32u imageMaxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMaxArrLength < srcDescPtr->n) // max of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_max(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_max((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<half*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_max((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_max(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(imageMaxArr), + roiTensorPtrSrc,
roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} #endif // backend diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 48537de29..7bd46b39e 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -68,9 +68,9 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); - bool reductionTypeCase = (testCase == 87); unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; @@ -323,23 +323,20 @@ int main(int argc, char **argv) double wallTime; string testCaseName; - // Initialize buffers for any reductionType functions + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - - if(reductionTypeCase) + if (reductionTypeCase) { - if(dstDescPtr->dataType == RpptDataType::U8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s))); + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + bitDepthByteSize = (testCase == 87) ? 
sizeof(Rpp64u) : sizeof(Rpp8u); + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); } - //Allocate hip memory for src/dst + // Allocate hip memory for src/dst CHECK(hipMalloc(&d_input, inputBufferSize)); CHECK(hipMalloc(&d_output, outputBufferSize)); if(dualInputCase) @@ -1044,6 +1041,30 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1071,33 +1092,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1105,7 +1134,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index cabc4015f..2e8054332 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -315,11 +315,11 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") - parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") - parser.add_argument('--case_list', nargs = "+", help 
= "List of case numbers to list", required = False) - parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False) + parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)") + parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)") + parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)") + parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False) + parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)") parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests") @@ -334,8 +334,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 90) or (args.case_end < 0 or args.case_end > 90)): + print("Starting case# and Ending case# must be in the 0:90 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -349,7 +349,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 90: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -376,8 +376,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 90: + print("The case# must be in the 0:90 range!") exit(0) return args @@ -458,8 +458,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -476,8 +476,8 @@ def rpp_test_suite_parser_and_validator(): else: if (testType == 1 and profilingOption == "NO"): for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -491,8 +491,8 @@ def rpp_test_suite_parser_and_validator(): NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -696,7 +696,7 @@ def rpp_test_suite_parser_and_validator(): f.close() # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index fd198ba23..b698a2def 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -68,11 +68,12 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); + unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; - bool reductionTypeCase = (testCase == 87); int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])}; string scriptPath = argv[19]; @@ -140,6 +141,11 @@ int main(int argc, char **argv) std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; exit(0); } + else if(testCase == 82 && batchSize < 2) + { + std::cerr<<"\n RICAP only works with BatchSize > 1"; + exit(0); + } // Get function name string funcName = augmentationMap[testCase]; @@ -310,6 +316,24 @@ int main(int argc, char **argv) input_second = static_cast(calloc(inputBufferSize, 1)); output = static_cast(calloc(outputBufferSize, 1)); + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) + void *reductionFuncResultArr; + Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; + if (reductionTypeCase) + { + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + { + bitDepthByteSize = (testCase == 87) ? 
sizeof(Rpp64u) : sizeof(Rpp8u); + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + { + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + } + // Set the number of threads to be used by OpenMP pragma for RPP batch processing on host. // If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size Rpp32u numThreads = 0; @@ -321,21 +345,6 @@ int main(int argc, char **argv) double cpuTime, wallTime; string testCaseName; - // Initialize buffers for any reductionType functions - void *reductionFuncResultArr; - Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - if(reductionTypeCase) - { - if(dstDescPtr->dataType == RpptDataType::U8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64s))); - } - // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) @@ -1050,6 +1059,40 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1082,33 +1125,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1116,7 +1167,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index b40a0bf01..b08c4d5e8 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -244,8 +244,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") + parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:89)") + parser.add_argument("--case_end", type = int, default = 89, help = 
"Testing range ending case # - (0:89)") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -263,8 +263,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(perfQaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)): + print("Starting case# and Ending case# must be in the 0:89 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -278,7 +278,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 89: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -302,8 +302,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 89: + print("The case# must be in the 0:89 range!") exit(0) return args @@ -381,8 +381,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -397,8 +397,8 @@ def rpp_test_suite_parser_and_validator(): create_layout_directories(dstPath, layoutDict) else: for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if qaMode == 1 and case != "82": @@ -412,7 +412,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md index 76ecd9551..067bedb1d 100644 --- a/utilities/test_suite/README.md +++ b/utilities/test_suite/README.md @@ -94,8 +94,8 @@ The image test suite can be executed under 2 backend scenarios - (HOST/HIP): The image test suite accepts the following command line arguments: - input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1 - input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2 -- case_start: The starting case number for the test range (0-87). Default is 0 -- case_end: The ending case number for the test range (0-87). Default is 87 +- case_start: The starting case number for the test range (0-89). Default is 0 +- case_end: The ending case number for the test range (0-89). Default is 89 - test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0 - case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type - profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend @@ -121,7 +121,7 @@ python runTests.py --input_path1 --input_path2 --cas - QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. Note: QA mode is not supported for case 84 due to run-to-run variation of outputs. 
``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 1 --batch_size 3 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3 ``` - QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement ``` python @@ -131,13 +131,13 @@ python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 0 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0 ``` - Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants. Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 1 +python runTests.py --case_start 0 --case_end 89 --test_type 1 ``` To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index 2bc914af9..58fee0c5d 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -99,11 +99,27 @@ std::map augmentationMap = {84, "spatter"}, {85, "swap_channels"}, {86, "color_to_greyscale"}, - {87, "tensor_sum"} + {87, "tensor_sum"}, + {88, "tensor_min"}, + {89, "tensor_max"}, +}; + +// Golden outputs for Tensor min Kernel +std::map> TensorMinReferenceOutputs = +{ + {1, {1, 1, 7}}, + {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}} +}; + +// Golden outputs for Tensor max Kernel +std::map> TensorMaxReferenceOutputs = +{ + {1, {239, 245, 255}}, + {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}} }; // Golden outputs for Tensor sum Kernel -std::map> TensorSumReferenceOutputs = +std::map> TensorSumReferenceOutputs = { {1, {334225, 813471, 2631125}}, {3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}} @@ -1118,11 +1134,19 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src int matched_values = 0; T *refOutput; + refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T)); int numChannels = (srcDescPtr->c == 1) ? 1 : 3; int numOutputs = (srcDescPtr->c == 1) ? 
srcDescPtr->n : srcDescPtr->n * 4; - std::vector ref; - if(testCase == 87) - refOutput = TensorSumReferenceOutputs[numChannels].data(); + std::vector ref; + if(testCase == 88) + ref = TensorMinReferenceOutputs[numChannels]; + else if(testCase == 89) + ref = TensorMaxReferenceOutputs[numChannels]; + else if(testCase == 87) + ref = TensorSumReferenceOutputs[numChannels]; + + for (int i = 0; i < numOutputs; i++) + refOutput[i] = (T)ref[i]; if(srcDescPtr->c == 1) { @@ -1148,6 +1172,7 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src fileMatch++; } } + free(refOutput); std::cout << std::endl << "Results for " << func << " :" << std::endl; std::string status = func + ": "; @@ -1172,6 +1197,14 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src } } +// print array of any bit depth for specified length +template +inline void print_array(T *src, Rpp32u length, Rpp32u precision) +{ + for (int i = 0; i < length; i++) + std::cout << " " << std::fixed << std::setprecision(precision) << static_cast(src[i]) << " "; +} + // Used to randomly swap values present in array of size n inline void randomize(unsigned int arr[], unsigned int n) {