diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index b61af9dde..3cb49a82b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -78,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
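+ * An illustrative usage sketch (not part of this header) for a U8, 3-channel batch, assuming srcPtr, srcDescPtr, roiTensorPtrSrc and handle are already initialized:
+ * \code{.cpp}
+ * // For c == 3 the kernel writes 4 values per image: R min, G min, B min, overall min
+ * std::vector<Rpp8u> minArr(srcDescPtr->n * 4);
+ * rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), minArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * Rpp8u overallMinImage0 = minArr[3];    // minArr[0..2] hold the R/G/B minima of image 0
+ * \endcode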
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - minArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HOST memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
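+ * An illustrative allocation sketch (not part of this header), assuming a U8 source with c == 3 and device-side srcPtr/ROI buffers (d_srcPtr, d_roiTensorPtrSrc):
+ * \code{.cpp}
+ * Rpp8u *d_imageMinArr;
+ * Rpp32u imageMinArrLength = srcDescPtr->n * 4;    // 4 values per image: R min, G min, B min, overall min
+ * hipMalloc((void **)&d_imageMinArr, imageMinArrLength * sizeof(Rpp8u));
+ * rppt_tensor_min_gpu(d_srcPtr, srcDescPtr, d_imageMinArr, imageMinArrLength, d_roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * \endcode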
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - imageMinArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] imageMinArr destination array in HIP memory + * \param [in] imageMinArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then imageMinArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then imageMinArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, Rpp32u imageMinArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
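+ * An illustrative sketch (not part of this header) pairing this call with rppt_tensor_min_host to get the per-image dynamic range of a greyscale U8 batch (c == 1), assuming buffers and handle are already set up:
+ * \code{.cpp}
+ * std::vector<Rpp8u> minArr(srcDescPtr->n), maxArr(srcDescPtr->n);    // one value per image when c == 1
+ * rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), minArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * rppt_tensor_max_host(srcPtr, srcDescPtr, maxArr.data(), maxArr.size(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * int rangeImage0 = maxArr[0] - minArr[0];    // e.g. usable for contrast stretching
+ * \endcode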
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - maxArr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HOST memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
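+ * An illustrative sketch (not part of this header) for reading the device-side results back after the call completes, assuming d_imageMaxArr/imageMaxArrLength were set up with hipMalloc for a U8 source:
+ * \code{.cpp}
+ * std::vector<Rpp8u> hostMax(imageMaxArrLength);
+ * hipMemcpy(hostMax.data(), d_imageMaxArr, imageMaxArrLength * sizeof(Rpp8u), hipMemcpyDeviceToHost);
+ * \endcode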
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HIP memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 86391108d..1e748cc86 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -5996,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4 } -#endif //RPP_CPU_COMMON_H \ No newline at end of file +inline void reduce_min_32_host(__m256i *pMin, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm256_castsi256_si128(pMin[0]); + px[1] = _mm256_extracti128_si256(pMin[0], 1); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi8(zero, px[0]); + px[0] = _mm_unpackhi_epi8(zero, px[0]); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi16(zero, px[0]); + px[0] = _mm_unpackhi_epi16(zero, px[0]); + px[0] = _mm_min_epu16(px[0], px[1]); + px[1] = _mm_unpacklo_epi32(zero, px[0]); + px[0] = _mm_unpackhi_epi32(zero, px[0]); + px[0] = _mm_min_epu32(px[0], px[1]); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = 
_mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_32_host(__m256i *pMax, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values 
into global max + pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_float8_host(__m256 *p1, __m256 *pMin) +{ + pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min +} + +inline void reduce_min_float8_host(__m256 *pMin, __m128 *result) +{ + __m128 px; + px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1)); + px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB) +{ + pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min + pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min + pMinB[0] = _mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min +} + +inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // TO CHANGE +{ + __m128 px[2]; + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1)); + px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1)); + px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1)); + px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void compute_max_float8_host(__m256 *p1, __m256 *pMax) +{ + pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global min +} + +inline void reduce_max_float8_host(__m256 *pMax, __m128 *result) +{ + __m128 px; + px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1)); + px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB) +{ + pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global min + pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global min + pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global min +} + +inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result) +{ + __m128 px[2]; + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1)); + 
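+    // The remaining steps fold the R and G partial maxima into the low 128-bit lane (two candidates per channel) and the B partial maxima into the high lane; the caller finishes with scalar std::max over the stored result.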
px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1)); + px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1)); + px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void reduce_min_i32_host(__m256i *pMin, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1)); + px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_i32_host(__m256i *pMax, __m128i *result) +{ + __m128i px[2]; + 
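+    // Repeated unpack-with-zero followed by signed max folds the 32 bytes down to two candidate bytes; the final shuffle places them in bytes 0 and 1 of the result.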
__m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(zero, px[0]), _mm_unpackhi_epi8(zero, px[0])); + px[0] = _mm_max_epi16(_mm_unpacklo_epi16(zero, px[0]), _mm_unpackhi_epi16(zero, px[0])); + px[0] = _mm_max_epi32(_mm_unpacklo_epi32(zero, px[0]), _mm_unpackhi_epi32(zero, px[0])); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +#endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index ff30de027..d03ec0e79 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -75,7 +75,7 @@ typedef union #define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name) -const __m128 xmm_p0 = _mm_set1_ps(0.0f); +const __m128 xmm_p0 = _mm_setzero_ps(); const __m128 xmm_p1 = _mm_set1_ps(1.0f); const __m128 xmm_p2 = _mm_set1_ps(2.0f); const __m128 xmm_pm2 = _mm_set1_ps(-2.0f); @@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray) printf("\n"); for (int ct = 0; ct < 32; ct++) { - printf("%d ", printArray[ct]); + 
printf("%d ", (unsigned char)printArray[ct]); } } @@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p) p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */ } +inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p) +{ + p[0] = _mm256_loadu_si256((__m256i *)srcPtrR); + p[1] = _mm256_loadu_si256((__m256i *)srcPtrG); + p[2] = _mm256_loadu_si256((__m256i *)srcPtrB); +} + +inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p) +{ + p[0] = _mm256_load_si256((__m256i *)srcPtrR); + p[1] = _mm256_load_si256((__m256i *)srcPtrG); + p[2] = _mm256_load_si256((__m256i *)srcPtrB); +} + inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p) { __m128 p128[8]; @@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p) _mm256_storeu_pd(dstPtr, p[0]); } +inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p) +{ + _mm_storeu_si128((__m128i *)dstPtr, p[0]); +} + +inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p) +{ + _mm_store_si128((__m128i *)dstPtr, p[0]); +} + inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p) { __m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index a7412aa2d..d9c0ce02d 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k } } +// Retrieve Min and Max given a datatype + +inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); } + /******************** DEVICE FUNCTIONS ********************/ // -------------------- Set 0 - Range checks and Range adjustment -------------------- @@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr, // /******************** DEVICE MATH HELPER FUNCTIONS ********************/ +// float8 min + +__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + +// float8 max + +__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + // d_float16 floor __device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16) diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index dae3e6236..32b8b62b5 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp 
@@ -26,5 +26,7 @@ SOFTWARE. #define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" #endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp new file mode 100644 index 000000000..0380f4ef6 --- /dev/null +++ b/src/modules/cpu/kernel/tensor_max.hpp @@ -0,0 +1,847 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u max = 0; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_setzero_si256(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < 
bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_32_host(&pMax, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_setzero_si256(); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_setzero_si128(); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + 
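+                // Fold the three 16-byte accumulators; the per-channel maxima land in bytes 0-2 of the result and are merged with the scalar-loop remainders below.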
reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + 
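+                    // Load 8 floats from each of the R, G and B planes and keep running per-channel vector maxima.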
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = 
srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max((Rpp32f)*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = (Rpp16f)max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max((Rpp32f)*srcPtrTempR++, maxR); + maxG = std::max((Rpp32f)*srcPtrTempG++, maxG); + maxB = std::max((Rpp32f)*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += 
srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); + +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max((Rpp32f)srcPtrTemp[0], maxR); + maxG = std::max((Rpp32f)srcPtrTemp[1], maxG); + maxB = std::max((Rpp32f)srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u 
vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s max = INT8_MIN; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_set1_epi8(INT8_MIN); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epi8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i32_host(&pMax, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_set1_epi8(INT8_MIN); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s maxC = INT8_MIN, maxR = 
INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_set1_epi8(INT8_MIN); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp new file mode 100644 index 000000000..15b9b77ba --- /dev/null +++ b/src/modules/cpu/kernel/tensor_min.hpp @@ -0,0 +1,845 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u min = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)255); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epu8(p1, pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_32_host(&pMin, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)255); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = 
std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)255); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_min_48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + + __m128i result; + reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + 
vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 
pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((Rpp32f)*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = 
std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = (Rpp16f) min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min((Rpp32f)*srcPtrTempR++, minR); + minG = std::min((Rpp32f)*srcPtrTempG++, minG); + minB = std::min((Rpp32f)*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + 
srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min((Rpp32f)srcPtrTemp[0], minR); + minG = std::min((Rpp32f)srcPtrTemp[1], minG); + minB = std::min((Rpp32f)srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s min = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)127); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epi8(p1, pMin); //compare and store min of 32 values into global min + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((*srcPtrTemp++), min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128i result; + reduce_min_i32_host(&pMin, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + 
srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)127); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_i96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)127); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_min_i48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index 328a232a1..c79e0a951 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -23,8 +23,9 @@ SOFTWARE. 
*/ #ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP -#define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" -#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp new file mode 100644 index 000000000..b47fce024 --- /dev/null +++ b/src/modules/hip/kernel/tensor_max.hpp @@ -0,0 +1,400 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local mmemory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]); + partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]); + partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMax_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMax_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMax_smem[0]; + dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0])); + } +} + +template +__global__ void tensor_max_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + 
partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_max_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + uint srcIdx = (id_z * srcStridesNH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if 
(hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after 
max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + + rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + 
hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_max(T *srcPtr, + RpptDescPtr srcDescPtr, + U *maxArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float minimum = bitDepthMinMax_f2.x; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git 
a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp new file mode 100644 index 000000000..a883c4f3b --- /dev/null +++ b/src/modules/hip/kernel/tensor_min.hpp @@ -0,0 +1,410 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]); + partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]); + partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMin_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMin_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMin_smem[0]; + dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0])); + } +} + +template +__global__ void tensor_min_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = 
(xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_min_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + + uint srcIdx = (id_z * srcStridesNH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3; + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = 
fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y 
dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + minArr[(hipBlockIdx_z * hipGridDim_y + 
hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_min(T *srcPtr, + RpptDescPtr srcDescPtr, + U *minArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle &handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float maximum = bitDepthMinMax_f2.y; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff 
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f17028e5e..28313a88f 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (minArrLength < srcDescPtr->n) // 1 min for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_min_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_min_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp16f*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_min_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_min_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +}
 + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (maxArrLength < srcDescPtr->n) // 1 max for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (maxArrLength < srcDescPtr->n * 4) // max of each channel, and max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_max_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_max_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp16f*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_max_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_max_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ @@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, return RPP_SUCCESS; } +
 +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMinArr, + Rpp32u imageMinArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMinArrLength < srcDescPtr->n) // min of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_min(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_min((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<half*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_min((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_min(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +}
 + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMaxArr, + Rpp32u imageMaxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMaxArrLength < srcDescPtr->n) // max of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_max(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_max((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<half*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_max((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast<Rpp32f*>(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_max(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(imageMaxArr), + roiTensorPtrSrc,
roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} #endif // backend diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 48537de29..7bd46b39e 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -68,9 +68,9 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); - bool reductionTypeCase = (testCase == 87); unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; @@ -323,23 +323,20 @@ int main(int argc, char **argv) double wallTime; string testCaseName; - // Initialize buffers for any reductionType functions + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - - if(reductionTypeCase) + if (reductionTypeCase) { - if(dstDescPtr->dataType == RpptDataType::U8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s))); + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + bitDepthByteSize = (testCase == 87) ? 
sizeof(Rpp64u) : sizeof(Rpp8u); + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); } - //Allocate hip memory for src/dst + // Allocate hip memory for src/dst CHECK(hipMalloc(&d_input, inputBufferSize)); CHECK(hipMalloc(&d_output, outputBufferSize)); if(dualInputCase) @@ -1044,6 +1041,30 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1071,33 +1092,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1105,7 +1134,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index cabc4015f..2e8054332 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -315,11 +315,11 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") - parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") - parser.add_argument('--case_list', nargs = "+", help 
= "List of case numbers to list", required = False) - parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False) + parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)") + parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)") + parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)") + parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False) + parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)") parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests") @@ -334,8 +334,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 90) or (args.case_end < 0 or args.case_end > 90)): + print("Starting case# and Ending case# must be in the 0:90 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -349,7 +349,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 90: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -376,8 +376,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 90: + print("The case# must be in the 0:90 range!") exit(0) return args @@ -458,8 +458,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -476,8 +476,8 @@ def rpp_test_suite_parser_and_validator(): else: if (testType == 1 and profilingOption == "NO"): for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -491,8 +491,8 @@ def rpp_test_suite_parser_and_validator(): NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -696,7 +696,7 @@ def rpp_test_suite_parser_and_validator(): f.close() # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index fd198ba23..b698a2def 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -68,11 +68,12 @@ int main(int argc, char **argv) bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); + unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; - bool reductionTypeCase = (testCase == 87); int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])}; string scriptPath = argv[19]; @@ -140,6 +141,11 @@ int main(int argc, char **argv) std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; exit(0); } + else if(testCase == 82 && batchSize < 2) + { + std::cerr<<"\n RICAP only works with BatchSize > 1"; + exit(0); + } // Get function name string funcName = augmentationMap[testCase]; @@ -310,6 +316,24 @@ int main(int argc, char **argv) input_second = static_cast(calloc(inputBufferSize, 1)); output = static_cast(calloc(outputBufferSize, 1)); + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) + void *reductionFuncResultArr; + Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; + if (reductionTypeCase) + { + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + { + bitDepthByteSize = (testCase == 87) ? 
sizeof(Rpp64u) : sizeof(Rpp8u); + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + { + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + } + // Set the number of threads to be used by OpenMP pragma for RPP batch processing on host. // If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size Rpp32u numThreads = 0; @@ -321,21 +345,6 @@ int main(int argc, char **argv) double cpuTime, wallTime; string testCaseName; - // Initialize buffers for any reductionType functions - void *reductionFuncResultArr; - Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - if(reductionTypeCase) - { - if(dstDescPtr->dataType == RpptDataType::U8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64s))); - } - // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) @@ -1050,6 +1059,40 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1082,33 +1125,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1116,7 +1167,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index b40a0bf01..b08c4d5e8 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -244,8 +244,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") + parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:89)") + parser.add_argument("--case_end", type = int, default = 89, help = 
"Testing range ending case # - (0:89)") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -263,8 +263,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(perfQaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)): + print("Starting case# and Ending case# must be in the 0:89 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -278,7 +278,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 89: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -302,8 +302,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 89: + print("The case# must be in the 0:89 range!") exit(0) return args @@ -381,8 +381,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -397,8 +397,8 @@ def rpp_test_suite_parser_and_validator(): create_layout_directories(dstPath, layoutDict) else: for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if qaMode == 1 and case != "82": @@ -412,7 +412,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md index 76ecd9551..067bedb1d 100644 --- a/utilities/test_suite/README.md +++ b/utilities/test_suite/README.md @@ -94,8 +94,8 @@ The image test suite can be executed under 2 backend scenarios - (HOST/HIP): The image test suite accepts the following command line arguments: - input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1 - input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2 -- case_start: The starting case number for the test range (0-87). Default is 0 -- case_end: The ending case number for the test range (0-87). Default is 87 +- case_start: The starting case number for the test range (0-89). Default is 0 +- case_end: The ending case number for the test range (0-89). Default is 89 - test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0 - case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type - profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend @@ -121,7 +121,7 @@ python runTests.py --input_path1 --input_path2 --cas - QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. Note: QA mode is not supported for case 84 due to run-to-run variation of outputs. 
``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 1 --batch_size 3 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3 ``` - QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement ``` python @@ -131,13 +131,13 @@ python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 0 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0 ``` - Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants. Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 1 +python runTests.py --case_start 0 --case_end 89 --test_type 1 ``` To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index 2bc914af9..58fee0c5d 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -99,11 +99,27 @@ std::map augmentationMap = {84, "spatter"}, {85, "swap_channels"}, {86, "color_to_greyscale"}, - {87, "tensor_sum"} + {87, "tensor_sum"}, + {88, "tensor_min"}, + {89, "tensor_max"}, +}; + +// Golden outputs for Tensor min Kernel +std::map> TensorMinReferenceOutputs = +{ + {1, {1, 1, 7}}, + {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}} +}; + +// Golden outputs for Tensor max Kernel +std::map> TensorMaxReferenceOutputs = +{ + {1, {239, 245, 255}}, + {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}} }; // Golden outputs for Tensor sum Kernel -std::map> TensorSumReferenceOutputs = +std::map> TensorSumReferenceOutputs = { {1, {334225, 813471, 2631125}}, {3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}} @@ -1118,11 +1134,19 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src int matched_values = 0; T *refOutput; + refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T)); int numChannels = (srcDescPtr->c == 1) ? 1 : 3; int numOutputs = (srcDescPtr->c == 1) ? 
srcDescPtr->n : srcDescPtr->n * 4; - std::vector ref; - if(testCase == 87) - refOutput = TensorSumReferenceOutputs[numChannels].data(); + std::vector ref; + if(testCase == 88) + ref = TensorMinReferenceOutputs[numChannels]; + else if(testCase == 89) + ref = TensorMaxReferenceOutputs[numChannels]; + else if(testCase == 87) + ref = TensorSumReferenceOutputs[numChannels]; + + for (int i = 0; i < numOutputs; i++) + refOutput[i] = (T)ref[i]; if(srcDescPtr->c == 1) { @@ -1148,6 +1172,7 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src fileMatch++; } } + free(refOutput); std::cout << std::endl << "Results for " << func << " :" << std::endl; std::string status = func + ": "; @@ -1172,6 +1197,14 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src } } +// print array of any bit depth for specified length +template +inline void print_array(T *src, Rpp32u length, Rpp32u precision) +{ + for (int i = 0; i < length; i++) + std::cout << " " << std::fixed << std::setprecision(precision) << static_cast(src[i]) << " "; +} + // Used to randomly swap values present in array of size n inline void randomize(unsigned int arr[], unsigned int n) {