diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7be3d2fd4..916a0a0ad 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,8 +1,8 @@ -# Documentation files -docs/* @saadrahim @LisaDelaney -*.md @saadrahim @LisaDelaney -*.rst @saadrahim @LisaDelaney -# Header directory -library/include/* @saadrahim @LisaDelaney @kiritigowda @rrawther # Source code @kiritigowda @rrawther +# Documentation files +docs/* @ROCm/rocm-documentation +*.md @ROCm/rocm-documentation +*.rst @ROCm/rocm-documentation +# Header directory +library/include/* @ROCm/rocm-documentation @kiritigowda @rrawther diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 663d3c085..0d7834e2b 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -47,7 +47,7 @@ ci: { def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) - def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['gfx908'], ubuntu20:['gfx906'], centos8:['gfx908']])] + def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['gfx90a'], ubuntu22:['gfx1101'], sles15sp1:['gfx908'], rhel8:['gfx1030'], rhel9:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each diff --git a/CMakeLists.txt b/CMakeLists.txt index 34ce6fcac..224125b36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -346,6 +346,7 @@ install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h # install Test install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_suite/ DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/rpp-performancetests DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) # set license information set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 0c9b63672..8ecbd3663 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==0.33.0 +rocm-docs-core[api_reference]==0.35.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index f7bc7e2c1..ea1c7619a 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -112,7 +112,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==0.33.0 +rocm-docs-core[api-reference]==0.35.0 # via # -r requirements.in # rocm-docs-core diff --git a/include/rppdefs.h b/include/rppdefs.h index 2beafbc0c..b0baf7d34 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -116,8 +116,8 @@ typedef enum RPP_ERROR_NOT_ENOUGH_MEMORY = -16, /*! \brief Out of bound source ROI \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17, - /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ - RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18 + /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_CHANNELS = -18 } RppStatus; /*! 
\brief RPP rppStatus_t type enums diff --git a/include/rppi_arithmetic_operations.h b/include/rppi_arithmetic_operations.h index 0fb79dbf6..17aef722d 100644 --- a/include/rppi_arithmetic_operations.h +++ b/include/rppi_arithmetic_operations.h @@ -320,4 +320,4 @@ RppStatus rppi_tensor_multiply_u8_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RppPtr } #endif -#endif \ No newline at end of file +#endif diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index 0a247f886..51705eefc 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -30,7 +30,7 @@ SOFTWARE. * \brief RPPT Tensor Arithmetic operation Functions. * * \defgroup group_tensor_arithmetic Operations: AMD RPP Tensor Arithmetic Operations - * \brief Tensor Color Augmentations. + * \brief Tensor Arithmetic Operations. */ #include "rpp.h" @@ -39,53 +39,221 @@ SOFTWARE. extern "C" { #endif -/*! \brief Fmadd augmentation HOST +/*! + * \file + * \brief RPPT Tensor Operations - Arithmetic Operations. + * \defgroup group_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations. + * \brief RPPT Tensor Operations - Arithmetic Operations. + */ + +/*! \addtogroup group_rppt_tensor_arithmetic_operations + * @{ + */ + +/*! \brief Fused multiply add scalar augmentation on HOST backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HOST memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Host-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); - -/*! \brief Fmadd augmentation GPU +#ifdef GPU_SUPPORT +/*! \brief Fused multiply add scalar augmentation on HIP backend * \details This function performs the fmadd operation on a batch of 4D tensors. * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. 
- * \param [in] srcPtr source tensor memory + * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor memory + * \param[out] dstPtr destination tensor in HIP memory * \param[in] dstGenericDescPtr destination tensor descriptor * \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values) * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) - * \param [in] rppHandle Hip-handle - * \return RppStatus enum. - * \returns RPP_SUCCESS \ref RppStatus on successful completion. - * Else return RPP_ERROR - * \ingroup group_tensor_arithmetic + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Add scalar augmentation on HOST backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values for used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. */ +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT -RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +/*! \brief Add scalar augmentation on HIP backend + * \details This function performs the addition operation on a batch of 4D tensors. + * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. + * Support added for f32 -> f32 dataype. 
+ * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] addTensor add values for used for addition (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Subtract scalar augmentation on HOST backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values for used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Subtract scalar augmentation on HIP backend + * \details This function performs the subtraction operation on a batch of 4D tensors. + * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] subtractTensor subtract values for used for subtraction (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. 
+ */ +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Multiply scalar augmentation on HOST backend + * \details This function performs the multiplication operation on a batch of 4D tensors. + * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HOST memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] mulTensor multiplier values for used for multiplication (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Multiply scalar augmentation on HIP backend + * \details This function performs the multiplication operation on a batch of 4D tensors. + * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. + * Support added for f32 -> f32 dataype. + * \param [in] srcPtr source tensor in HIP memory + * \param[in] srcGenericDescPtr source tensor descriptor + * \param[out] dstPtr destination tensor in HIP memory + * \param[in] dstGenericDescPtr destination tensor descriptor + * \param[in] mulTensor multiplier values for used for multiplication (1D tensor of batchSize Rpp32f values) + * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Magnitude computation on HOST backend for a NCHW/NHWC layout tensor + * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
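[Editor's note] The scalar arithmetic entry points above (rppt_fused_multiply_add_scalar, rppt_add_scalar, rppt_subtract_scalar and rppt_multiply_scalar, in HOST and HIP variants) share one calling pattern. Below is a minimal HOST-side sketch for the fused multiply-add variant, for orientation only: the tensor sizes and scalar values are invented, the generic-descriptor field names (numDims, offsetInBytes, dataType, dims, strides) follow rppdefs.h and should be checked against your RPP version, and the exact rppCreateWithBatchSize()/rppDestroyHost() signatures may differ between releases. The add/subtract/multiply variants take a single per-image scalar tensor instead of the mul/add pair.

#include "rpp.h"
#include <vector>

void fmadd_scalar_host_example()
{
    // Illustrative 5D NCDHW f32 batch: 2 volumes of 1 x 4 x 8 x 8 voxels.
    Rpp32u batchSize = 2, channels = 1, depth = 4, height = 8, width = 8;
    Rpp32u bufferSize = batchSize * channels * depth * height * width;

    RpptGenericDesc desc;
    desc.numDims = 5;                                // N, C, D, H, W
    desc.offsetInBytes = 0;
    desc.dataType = RpptDataType::F32;
    desc.layout = RpptLayout::NCDHW;
    desc.dims[0] = batchSize; desc.dims[1] = channels; desc.dims[2] = depth;
    desc.dims[3] = height;    desc.dims[4] = width;
    desc.strides[4] = 1;                             // packed NCDHW strides
    desc.strides[3] = width;
    desc.strides[2] = width * height;
    desc.strides[1] = width * height * depth;
    desc.strides[0] = width * height * depth * channels;
    RpptGenericDesc srcDesc = desc, dstDesc = desc;  // same shape and layout for src and dst

    std::vector<Rpp32f> src(bufferSize, 1.0f), dst(bufferSize);
    std::vector<Rpp32f> mul(batchSize, 2.0f), add(batchSize, 0.5f);   // one scalar pair per volume

    // Full-volume ROI per image, in XYZWHD form (same initialization style as the host kernels).
    RpptROI3D fullRoi = {0, 0, 0, (Rpp32s)width, (Rpp32s)height, (Rpp32s)depth};
    std::vector<RpptROI3D> roi(batchSize, fullRoi);

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, batchSize);      // newer RPP versions may also take a thread count

    RppStatus status = rppt_fused_multiply_add_scalar_host(src.data(), &srcDesc,
                                                           dst.data(), &dstDesc,
                                                           mul.data(), add.data(),
                                                           roi.data(), RpptRoi3DType::XYZWHD,
                                                           handle);
    (void)status;                                    // expected: dst[i] = src[i] * 2.0f + 0.5f
    rppDestroyHost(handle);                          // teardown; verify the exact name in rpp.h
}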
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html magnitude_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Magnitude computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr.
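[Editor's note] For the rppt_magnitude_host/rppt_magnitude_gpu pair documented here, a minimal HOST-side sketch follows, for illustration only: the image sizes are invented, the RpptDesc and RpptStrides member names follow rppdefs.h and should be verified for your RPP version, and the handle is assumed to be created as in the earlier example.

#include "rpp.h"
#include <vector>

// Sketch only: handle is assumed to be an RPP HOST handle created with rppCreateWithBatchSize().
void magnitude_host_example(rppHandle_t handle)
{
    Rpp32u n = 2, h = 150, w = 150, c = 3;           // illustrative NHWC U8 batch

    RpptDesc srcDesc, dstDesc;
    srcDesc.numDims = 4;
    srcDesc.offsetInBytes = 0;
    srcDesc.dataType = RpptDataType::U8;
    srcDesc.layout = RpptLayout::NHWC;
    srcDesc.n = n; srcDesc.h = h; srcDesc.w = w; srcDesc.c = c;
    srcDesc.strides.nStride = h * w * c;             // packed NHWC strides
    srcDesc.strides.hStride = w * c;
    srcDesc.strides.wStride = c;
    srcDesc.strides.cStride = 1;
    dstDesc = srcDesc;                               // same type and layout as src, per the restrictions above

    std::vector<Rpp8u> src1(n * h * w * c), src2(n * h * w * c), dst(n * h * w * c);

    RpptROI fullRoi;
    fullRoi.xywhROI = {{0, 0}, (Rpp32s)w, (Rpp32s)h};   // full-image ROI in XYWH form
    std::vector<RpptROI> roi(n, fullRoi);

    rppt_magnitude_host(src1.data(), src2.data(), &srcDesc,
                        dst.data(), &dstDesc,
                        roi.data(), RpptRoiType::XYWH, handle);
    // Each output pixel is the magnitude of the corresponding src1/src2 pair,
    // kept at the same bit depth as the inputs.
}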
+ * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html magnitude_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! @} + */ + #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H +#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H \ No newline at end of file diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index 138b3baa8..31bb34eff 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -95,7 +95,22 @@ RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ */ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, Rpp32f *coeffTensor, RpptAudioBorderType borderType, rppHandle_t rppHandle); +/*! \brief Down Mixing augmentation on HOST backend +* \details Down Mixing augmentation for audio data +* \param[in] srcPtr source tensor in HOST memory +* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param[out] dstPtr destination tensor in HOST memory +* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \return A \ref RppStatus enumeration. +* \retval RPP_SUCCESS Successful completion. +* \retval RPP_ERROR* Unsuccessful completion. 
+*/ +RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); + #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h index deabd885d..99909cb42 100644 --- a/include/rppt_tensor_color_augmentations.h +++ b/include/rppt_tensor_color_augmentations.h @@ -417,6 +417,48 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RppPtr_t lutPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Color Temperature augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The color temperature augmentation does a image temperature adjustment operation, taking a pixel adjustment value as argument for each image in a batch of RGB(3 channel) with an NHWC/NCHW tensor layout.
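[Editor's note] For rppt_down_mixing_host, declared above in rppt_tensor_audio_augmentations.h, a short HOST-side sketch follows. It is illustrative only: the clip lengths and channel counts are invented, and srcPtr, dstPtr, srcDescPtr, dstDescPtr and handle are assumed to describe an F32 audio batch prepared the same way as for the other audio augmentations in that header.

// Sketch only: srcPtr/dstPtr point to padded F32 audio buffers for a batch of 2 clips,
// described by srcDescPtr/dstDescPtr (numDims = 3, dataType = F32); handle is an RPP HOST handle.
void down_mixing_host_example(Rpp32f *srcPtr, RpptDescPtr srcDescPtr,
                              Rpp32f *dstPtr, RpptDescPtr dstDescPtr,
                              rppHandle_t handle)
{
    // One (length, channels) pair per clip in the batch: batchSize * 2 values.
    Rpp32s srcDims[4] = {44100, 2,      // clip 0: 44100 samples, stereo
                         22050, 2};     // clip 1: 22050 samples, stereo
    bool normalizeWeights = false;      // see the normalizeWeights description above

    rppt_down_mixing_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                          srcDims, normalizeWeights, handle);
}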
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The color temperature augmentation does a image temperature adjustment operation, taking a pixel adjustment value as argument for each image in a batch of RGB(3 channel) with an NHWC/NCHW tensor layout.
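[Editor's note] A short HOST-side sketch for rppt_color_temperature_host, declared above, purely for illustration: the adjustment values are arbitrary (within the documented -100 to 100 range), and the buffers, descriptors and handle are assumed to describe a 3-channel U8 NHWC batch set up as in the other color augmentations. Note that, per the declarations, the HIP variant takes Rpp32s adjustment values while the HOST variant takes Rpp8s.

#include "rpp.h"
#include <vector>

// Sketch only: src/dst and their descriptors are assumed to describe a 3-channel U8 NHWC batch.
void color_temperature_host_example(Rpp8u *srcPtr, RpptDescPtr srcDescPtr,
                                    Rpp8u *dstPtr, RpptDescPtr dstDescPtr,
                                    rppHandle_t handle)
{
    Rpp32u batchSize = srcDescPtr->n;
    std::vector<Rpp8s> adjustment(batchSize, 0);
    adjustment[0] = 70;                  // positive values raise R and lower B (warmer), per the SIMD helpers below
    if (batchSize > 1)
        adjustment[1] = -70;             // negative values shift toward cooler tones

    RpptROI fullRoi;
    fullRoi.xywhROI = {{0, 0}, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
    std::vector<RpptROI> roi(batchSize, fullRoi);

    rppt_color_temperature_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                adjustment.data(), roi.data(), RpptRoiType::XYWH, handle);
}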
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 181b1c565..3cb49a82b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -24,6 +24,7 @@ SOFTWARE. #ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H #define RPPT_TENSOR_STATISTICAL_OPERATIONS_H + #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus @@ -77,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HOST memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] minArr destination array in HIP memory + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, Rpp32u imageMinArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HOST memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor + * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
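[Editor's note] The new rppt_tensor_min/rppt_tensor_max reductions return their results through a caller-provided array rather than a destination tensor. A HOST-side sketch follows, illustrative only: it assumes a U8 batch whose descriptor, ROI array and handle are set up as in the earlier examples, and the per-image output ordering (R, G, B, overall for 3-channel inputs) is inferred from the channel-wise reduction helpers added in rpp_cpu_common.hpp.

#include "rpp.h"
#include <vector>

// Sketch only: srcPtr/srcDescPtr describe a U8 batch, roi holds one ROI per image.
void tensor_min_max_host_example(Rpp8u *srcPtr, RpptDescPtr srcDescPtr,
                                 RpptROIPtr roi, rppHandle_t handle)
{
    Rpp32u batchSize = srcDescPtr->n;
    // Per the restrictions above: n values for single-channel inputs, n * 4 for 3-channel inputs.
    Rpp32u outLength = (srcDescPtr->c == 1) ? batchSize : batchSize * 4;

    std::vector<Rpp8u> minArr(outLength), maxArr(outLength);   // same bit depth as the source tensor

    rppt_tensor_min_host(srcPtr, srcDescPtr, minArr.data(), outLength,
                         roi, RpptRoiType::XYWH, handle);
    rppt_tensor_max_host(srcPtr, srcDescPtr, maxArr.data(), outLength,
                         roi, RpptRoiType::XYWH, handle);
    // For c == 3, entries 4*i .. 4*i+3 are expected to hold the R, G, B and overall
    // extrema of image i; for c == 1 each image contributes a single value.
}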
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] maxArr destination array in HIP memory + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 67c34de70..1e748cc86 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -2431,6 +2431,24 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP // Compute Functions for RPP Tensor API +inline void compute_multiply_16_host(__m256 *p, __m256 *pMulParam) +{ + p[0] = _mm256_mul_ps(p[0], pMulParam[0]); // multiply adjustment + p[1] = _mm256_mul_ps(p[1], pMulParam[0]); // multiply adjustment +} + +inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam) +{ + p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]); // subtract adjustment + p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]); // subtract adjustment +} + +inline void compute_add_16_host(__m256 *p, __m256 *pAddParam) +{ + p[0] = _mm256_add_ps(p[0], pAddParam[0]); // add adjustment + p[1] = _mm256_add_ps(p[1], pAddParam[0]); // add adjustment +} + inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams) { p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]); @@ -3032,6 +3050,22 @@ inline void compute_color_cast_12_host(__m128 *p, __m128 pMul, __m128 *pAdd) p[2] = _mm_fmadd_ps(_mm_sub_ps(p[2], pAdd[2]), pMul, pAdd[2]); // color_cast adjustment Rs } +inline void compute_color_temperature_48_host(__m256 *p, __m256 pAdj) +{ + p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs + p[1] = _mm256_add_ps(p[1], pAdj); // color_temperature adjustment Rs + // no color_temperature adjustment Gs + p[4] = _mm256_sub_ps(p[4], pAdj); // color_temperature adjustment Bs + p[5] = _mm256_sub_ps(p[5], pAdj); // color_temperature adjustment Bs +} + +inline void compute_color_temperature_24_host(__m256 *p, __m256 pAdj) +{ + p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs + // no color_temperature adjustment Gs + p[2] = _mm256_sub_ps(p[2], pAdj); // color_temperature adjustment Bs +} + inline void compute_xywh_from_ltrb_host(RpptROIPtr roiPtrInput, RpptROIPtr roiPtrImage) { 
roiPtrImage->xywhROI.xy.x = roiPtrInput->ltrbROI.lt.x; @@ -5962,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4 } -#endif //RPP_CPU_COMMON_H \ No newline at end of file +inline void reduce_min_32_host(__m256i *pMin, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm256_castsi256_si128(pMin[0]); + px[1] = _mm256_extracti128_si256(pMin[0], 1); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi8(zero, px[0]); + px[0] = _mm_unpackhi_epi8(zero, px[0]); + px[0] = _mm_min_epu8(px[0], px[1]); + px[1] = _mm_unpacklo_epi16(zero, px[0]); + px[0] = _mm_unpackhi_epi16(zero, px[0]); + px[0] = _mm_min_epu16(px[0], px[1]); + px[1] = _mm_unpacklo_epi32(zero, px[0]); + px[0] = _mm_unpackhi_epi32(zero, px[0]); + px[0] = _mm_min_epu32(px[0], px[1]); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_32_host(__m256i *pMax, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = 
_mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_float8_host(__m256 *p1, __m256 *pMin) +{ + pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min +} + +inline void reduce_min_float8_host(__m256 *pMin, __m128 *result) +{ + __m128 px; + px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1)); + px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB) +{ + pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min + pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min + pMinB[0] = 
_mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min +} + +inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // TO CHANGE +{ + __m128 px[2]; + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1)); + px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1)); + px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1)); + px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void compute_max_float8_host(__m256 *p1, __m256 *pMax) +{ + pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global min +} + +inline void reduce_max_float8_host(__m256 *pMax, __m128 *result) +{ + __m128 px; + px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1)); + px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px)); + result[0] = _mm_shuffle_ps(px, px, 39); +} + +inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB) +{ + pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global min + pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global min + pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global min +} + +inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result) +{ + __m128 px[2]; + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1)); + px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1)); + px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1])); + px[0] = _mm_permute_ps(px[0], 0b11011000); + result[0] = _mm256_castps128_ps256(px[0]); + px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1)); + px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0)); + px[0] = _mm_shuffle_ps(px[1], px[1], 34); + result[0] = _mm256_insertf128_ps(result[0], px[0], 1); +} + +inline void reduce_min_i32_host(__m256i *pMin, __m128i *result) +{ + __m128i px; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1)); + px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px)); + px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px)); + px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px)); + result[0] = _mm_shuffle_epi8(px, mask); +} + +inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB) +{ + pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min + pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min + pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min +} + +inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i 
*result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1)); + px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1)); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB) +{ + pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min + pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min + pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min +} + +inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero)); + px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void reduce_max_i32_host(__m256i *pMax, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(zero, px[0]), _mm_unpackhi_epi8(zero, px[0])); + px[0] = _mm_max_epi16(_mm_unpacklo_epi16(zero, px[0]), _mm_unpackhi_epi16(zero, px[0])); + px[0] = _mm_max_epi32(_mm_unpacklo_epi32(zero, px[0]), _mm_unpackhi_epi32(zero, px[0])); + result[0] = _mm_shuffle_epi8(px[0], mask); +} + +inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB) +{ + pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max + pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max + pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max +} + +inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) +{ + __m128i px[4]; + __m128i zero = _mm_setzero_si128(); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1)); + px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1])); + px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1)); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), 
_mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB) +{ + pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max + pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max + pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max +} + +inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) +{ + __m128i px[2]; + __m128i zero = _mm_setzero_si128(); + px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero)); + px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0])); + px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero)); + result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero)); +} + +#endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 84c898b90..d03ec0e79 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -75,7 +75,7 @@ typedef union #define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name) -const __m128 xmm_p0 = _mm_set1_ps(0.0f); +const __m128 xmm_p0 = _mm_setzero_ps(); const __m128 xmm_p1 = _mm_set1_ps(1.0f); const __m128 xmm_p2 = _mm_set1_ps(2.0f); const __m128 xmm_pm2 = _mm_set1_ps(-2.0f); @@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray) printf("\n"); for (int ct = 0; ct < 32; ct++) { - printf("%d ", printArray[ct]); + printf("%d ", (unsigned char)printArray[ct]); } } @@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p) p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */ } +inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p) +{ + p[0] = _mm256_loadu_si256((__m256i *)srcPtrR); + p[1] = _mm256_loadu_si256((__m256i *)srcPtrG); + p[2] = _mm256_loadu_si256((__m256i *)srcPtrB); +} + +inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p) +{ + p[0] = _mm256_load_si256((__m256i *)srcPtrR); + p[1] = _mm256_load_si256((__m256i *)srcPtrG); + p[2] = _mm256_load_si256((__m256i *)srcPtrB); +} + inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p) { __m128 p128[8]; @@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p) _mm256_storeu_pd(dstPtr, p[0]); } +inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p) +{ + _mm_storeu_si128((__m128i *)dstPtr, p[0]); +} + +inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p) +{ + _mm_store_si128((__m128i *)dstPtr, p[0]); +} + inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p) { __m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); @@ -2438,6 +2462,29 @@ static inline __m128 log_ps(__m128 x) return x; } +inline Rpp32f rpp_hsum_ps(__m128 x) +{ + __m128 shuf = _mm_movehdup_ps(x); // broadcast elements 3,1 to 2,0 + __m128 sums = _mm_add_ps(x, shuf); + shuf = _mm_movehl_ps(shuf, sums); // high half -> low 
half + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); +} + +inline Rpp32f rpp_hsum_ps(__m256 x) +{ + __m128 p0 = _mm256_extractf128_ps(x, 1); // Contains x7, x6, x5, x4 + __m128 p1 = _mm256_castps256_ps128(x); // Contains x3, x2, x1, x0 + __m128 sum = _mm_add_ps(p0, p1); // Contains x3 + x7, x2 + x6, x1 + x5, x0 + x4 + p0 = sum; // Contains -, -, x1 + x5, x0 + x4 + p1 = _mm_movehl_ps(sum, sum); // Contains -, -, x3 + x7, x2 + x6 + sum = _mm_add_ps(p0, p1); // Contains -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 + p0 = sum; // Contains -, -, -, x0 + x2 + x4 + x6 + p1 = _mm_shuffle_ps(sum, sum, 0x1); // Contains -, -, -, x1 + x3 + x5 + x7 + sum = _mm_add_ss(p0, p1); // Contains -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + return _mm_cvtss_f32(sum); +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index a7412aa2d..d9c0ce02d 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k } } +// Retrieve Min and Max given a datatype + +inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); } +inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); } + /******************** DEVICE FUNCTIONS ********************/ // -------------------- Set 0 - Range checks and Range adjustment -------------------- @@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr, // /******************** DEVICE MATH HELPER FUNCTIONS ********************/ +// float8 min + +__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + +// float8 max + +__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr) +{ + *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]); +} + // d_float16 floor __device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16) diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index 96553489d..b98145be0 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -26,5 +26,9 @@ SOFTWARE. 
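[Editor's note] The two rpp_hsum_ps overloads added to rpp_cpu_simd.hpp above reduce an SSE or AVX register to a single horizontal sum. A small standalone sanity check, for illustration only; it assumes the helpers are visible, for example by including rpp_cpu_simd.hpp.

#include <immintrin.h>
#include <cassert>
// #include "rpp_cpu_simd.hpp"   // brings in the rpp_hsum_ps overloads shown above

void hsum_sanity_check()
{
    float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
    __m128 v4 = _mm_loadu_ps(data);                  // lanes holding 1..4
    __m256 v8 = _mm256_loadu_ps(data);               // lanes holding 1..8
    assert(rpp_hsum_ps(v4) == 10.0f);                // 1 + 2 + 3 + 4
    assert(rpp_hsum_ps(v8) == 36.0f);                // 1 + 2 + ... + 8
}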
#define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" +#include "kernel/multiply_scalar.hpp" +#include "kernel/magnitude.hpp" -#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp index 7737b38c3..e2edb1afc 100644 --- a/src/modules/cpu/host_tensor_audio_augmentations.hpp +++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp @@ -28,5 +28,6 @@ SOFTWARE. #include "kernel/non_silent_region_detection.hpp" #include "kernel/to_decibels.hpp" #include "kernel/pre_emphasis_filter.hpp" +#include "kernel/down_mixing.hpp" #endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/host_tensor_color_augmentations.hpp b/src/modules/cpu/host_tensor_color_augmentations.hpp index 19e0b471c..aba3b8158 100644 --- a/src/modules/cpu/host_tensor_color_augmentations.hpp +++ b/src/modules/cpu/host_tensor_color_augmentations.hpp @@ -34,5 +34,6 @@ SOFTWARE. #include "kernel/exposure.hpp" #include "kernel/contrast.hpp" #include "kernel/lut.hpp" +#include "kernel/color_temperature.hpp" #endif // HOST_TENSOR_COLOR_AUGMENTATIONS_HPP diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index dae3e6236..32b8b62b5 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -26,5 +26,7 @@ SOFTWARE. #define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" #endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/add_scalar.hpp b/src/modules/cpu/kernel/add_scalar.hpp new file mode 100644 index 000000000..d0179d4e1 --- /dev/null +++ b/src/modules/cpu/kernel/add_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
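Reviewer note on src/modules/cpu/kernel/add_scalar.hpp (added below): inside the validated 3D ROI the kernel adds the per-image scalar addTensor[batchCount] to every element, processing 16 floats per AVX2 iteration and finishing each row with a scalar tail. A rough single-row equivalent of that inner loop, as an illustrative sketch only (it ignores the rpp_simd_load/rpp_simd_store wrappers and all descriptor/stride handling; the helper name is made up):

    #include <immintrin.h>

    // Sketch: add one scalar to a row of 'width' floats, 8 at a time, then a scalar tail.
    static inline void add_scalar_row_sketch(const float *src, float *dst, int width, float addParam)
    {
        __m256 pAdd = _mm256_set1_ps(addParam);
        int x = 0;
        for (; x + 8 <= width; x += 8)
            _mm256_storeu_ps(dst + x, _mm256_add_ps(_mm256_loadu_ps(src + x), pAdd));
        for (; x < width; x++)
            dst[x] = src[x] + addParam;    // same math as the vector path
    }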
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f addParam = addTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pAddParam = _mm256_set1_ps(addParam); + + // Add without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Add without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * 
srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_add_16_host(p, &pAddParam); // add adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ + addParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp new file mode 100644 index 000000000..1358ac800 --- /dev/null +++ b/src/modules/cpu/kernel/color_temperature.hpp @@ -0,0 +1,1035 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
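Reviewer note on src/modules/cpu/kernel/color_temperature.hpp (added below): the adjustment is a per-pixel channel offset, R + adjustment and B - adjustment with G left untouched, applied 48 u8 (or 24 f32) values per AVX2 iteration through compute_color_temperature_48/24_host; the f32/f16 variants pre-scale the adjustment by ONE_OVER_255 so it matches normalized pixel values. A scalar sketch of the per-pixel u8 math, for illustration only (helper and clamp names are made up):

    // Sketch: color-temperature shift for one u8 RGB pixel, clamped to [0, 255] like RPPPIXELCHECK.
    static inline void color_temperature_pixel_sketch(const unsigned char in[3], unsigned char out[3], int adjustment)
    {
        auto clamp255 = [](int v) { return (v < 0) ? 0 : ((v > 255) ? 255 : v); };
        out[0] = static_cast<unsigned char>(clamp255(in[0] + adjustment));    // warm/cool the red channel
        out[1] = in[1];                                                       // green is unchanged
        out[2] = static_cast<unsigned char>(clamp255(in[2] - adjustment));    // opposite shift on blue
    }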
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount]; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + 
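// For this row: the AVX2 loop below pulls 16 pixels from each of the R, G and B planes per iteration,
+                // applies R + adj / G unchanged / B - adj in f32, and stores them interleaved (pkd3);
+                // the scalar tail then repeats the same per-pixel math for the leftover columns.
+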
srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG); + dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 48; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); 
// simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG); + *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255; + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 24; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = 
RPPPIXELCHECKF32(srcPtrTemp[1]); + *dstPtrTempB++ = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = RPPPIXELCHECKF32(*srcPtrTempG); + dstPtrTemp[2] = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 24; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = RPPPIXELCHECKF32(srcPtrTemp[1]); + dstPtrTemp[2] = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + 
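// 24 f32 values per AVX2 iteration = 8 pixels from each of the R, G and B planes;
+            // adjustmentValue was pre-scaled by ONE_OVER_255 above, so the same offset applies to normalized float pixels.
+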
Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = RPPPIXELCHECKF32(*srcPtrTempG); + *dstPtrTempB++ = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += srcDescPtr->strides.hStride; + dstPtrRowG += srcDescPtr->strides.hStride; + dstPtrRowB += srcDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255; + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + Rpp32u vectorIncrement = 24; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + Rpp16f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + 
dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores + + for(int cnt = 0; cnt < 8; cnt++) + { + dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + + srcPtrTemp += 24; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + Rpp32f dstPtrTemp_ps[25]; + + for(int cnt = 0; cnt < 8; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores + + for(int cnt = 0; cnt < 24; cnt++) + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG); + dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + 
srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 24) + { + Rpp32f srcPtrTemp_ps[24], dstPtrTemp_ps[25]; + + for(int cnt = 0; cnt < 24; cnt++) + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores + + for(int cnt = 0; cnt < 24; cnt++) + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + + srcPtrTemp += 24; + dstPtrTemp += 24; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 24) * 24; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + + for(int cnt = 0; cnt < 8; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + + __m256 p[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads + compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores + + for(int cnt = 0; cnt < 8; cnt++) + { + dstPtrTempR[cnt] = 
(Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + + srcPtrTempR += 8; + srcPtrTempG += 8; + srcPtrTempB += 8; + dstPtrTempR += 8; + dstPtrTempG += 8; + dstPtrTempB += 8; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG); + *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += srcDescPtr->strides.hStride; + dstPtrRowG += srcDescPtr->strides.hStride; + dstPtrRowB += srcDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(dstDescPtr->n) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f adjustmentValue = adjustmentValueTensor[batchCount]; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m256 pAdj = _mm256_set1_ps(adjustmentValue); + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Color Temperature with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue); + *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]); + *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - 
adjustmentValue); + + srcPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue); + dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG); + dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue); + + dstPtrTemp += 3; + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 48) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + + srcPtrTemp += 48; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue); + dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]); + dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - adjustmentValue); + + srcPtrTemp += 3; + dstPtrTemp += 3; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Color Temperature with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = (bufferLength / 48) * 48; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, 
*dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads + compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue); + *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG); + *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue); + + srcPtrTempR++; + srcPtrTempG++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/down_mixing.hpp b/src/modules/cpu/kernel/down_mixing.hpp new file mode 100644 index 000000000..9cefc64a2 --- /dev/null +++ b/src/modules/cpu/kernel/down_mixing.hpp @@ -0,0 +1,122 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
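Reviewer note on src/modules/cpu/kernel/down_mixing.hpp (added below): single-channel input is passed through with memcpy; for multi-channel input each mono output sample is the weighted sum of all channels, using equal weights of 1/channels (optionally re-normalized), accumulated 4 (SSE) or 8 (AVX2) channels at a time and reduced with the new rpp_hsum_ps helpers. A scalar sketch of the per-sample math, illustrative only (the function name is made up):

    // Sketch: down-mix one interleaved frame of 'channels' samples to mono with equal weights.
    static inline float downmix_frame_sketch(const float *frame, int channels)
    {
        float weight = 1.0f / channels;
        float acc = 0.0f;
        for (int c = 0; c < channels; c++)
            acc += frame[c] * weight;    // what the SIMD accumulate + rpp_hsum_ps reduction computes
        return acc;
    }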
+*/ + +#include "rppdefs.h" +#include + +RppStatus down_mixing_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32s samples = srcDimsTensor[batchCount * 2]; + Rpp32s channels = srcDimsTensor[batchCount * 2 + 1]; + bool flagAVX = 0; + + if(channels == 1) + { + // No need of downmixing, do a direct memcpy + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(samples * sizeof(Rpp32f))); + } + else + { + Rpp32f *weights = handle.GetInitHandle()->mem.mcpu.tempFloatmem + batchCount * channels; + std::fill(weights, weights + channels, 1.f / channels); + + if(normalizeWeights) + { + // Compute sum of the weights + Rpp32f sum = 0.0; + for(int i = 0; i < channels; i++) + sum += weights[i]; + + // Normalize the weights + Rpp32f invSum = 1.0 / sum; + for(int i = 0; i < channels; i++) + weights[i] *= invSum; + } + + Rpp32s channelIncrement = 4; + Rpp32s alignedChannels = (channels / 4) * 4; + if(channels > 7) + { + flagAVX = 1; + channelIncrement = 8; + alignedChannels = (channels / 8) * 8; + } + + // use weights to downmix to mono + for(int64_t dstIdx = 0; dstIdx < samples; dstIdx++) + { + Rpp32s channelLoopCount = 0; + // if number of channels are greater than or equal to 8, use AVX implementation + if(flagAVX) + { + __m256 pDst = avx_p0; + for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement) + { + __m256 pSrc, pWeights; + pWeights = _mm256_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3], + weights[channelLoopCount + 4], weights[channelLoopCount + 5], weights[channelLoopCount + 6], weights[channelLoopCount + 7]); + pSrc = _mm256_loadu_ps(srcPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeights); + pDst = _mm256_add_ps(pDst, pSrc); + srcPtrTemp += channelIncrement; + } + dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst); + } + else + { + __m128 pDst = xmm_p0; + for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement) + { + __m128 pSrc, pWeights; + pWeights = _mm_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3]); + pSrc = _mm_loadu_ps(srcPtrTemp); + pSrc = _mm_mul_ps(pSrc, pWeights); + pDst = _mm_add_ps(pDst, pSrc); + srcPtrTemp += channelIncrement; + } + dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst); + } + for(; channelLoopCount < channels; channelLoopCount++) + dstPtrTemp[dstIdx] += ((*srcPtrTemp++) * weights[channelLoopCount]); + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/magnitude.hpp b/src/modules/cpu/kernel/magnitude.hpp new file mode 100644 index 000000000..6eaf4f236 --- /dev/null +++ b/src/modules/cpu/kernel/magnitude.hpp @@ -0,0 +1,1001 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus magnitude_u8_u8_host_tensor(Rpp8u *srcPtr1, + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[6]; + + 
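// For each 48-pixel packed block: convert both sources to f32 planes, compute
+                    // sqrt(src1^2 + src2^2) element-wise with fmadd + sqrt, and store the result back as u8 planes.
+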
rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                    p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                    p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2])));    // magnitude computation
+                    p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3])));    // magnitude computation
+                    p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4])));    // magnitude computation
+                    p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5])));    // magnitude computation
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(srcPtr1Temp[0]);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(srcPtr1Temp[1]);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(srcPtr1Temp[2]);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(srcPtr2Temp[0]);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(srcPtr2Temp[1]);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(srcPtr2Temp[2]);
+                    *dstPtrTempR++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    *dstPtrTempG++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    *dstPtrTempB++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    __m256 p1[6], p2[6];
+
+                    rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                    p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                    p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2])));    // magnitude computation
+                    p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3])));    // magnitude computation
+                    p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4])));    // magnitude computation
+                    p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5])));    // magnitude computation
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(*srcPtr1TempR);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(*srcPtr1TempG);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(*srcPtr1TempB);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(*srcPtr2TempR);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(*srcPtr2TempG);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(*srcPtr2TempB);
+                    dstPtrTemp[0] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    dstPtrTemp[1] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    dstPtrTemp[2] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+#if __AVX2__
+            alignedLength = bufferLength & ~15;
+#endif
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+#if __AVX2__
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        __m256 p1[2], p2[2];
+
+                        rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1Temp, p1);    // simd loads
+                        rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2);    // simd loads
+                        p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0])));    // magnitude computation
+                        p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1])));    // magnitude computation
+                        rpp_simd_store(rpp_store16_f32_to_u8_avx, dstPtrTemp, p1);    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+#endif
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        Rpp32f srcPtr1TempValue = static_cast<Rpp32f>(*srcPtr1Temp);
+                        Rpp32f srcPtr2TempValue = static_cast<Rpp32f>(*srcPtr2Temp);
+                        *dstPtrTemp++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)))));
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+RppStatus magnitude_f32_f32_host_tensor(Rpp32f *srcPtr1,
+                                        Rpp32f *srcPtr2,
+                                        RpptDescPtr srcDescPtr,
+                                        Rpp32f *dstPtr,
+                                        RpptDescPtr dstDescPtr,
+                                        RpptROIPtr roiTensorPtrSrc,
+                                        RpptRoiType roiType,
+                                        RppLayoutParams layoutParams,
+                                        rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+        Rpp32u alignedLength = (bufferLength / 24) * 24;
+        Rpp32u vectorIncrement = 24;
+        Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+        // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    __m256 p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0],
p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0]))); + *dstPtrTempG++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1]))); + *dstPtrTempB++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2]))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * 
*srcPtr2TempR))); + dstPtrTemp[1] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG))); + dstPtrTemp[2] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp))); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus magnitude_f16_f16_host_tensor(Rpp16f *srcPtr1, + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * 
srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0])))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1])))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2])))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG 
= srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1TempR[cnt]); + srcPtr1Temp_ps[cnt + 8] = static_cast(srcPtr1TempG[cnt]); + srcPtr1Temp_ps[cnt + 16] = static_cast(srcPtr1TempB[cnt]); + + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2TempR[cnt]); + srcPtr2Temp_ps[cnt + 8] = static_cast(srcPtr2TempG[cnt]); + srcPtr2Temp_ps[cnt + 16] = static_cast(srcPtr2TempB[cnt]); + } + + __m256 p1[4], p2[4]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * *srcPtr2TempR)))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG)))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB)))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = 
dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp)))); + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus magnitude_i8_i8_host_tensor(Rpp8s *srcPtr1, + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; +#endif + + // Magnitude with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + 
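+                // In this i8 path, the scalar tail shifts each pixel into the unsigned range
+                // (value + 128) before squaring, then subtracts 128 again ahead of the RPPPIXELCHECKI8
+                // clamp; the AVX2 i8 load/store helpers are expected to apply the same offset internally.
+                // Illustrative scalar form for one element:
+                //   Rpp32f a = src1[i] + 128.0f, b = src2[i] + 128.0f;
+                //   dst[i] = (Rpp8s)std::round(RPPPIXELCHECKI8(std::sqrt(a * a + b * b) - 128));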
srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[6]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation + p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation + p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + Rpp32f srcPtr1TempValue0 = static_cast(srcPtr1Temp[0] + 128); + Rpp32f srcPtr1TempValue1 = static_cast(srcPtr1Temp[1] + 128); + Rpp32f srcPtr1TempValue2 = static_cast(srcPtr1Temp[2] + 128); + Rpp32f srcPtr2TempValue0 = static_cast(srcPtr2Temp[0] + 128); + Rpp32f srcPtr2TempValue1 = static_cast(srcPtr2Temp[1] + 128); + Rpp32f srcPtr2TempValue2 = static_cast(srcPtr2Temp[2] + 128); + *dstPtrTempR++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128))); + *dstPtrTempG++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128))); + *dstPtrTempB++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Magnitude with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = 
srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[6]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation + p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation + p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation + p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp32f srcPtr1TempValue0 = static_cast(*srcPtr1TempR + 128); + Rpp32f srcPtr1TempValue1 = static_cast(*srcPtr1TempG + 128); + Rpp32f srcPtr1TempValue2 = static_cast(*srcPtr1TempB + 128); + Rpp32f srcPtr2TempValue0 = static_cast(*srcPtr2TempR + 128); + Rpp32f srcPtr2TempValue1 = static_cast(*srcPtr2TempG + 128); + Rpp32f srcPtr2TempValue2 = static_cast(*srcPtr2TempB + 128); + dstPtrTemp[0] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128))); + dstPtrTemp[1] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128))); + dstPtrTemp[2] = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~15; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 
p1[2], p2[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation + p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation + rpp_simd_store(rpp_store16_f32_to_i8_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + Rpp32f srcPtr1TempValue = static_cast(*srcPtr1Temp + 128); + Rpp32f srcPtr2TempValue = static_cast(*srcPtr2Temp + 128); + *dstPtrTemp++ = static_cast(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)) - 128))); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/multiply_scalar.hpp b/src/modules/cpu/kernel/multiply_scalar.hpp new file mode 100644 index 000000000..a27782bcc --- /dev/null +++ b/src/modules/cpu/kernel/multiply_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus multiply_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f mulParam = mulTensor[batchCount]; + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; +#if __AVX2__ + Rpp32u vectorIncrement = 16; + __m256 pMulParam = _mm256_set1_ps(mulParam); + Rpp32u alignedLength = bufferLength & ~(vectorIncrement - 1); +#endif + // multiply without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_multiply_16_host(p, &pMulParam); // multiply adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + *dstPtrTemp++ = (*srcPtrTemp++ * mulParam); + + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // multiply without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = srcPtrImage + 
(roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_multiply_16_host(p, &pMulParam); // multiply adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + *dstPtrTemp++ = (*srcPtrTemp++ * mulParam); + + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/subtract_scalar.hpp b/src/modules/cpu/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..a40e6219f --- /dev/null +++ b/src/modules/cpu/kernel/subtract_scalar.hpp @@ -0,0 +1,152 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus subtract_scalar_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI3D roiDefault; + if(srcGenericDescPtr->layout==RpptLayout::NCDHW) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; + else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) + roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + RpptROI3D roi; + RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; + compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + Rpp32f subtractParam = subtractTensor[batchCount]; + Rpp32f *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrImage; + + Rpp32u vectorIncrement = 16; + Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + __m256 pSubtractParam = _mm256_set1_ps(subtractParam); + + // Subtract without fused output-layout toggle (NCDHW -> NCDHW) + if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + { + srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } + // Subtract without fused output-layout toggle (NDHWC -> NDHWC) + else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + { + srcPtrChannel = 
srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); + Rpp32f *srcPtrDepth = srcPtrChannel; + Rpp32f *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtrTemp++ - subtractParam; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp new file mode 100644 index 000000000..0380f4ef6 --- /dev/null +++ b/src/modules/cpu/kernel/tensor_max.hpp @@ -0,0 +1,847 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u max = 0; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_setzero_si256(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_32_host(&pMax, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_setzero_si256(); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < 
bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_setzero_si128(); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == 
RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + 
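+            // The running maxima (maxR/maxG/maxB and the pMax* vectors below) start at zero, so this
+            // reduction implicitly assumes non-negative f32 input such as normalized image data; each
+            // AVX2 iteration folds 8 packed RGB triplets into the per-channel maxima.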
+#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f max = 0.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMax = _mm256_setzero_ps(); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_max_float8_host(&p1, &pMax); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max((Rpp32f)*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128 result; + reduce_max_float8_host(&pMax, &result); + 
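+            // reduce_max_float8_host is expected to collapse the 8-lane running maximum into the low
+            // lanes of a 128-bit result; the store below spills it to resultAvx so it can be merged
+            // with the scalar-tail maximum before writing maxArr[batchCount].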
rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + max = std::max(std::max(resultAvx[0], resultAvx[1]), max); +#endif + maxArr[batchCount] = (Rpp16f)max; + } + + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max((Rpp32f)*srcPtrTempR++, maxR); + maxG = std::max((Rpp32f)*srcPtrTempG++, maxG); + maxB = std::max((Rpp32f)*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); + +#endif + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMaxR = _mm256_setzero_ps(); + __m256 pMaxG = pMaxR; + __m256 pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + 
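+                        // Update the per-channel running maxima (pMaxR/pMaxG/pMaxB) with the eight
+                        // R/G/B values just converted from f16 and loaded in planar form.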
compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max((Rpp32f)srcPtrTemp[0], maxR); + maxG = std::max((Rpp32f)srcPtrTemp[1], maxG); + maxB = std::max((Rpp32f)srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR); + maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG); + maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = (Rpp16f)maxR; + maxArr[maxArrIndex + 1] = (Rpp16f)maxG; + maxArr[maxArrIndex + 2] = (Rpp16f)maxB; + maxArr[maxArrIndex + 3] = (Rpp16f)maxC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor max 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s max = INT8_MIN; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMax = _mm256_set1_epi8(INT8_MIN); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMax = _mm256_max_epi8(p1, pMax); //compare and store max of 32 values into global max + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + max = std::max(*srcPtrTemp++, max); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i32_host(&pMax, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + max = std::max(resultAvx[0], max); +#endif + maxArr[batchCount] = max; + } + // Tensor max 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRowR, *srcPtrRowG, 
*srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMaxR = _mm256_set1_epi8(INT8_MIN); + __m256i pMaxG = pMaxR; + __m256i pMaxB = pMaxR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + maxR = std::max(*srcPtrTempR++, maxR); + maxG = std::max(*srcPtrTempG++, maxG); + maxB = std::max(*srcPtrTempB++, maxB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + + // Tensor max 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u maxArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMaxR = _mm_set1_epi8(INT8_MIN); + __m128i pMaxG = pMaxR; + __m128i pMaxB = pMaxR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p); + compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + maxR = std::max(srcPtrTemp[0], maxR); + maxG = std::max(srcPtrTemp[1], maxG); + maxB = std::max(srcPtrTemp[2], maxB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + maxR = std::max(resultAvx[0], maxR); + maxG = std::max(resultAvx[1], maxG); + maxB = std::max(resultAvx[2], maxB); +#endif + } + maxC = std::max(std::max(maxR, maxG), maxB); + maxArr[maxArrIndex] = maxR; + maxArr[maxArrIndex + 1] = maxG; + maxArr[maxArrIndex + 2] = maxB; + maxArr[maxArrIndex + 3] = maxC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp new file mode 100644 index 000000000..15b9b77ba --- /dev/null +++ 
b/src/modules/cpu/kernel/tensor_min.hpp @@ -0,0 +1,845 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8u min = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)255); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epu8(p1, pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_32_host(&pMin, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == 
RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)255); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8u minC = 255, minR = 255, minG = 255, minB = 255; + Rpp8u resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)255); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p); + compute_min_48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + + __m128i result; + reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr, + RpptDescPtr 
srcDescPtr, + Rpp32f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min(*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += 
srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && 
(srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp32f min = 255.0; + Rpp32f resultAvx[4]; + + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256 pMin = _mm256_set1_ps(255.0); +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[8]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p1; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1); + compute_min_float8_host(&p1, &pMin); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((Rpp32f)*srcPtrTemp++, min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128 result; + reduce_min_float8_host(&pMin, &result); + rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result); + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = (Rpp16f) min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8]; + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt]; + srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt]; + srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min((Rpp32f)*srcPtrTempR++, minR); + minG = std::min((Rpp32f)*srcPtrTempG++, minG); + minB = std::min((Rpp32f)*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + 
minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0; + Rpp32f resultAvx[8]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow; + srcPtrRow = srcPtrChannel; + +#if __AVX2__ + __m256 pMinR = _mm256_set1_ps(255.0); + __m256 pMinG = pMinR; + __m256 pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[24]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt]; + } + __m256 p[3]; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); + compute_min_float24_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min((Rpp32f)srcPtrTemp[0], minR); + minG = std::min((Rpp32f)srcPtrTemp[1], minG); + minB = std::min((Rpp32f)srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m256 result; + reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result); + + minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR); + minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG); + minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = (Rpp16f) minR; + minArr[minArrIndex + 1] = (Rpp16f) minG; + minArr[minArrIndex + 2] = (Rpp16f) minB; + minArr[minArrIndex + 3] = (Rpp16f) minC; + } + } + return RPP_SUCCESS; +} + +RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Tensor min 1 channel (NCHW) + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + vectorIncrement = vectorIncrementPerChannel; + Rpp8s min = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; +#if __AVX2__ + __m256i pMin = _mm256_set1_epi8((char)127); +#endif 
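+            // Row-wise scan of the ROI: the AVX2 loop compares 32 signed 8-bit values per iteration
+            // against the running vector minimum (initialised to 127), and the scalar tail loop
+            // handles any pixels left over beyond the vector-width multiple (alignedLength).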
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1 = _mm256_load_si256((__m256i *)srcPtrTemp); + pMin = _mm256_min_epi8(p1, pMin); //compare and store min of 32 values into global min + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + min = std::min((*srcPtrTemp++), min); + } + srcPtrRow += srcDescPtr->strides.hStride; + } + +#if __AVX2__ + __m128i result; + reduce_min_i32_host(&pMin, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + min = std::min(std::min(resultAvx[0], resultAvx[1]), min); +#endif + minArr[batchCount] = min; + } + + // Tensor min 3 channel (NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; +#if __AVX2__ + __m256i pMinR = _mm256_set1_epi8((char)127); + __m256i pMinG = pMinR; + __m256i pMinB = pMinR; +#endif + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p[3]; + rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + compute_min_i96_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTempR += vectorIncrementPerChannel; + srcPtrTempG += vectorIncrementPerChannel; + srcPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + minR = std::min(*srcPtrTempR++, minR); + minG = std::min(*srcPtrTempG++, minG); + minB = std::min(*srcPtrTempB++, minB); + } + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + + // Tensor min 3 channel (NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u minArrIndex = batchCount * 4; + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp8s minC = 127, minR = 127, minG = 127, minB = 127; + Rpp8s resultAvx[16]; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow; + srcPtrRow = srcPtrChannel; + + __m128i pMinR = _mm_set1_epi8((char)127); + __m128i pMinG = pMinR; + __m128i pMinB = pMinR; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp; + srcPtrTemp = srcPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p[3]; + rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, 
srcPtrTemp, p); + compute_min_i48_host(p, &pMinR, &pMinG, &pMinB); + + srcPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + minR = std::min(srcPtrTemp[0], minR); + minG = std::min(srcPtrTemp[1], minG); + minB = std::min(srcPtrTemp[2], minB); + srcPtrTemp += 3; + } + srcPtrRow += srcDescPtr->strides.hStride; + } +#if __AVX2__ + __m128i result; + reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result); + rpp_simd_store(rpp_store16_i8, resultAvx, &result); + + minR = std::min(resultAvx[0], minR); + minG = std::min(resultAvx[1], minG); + minB = std::min(resultAvx[2], minB); +#endif + } + minC = std::min(std::min(minR, minG), minB); + minArr[minArrIndex] = minR; + minArr[minArrIndex + 1] = minG; + minArr[minArrIndex + 2] = minB; + minArr[minArrIndex + 3] = minC; + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 55fbb7832..0345171fc 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -26,5 +26,9 @@ SOFTWARE. #define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP #include "kernel/fused_multiply_add_scalar.hpp" +#include "kernel/add_scalar.hpp" +#include "kernel/subtract_scalar.hpp" +#include "kernel/multiply_scalar.hpp" +#include "kernel/magnitude.hpp" #endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP diff --git a/src/modules/hip/hip_tensor_color_augmentations.hpp b/src/modules/hip/hip_tensor_color_augmentations.hpp index 873f06b97..c5610dbcb 100644 --- a/src/modules/hip/hip_tensor_color_augmentations.hpp +++ b/src/modules/hip/hip_tensor_color_augmentations.hpp @@ -33,5 +33,6 @@ SOFTWARE. #include "kernel/exposure.hpp" #include "kernel/contrast.hpp" #include "kernel/lut.hpp" +#include "kernel/color_temperature.hpp" #endif // HIP_TENSOR_COLOR_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index 328a232a1..c79e0a951 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -23,8 +23,9 @@ SOFTWARE. 
*/ #ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP -#define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP #include "kernel/tensor_sum.hpp" +#include "kernel/tensor_min.hpp" +#include "kernel/tensor_max.hpp" -#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/add_scalar.hpp b/src/modules/hip/kernel/add_scalar.hpp new file mode 100644 index 000000000..709337c9d --- /dev/null +++ b/src/modules/hip/kernel/add_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void add_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_add8_const(&val_f8, &val_f8, static_cast(addParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void add_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float addParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_add24_const(&val_f24, &val_f24, static_cast(addParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_add_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *addTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int 
batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(add_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + addTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/color_temperature.hpp b/src/modules/hip/kernel/color_temperature.hpp new file mode 100644 index 000000000..ad8adc32a --- /dev/null +++ b/src/modules/hip/kernel/color_temperature.hpp @@ -0,0 +1,223 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ void color_temperature_hip_compute(T *srcPtr, d_float24 *pix_f24, float4 *adjustmentValue_f4) +{ + float4 adjustment_f4; + if constexpr ((std::is_same::value) || (std::is_same::value)) + { + adjustment_f4 = *adjustmentValue_f4 * (float4) ONE_OVER_255; + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4); + } + else if constexpr (std::is_same::value) + { + adjustment_f4 = *adjustmentValue_f4; + rpp_hip_math_add24_const(pix_f24, pix_f24, (float4)128); + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4); + rpp_hip_pixel_check_0to255(pix_f24); + rpp_hip_math_subtract24_const(pix_f24, pix_f24, (float4)128); + } + else + { + rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], *adjustmentValue_f4); + rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], *adjustmentValue_f4); + } +} + +template +__global__ void color_temperature_pkd_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = 
hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24); +} + +template +__global__ void color_temperature_pln_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24); +} + +template +__global__ void color_temperature_pkd3_pln3_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24); +} + +template +__global__ void color_temperature_pln3_pkd3_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + int *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + 
roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]); + + d_float24 pix_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24); + color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24); +} + +template +RppStatus hip_exec_color_temperature_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(color_temperature_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(color_temperature_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(color_temperature_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(color_temperature_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + 
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + handle.GetInitHandle()->mem.mgpu.intArr[0].intmem, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/magnitude.hpp b/src/modules/hip/kernel/magnitude.hpp new file mode 100644 index 000000000..902d27bde --- /dev/null +++ b/src/modules/hip/kernel/magnitude.hpp @@ -0,0 +1,244 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ void magnitude_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr (std::is_same::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); + rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + } + + d_float8 src1Sq_f8, src2Sq_f8, sum_f8; + rpp_hip_math_multiply8(src1_f8, src1_f8, &src1Sq_f8); + rpp_hip_math_multiply8(src2_f8, src2_f8, &src2Sq_f8); + rpp_hip_math_add8(&src1Sq_f8, &src2Sq_f8, &sum_f8); + rpp_hip_math_sqrt8(&sum_f8, dst_f8); + + if constexpr (std::is_same::value) + { + dst_f8->f4[0] = rpp_hip_pixel_check_0to255(dst_f8->f4[0]) - (float4)128; + dst_f8->f4[1] = rpp_hip_pixel_check_0to255(dst_f8->f4[1]) - (float4)128; + } +} + +template +__global__ void magnitude_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void magnitude_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += 
dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void magnitude_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void magnitude_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_magnitude_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int 
globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(magnitude_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(magnitude_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(magnitude_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(magnitude_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/multiply_scalar.hpp b/src/modules/hip/kernel/multiply_scalar.hpp new file mode 100644 index 000000000..e0816576a --- /dev/null +++ b/src/modules/hip/kernel/multiply_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void multiply_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float mulParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= 
roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_multiply8_const(&val_f8, &val_f8, static_cast(mulParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void multiply_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float mulParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_multiply24_const(&val_f24, &val_f24, static_cast(mulParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus hip_exec_multiply_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *mulTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(multiply_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + mulTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int 
globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(multiply_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + mulTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/subtract_scalar.hpp b/src/modules/hip/kernel/subtract_scalar.hpp new file mode 100644 index 000000000..7ee128709 --- /dev/null +++ b/src/modules/hip/kernel/subtract_scalar.hpp @@ -0,0 +1,114 @@ +#include +#include "rpp_hip_common.hpp" + + +__global__ void subtract_scalar_ncdhw_hip_tensor(float *srcPtr, + uint3 srcStridesCDH, + float *dstPtr, + uint3 dstStridesCDH, + int channels, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x); + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + + d_float8 val_f8; + for(int c = 0; c < channels; c++) + { + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8); + rpp_hip_math_subtract8_const(&val_f8, &val_f8, static_cast(subtractParam)); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + srcIdx += srcStridesCDH.x; + dstIdx += dstStridesCDH.x; + } +} + +__global__ void subtract_scalar_ndhwc_hip_tensor(float *srcPtr, + uint2 srcStridesDH, + float *dstPtr, + uint2 dstStridesDH, + float subtractParam, + RpptROI3DPtr roiGenericPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth)) + { + return; + } + + uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + + d_float24 val_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); + rpp_hip_math_subtract24_const(&val_f24, &val_f24, static_cast(subtractParam)); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + +RppStatus 
hip_exec_subtract_scalar_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + RpptROI3DPtr roiGenericPtrSrc, + Rpp32f *subtractTensor, + rpp::Handle& handle) +{ + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(subtract_scalar_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + subtractTensor[batchCount], + &roiGenericPtrSrc[batchCount]); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp new file mode 100644 index 000000000..b47fce024 --- /dev/null +++ b/src/modules/hip/kernel/tensor_max.hpp @@ -0,0 +1,400 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + 
partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local mmemory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]); + partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]); + partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMax_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMax_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMax_smem[0]; + dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0])); + } +} + +template +__global__ void tensor_max_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_max_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + 
hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + uint srcIdx = (id_z * srcStridesNH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + 
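// Reference sketch (illustrative only; block_max_sketch and its launch are hypothetical names,
// not RPP APIs): the kernels in this file follow a two-stage reduction pattern, where Stage 1
// reduces each tile to one partial result per block in shared memory (LDS) and Stage 2 reduces
// those per-block partials to one value per image. The same shared-memory tree reduction,
// stripped of RPP's vectorized loads and ROI handling, assuming a 1D launch with 256 threads
// per block and n >= 1:
//
//     #include <hip/hip_runtime.h>
//
//     __global__ void block_max_sketch(const float *in, float *out, int n)
//     {
//         __shared__ float partial[256];
//         int tid = hipThreadIdx_x;
//         float v = in[0];                                  // seed with a valid element, as the kernels above do
//         for (int i = tid; i < n; i += hipBlockDim_x)      // each thread scans a strided slice of the buffer
//             v = fmaxf(v, in[i]);
//         partial[tid] = v;
//         __syncthreads();
//         for (int stride = 128; stride >= 1; stride /= 2)  // tree reduction: halve the active threads each step
//         {
//             if (tid < stride)
//                 partial[tid] = fmaxf(partial[tid], partial[tid + stride]);
//             __syncthreads();
//         }
//         if (tid == 0)
//             out[hipBlockIdx_x] = partial[0];              // one partial max per block; a second pass combines these
//     }
//
// A launch of this sketch would look like hipLaunchKernelGGL(block_max_sketch, dim3(numBlocks),
// dim3(256), 0, stream, d_in, d_partials, n), mirroring how the Set 2 executors below launch
// the Stage 1 and Stage 2 kernels.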
+template +__global__ void tensor_max_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]); + partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]); + partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + 
hipBlockIdx_x) * 3; + maxArr[idx] = partialRMaxRowPtr_smem[0]; + maxArr[idx + 1] = partialGMaxRowPtr_smem[0]; + maxArr[idx + 2] = partialBMaxRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_max_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *maxArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + + rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_max(T *srcPtr, + RpptDescPtr srcDescPtr, + U *maxArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float minimum = bitDepthMinMax_f2.x; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + 
handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMaxArr; + partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_max_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMaxArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + gridDim_x * gridDim_y, + maxArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp new file mode 100644 index 000000000..a883c4f3b --- /dev/null +++ b/src/modules/hip/kernel/tensor_min.hpp @@ -0,0 +1,410 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - Reduction Stage 2 -------------------- + +template +__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength) * 3; + partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads + partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads + partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x * 3; + + if (id_x + 8 > 
xBufferLength) + srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]); + partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]); + partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = hipBlockIdx_z * 4; + dstPtr[dstIdx] = (T) partialRMin_smem[0]; + dstPtr[dstIdx + 1] = (T) partialGMin_smem[0]; + dstPtr[dstIdx + 2] = (T) partialBMin_smem[0]; + dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0])); + } +} + +template +__global__ void tensor_min_grid_result_hip(float *srcPtr, + uint xBufferLength, + T *dstPtr) +{ + int id_x = hipThreadIdx_x * 8; + int id_z = hipBlockIdx_z; + + __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block + + uint srcIdx = (id_z * xBufferLength); + partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads + + if (id_x >= xBufferLength) + return; + + srcIdx += id_x; + + if (id_x + 8 > xBufferLength) + srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7) + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]); +} + + +// -------------------- Set 1 - Reduction Stage 1 -------------------- + +template +__global__ void tensor_min_pkd3_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel + __shared__ float partialBMin_smem[16][16]; 
// 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel + + uint srcIdx = (id_z * srcStridesNH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3; + + d_float24 src_f24; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln3_hip(T *srcPtr, + uint3 srcStridesNCH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + 
hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + + uint srcIdx = (id_z * srcStridesNCH.x); + partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads + partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of R channel using all 16 x 16 threads + partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of R channel using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float24 src_f24; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24); + + rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]); + rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + { + partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]); + partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]); + } + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + { + partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]); + partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]); + partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]); + } + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3; + minArr[idx] = partialRMinRowPtr_smem[0]; + minArr[idx + 1] = partialGMinRowPtr_smem[0]; + 
minArr[idx + 2] = partialBMinRowPtr_smem[0]; + } + } +} + +template +__global__ void tensor_min_pln1_hip(T *srcPtr, + uint2 srcStridesNH, + float *minArr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + + if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth) + srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after min compute + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + minArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0]; + } +} + + +// -------------------- Set 2 - Kernel Executors -------------------- + +template +RppStatus hip_exec_tensor_min(T *srcPtr, + RpptDescPtr srcDescPtr, + U *minArr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle &handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (srcDescPtr->w + 7) >> 3; + int globalThreads_y = srcDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X); + int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y); + int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z); + float2 bitDepthMinMax_f2; + getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2); + float maximum = bitDepthMinMax_f2.y; + + if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln1_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 
LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pln3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3; + float *partialMinArr; + partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem; + hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream()); + hipLaunchKernelGGL(tensor_min_pkd3_hip, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + partialMinArr, + roiTensorPtrSrc); + hipStreamSynchronize(handle.GetStream()); + hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip, + dim3(1, 1, gridDim_z), + dim3(256, 1, 1), + 0, + handle.GetStream(), + partialMinArr, + gridDim_x * gridDim_y, + minArr); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index daf0479ee..8f88ba90f 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -73,6 +73,188 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && 
(srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + add_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + addTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + subtract_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + subtractTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** multiply_scalar ********************/ + +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return 
RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + multiply_scalar_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + mulTensor, + roiGenericPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** magnitude ********************/ + +RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + magnitude_u8_u8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + magnitude_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + magnitude_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + magnitude_i8_i8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -113,4 +295,163 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** add_scalar ********************/ + +RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr 
srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *addTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_add_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + addTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** subtract_scalar ********************/ + +RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *subtractTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_subtract_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + subtractTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** multiply_scalar ********************/ + +RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *mulTensor, + RpptROI3DPtr roiGenericPtrSrc, + RpptRoi3DType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE; + if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + + hip_exec_multiply_scalar_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + 
srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiGenericPtrSrc, + mulTensor, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** magnitude ********************/ + +RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + #ifdef HIP_COMPILE + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_magnitude_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_magnitude_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_magnitude_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_magnitude_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index 23b52bc44..d78b8890a 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -126,3 +126,31 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; } } + +/******************** down_mixing ********************/ + +RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rppHandle_t rppHandle) +{ + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + down_mixing_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + normalizeWeights, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp index be61b6da1..3023973fc 100644 --- a/src/modules/rppt_tensor_color_augmentations.cpp +++ b/src/modules/rppt_tensor_color_augmentations.cpp @@ 
-411,7 +411,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, { if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); @@ -671,6 +671,72 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** color_temperature ********************/ + +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp8s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c != 3) + { + return RPP_ERROR_INVALID_CHANNELS; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + color_temperature_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + color_temperature_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + color_temperature_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + color_temperature_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + adjustmentValueTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -887,7 +953,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, #ifdef HIP_COMPILE if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } Rpp32u paramIndex = 0; @@ -958,7 +1024,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, #ifdef HIP_COMPILE if (srcDescPtr->c != 3) { - return RPP_ERROR_INVALID_ARGUMENTS; + return RPP_ERROR_INVALID_CHANNELS; } Rpp32u paramIndex = 0; @@ -1204,4 +1270,71 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** color_temperature ********************/ + +RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *adjustmentValueTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcDescPtr->c != 3) + { + return 
RPP_ERROR_INVALID_CHANNELS; + } + + Rpp32u paramIndex = 0; + copy_param_int(adjustmentValueTensor, rpp::deref(rppHandle), paramIndex++); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_color_temperature_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_color_temperature_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_color_temperature_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_color_temperature_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index da1036256..fff62d085 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1036,11 +1036,11 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { @@ -1823,11 +1823,11 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, rppHandle_t rppHandle) { #ifdef HIP_COMPILE - if ((srcGenericDescPtr->layout != 
RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; + if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f17028e5e..28313a88f 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t minArr, + Rpp32u minArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (minArrLength < srcDescPtr->n) // 1 min for each image + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_min_u8_u8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_min_f16_f16_host((Rpp16f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_min_f32_f32_host((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_min_i8_i8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(minArr), + minArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t maxArr, + Rpp32u maxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (maxArrLength < srcDescPtr->n) // 1 min for each image + return 
RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (maxArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + tensor_max_u8_u8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + tensor_max_f16_f16_host((Rpp16f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + tensor_max_f32_f32_host((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + tensor_max_i8_i8_host(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(maxArr), + maxArrLength, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ @@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, return RPP_SUCCESS; } + +/******************** tensor_min ********************/ + +RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMinArr, + Rpp32u imageMinArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMinArrLength < srcDescPtr->n) // min of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_min(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_min((half*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_min((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_min(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMinArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** tensor_max ********************/ + +RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t imageMaxArr, + Rpp32u imageMaxArrLength, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + if (srcDescPtr->c == 1) + { + if (imageMaxArrLength < srcDescPtr->n) 
// max of single channel + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + else if (srcDescPtr->c == 3) + { + if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels + return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH; + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { + hip_exec_tensor_max(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + hip_exec_tensor_max((half*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_tensor_max((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + hip_exec_tensor_max(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(imageMaxArr), + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} #endif // backend diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp index 250ceadfc..e298ebd99 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp @@ -1356,8 +1356,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp index fbffdbe68..dc8679e5d 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp @@ -1357,8 +1357,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp index ed1e7751b..271ed3d1c 100644 --- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp +++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp @@ -1459,8 +1459,8 @@ int main(int argc, char **argv) for (i = 0; i < images; i++) { - dstSize[i].height = srcSize[i].height / 3; - dstSize[i].width = srcSize[i].width / 1.1; + dstSize[i].height = srcSize[i].height / 2; + dstSize[i].width = srcSize[i].width / 2; if (maxDstHeight < dstSize[i].height) maxDstHeight = dstSize[i].height; if (maxDstWidth < dstSize[i].width) diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 04831ddf4..7bd46b39e 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -65,12 +65,12 @@ int main(int 
argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); - bool reductionTypeCase = (testCase == 87); unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; @@ -104,7 +104,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 86) + if(testCase == 36 || testCase == 31 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -323,35 +323,20 @@ int main(int argc, char **argv) double wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - - // Initialize buffers for any reductionType functions + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) void *reductionFuncResultArr; Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - - if(reductionTypeCase) + if (reductionTypeCase) { - if(dstDescPtr->dataType == RpptDataType::U8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s))); + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + bitDepthByteSize = (testCase == 87) ? 
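+            // Case 87 (tensor_sum) accumulates over whole images, so 8u/8s inputs need 64-bit result slots
+            // to avoid overflow, while cases 88/89 (tensor_min/tensor_max) return values that stay within the
+            // source bit depth, so one byte per result is enough for U8/I8 inputs: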
sizeof(Rpp64u) : sizeof(Rpp8u); + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); } - //Allocate hip memory for src/dst + // Allocate hip memory for src/dst CHECK(hipMalloc(&d_input, inputBufferSize)); CHECK(hipMalloc(&d_output, outputBufferSize)); if(dualInputCase) @@ -827,6 +812,22 @@ int main(int argc, char **argv) break; } + case 45: + { + testCaseName = "color_temperature"; + + Rpp32s adjustment[batchSize]; + for (i = 0; i < batchSize; i++) + adjustment[i] = 70; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_color_temperature_gpu(d_input, srcDescPtr, d_output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 49: { testCaseName = "box_filter"; @@ -859,6 +860,18 @@ int main(int argc, char **argv) break; } + case 61: + { + testCaseName = "magnitude"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_magnitude_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 63: { testCaseName = "phase"; @@ -1028,6 +1041,30 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1055,33 +1092,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 
3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1089,7 +1134,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { @@ -1175,4 +1225,4 @@ int main(int argc, char **argv) CHECK(hipFree(d_input_second)); CHECK(hipFree(d_output)); return 0; -} \ No newline at end of file +} diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp index f4741ad78..e8dc4e365 100644 --- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp @@ -55,8 +55,6 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } - - if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -268,6 +266,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_add_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 4: { testCaseName = "flip_voxel"; @@ -292,6 +322,22 @@ int main(int argc, char * argv[]) break; } + case 5: + { + testCaseName = "multiply_scalar"; + Rpp32f mulTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + mulTensor[i] = 80; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_multiply_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, mulTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index 6150ad97c..2e8054332 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -153,7 +153,7 @@ def get_log_file_list(preserveOutput): # Functionality group finder def func_group_finder(case_number): - if case_number < 5 or case_number == 13 or case_number == 36: + if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 45: return "color_augmentations" elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84: return "effects_augmentations" @@ -165,6 +165,8 @@ def func_group_finder(case_number): return "filter_augmentations" elif case_number < 40: return "geometric_augmentations" + elif case_number == 61: + return "arithmetic_operations" elif case_number < 87: return "data_exchange_operations" elif case_number < 88: @@ -313,11 +315,11 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") - parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") - parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) - parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? 
- (YES/NO)', required = False) + parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)") + parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)") + parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)") + parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False) + parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)") parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests") @@ -332,8 +334,8 @@ def rpp_test_suite_parser_and_validator(): validate_path(qaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 90) or (args.case_end < 0 or args.case_end > 90)): + print("Starting case# and Ending case# must be in the 0:90 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -347,7 +349,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 90: print("Invalid input! Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -374,8 +376,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 90: + print("The case# must be in the 0:90 range!") exit(0) return args @@ -456,8 +458,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -474,8 +476,8 @@ def rpp_test_suite_parser_and_validator(): else: if (testType == 1 and profilingOption == "NO"): for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -489,8 +491,8 @@ def rpp_test_suite_parser_and_validator(): NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: srcPath1 = ricapInFilePath @@ -627,7 +629,9 @@ def rpp_test_suite_parser_and_validator(): "effects_augmentations", "filter_augmentations", "geometric_augmentations", - "morphological_operations" + "morphological_operations", + "arithmetic_operations", + "statistical_operations" ] for log_file in log_file_list: # Opening log file @@ -692,7 +696,7 @@ def rpp_test_suite_parser_and_validator(): f.close() # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '54', '63', '70', '80', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: @@ -717,4 +721,4 @@ def rpp_test_suite_parser_and_validator(): resultsInfo += "\n - Total augmentations with golden output QA test support = " + str(len(supportedCaseList) - len(nonQACaseList)) resultsInfo += "\n - Total augmentations without golden ouput QA test support (due to randomization involved) = " + str(len(nonQACaseList)) f.write(resultsInfo) - print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") \ No newline at end of file + print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") diff --git a/utilities/test_suite/HIP/runTests_voxel.py b/utilities/test_suite/HIP/runTests_voxel.py index 2f007ecaa..b6648affb 100644 --- a/utilities/test_suite/HIP/runTests_voxel.py +++ b/utilities/test_suite/HIP/runTests_voxel.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 4 +caseMax = 5 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -258,8 +258,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing range starting case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") - parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing range ending case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_start", type = int, default = caseMin, 
help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False) @@ -309,8 +309,8 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print("The case# must be in the 0:1 range!") - exit(0) + print("The case# must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + exit(0) # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if args.qa_mode: @@ -470,7 +470,7 @@ def rpp_test_suite_parser_and_validator(): print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv") # print the results of qa tests -supportedCaseList = ['0', '1', '4'] +supportedCaseList = ['0', '1', '2', '3', '4', '5'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/HOST/CMakeLists.txt b/utilities/test_suite/HOST/CMakeLists.txt index 6adc461b3..b7abf5d77 100644 --- a/utilities/test_suite/HOST/CMakeLists.txt +++ b/utilities/test_suite/HOST/CMakeLists.txt @@ -82,8 +82,15 @@ if (OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_host Tensor_host.cpp) + add_executable(BatchPD_host_pkd3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp) + add_executable(BatchPD_host_pln1 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp) + add_executable(BatchPD_host_pln3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") target_link_libraries(Tensor_host ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) + target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST}) else() message("-- ${Red}Error: OpenCV must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() @@ -102,7 +109,7 @@ else() endif() if(NOT libsnd_LIBS) - message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_voxel_host successfully!${ColourReset}") + message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_host successfully!${ColourReset}") else() message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index 1e416ed52..b698a2def 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -65,14 +65,15 @@ int main(int argc, char **argv) int batchSize = atoi(argv[14]); bool additionalParamCase 
= (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63); bool randomOutputCase = (testCase == 84); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24); + bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); + unsigned int verbosity = atoi(argv[11]); unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1; - bool reductionTypeCase = (testCase == 87); int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])}; string scriptPath = argv[19]; @@ -102,7 +103,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 86) + if(testCase == 31 || testCase == 36 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -140,6 +141,11 @@ int main(int argc, char **argv) std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; exit(0); } + else if(testCase == 82 && batchSize < 2) + { + std::cerr<<"\n RICAP only works with BatchSize > 1"; + exit(0); + } // Get function name string funcName = augmentationMap[testCase]; @@ -310,6 +316,24 @@ int main(int argc, char **argv) input_second = static_cast(calloc(inputBufferSize, 1)); output = static_cast(calloc(outputBufferSize, 1)); + // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums) + void *reductionFuncResultArr; + Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; + if (reductionTypeCase) + { + int bitDepthByteSize = 0; + if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) + { + bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32)) + { + bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f + reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, bitDepthByteSize)); + } + } + // Set the number of threads to be used by OpenMP pragma for RPP batch processing on host. 
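+    // Readback sketch for the new reduction cases (assumes U8 inputs and case 88/89; the per-image layout of
+    // "each channel, then all channels" is inferred from the buffer sizing comments above, not from the kernel spec):
+    //     Rpp8u *resultPtr = static_cast<Rpp8u *>(reductionFuncResultArr);
+    //     // image b of a 3-channel batch occupies resultPtr[b * 4] .. resultPtr[b * 4 + 3]
+    //     // (three per-channel results followed by the combined result); a 1-channel batch
+    //     // stores one value per image, so reductionFuncResultArrLength is trimmed to srcDescPtr->n below.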
// If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size Rpp32u numThreads = 0; @@ -321,33 +345,6 @@ int main(int argc, char **argv) double cpuTime, wallTime; string testCaseName; - if(testCase == 82 && imagesMixed) - { - std::cerr<<"\n RICAP only works with same dimension images"; - exit(0); - } - - if(testCase == 82 && batchSize < 2) - { - std::cerr<<"\n RICAP only works with BatchSize > 1"; - exit(0); - } - - // Initialize buffers for any reductionType functions - void *reductionFuncResultArr; - Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4; - if(reductionTypeCase) - { - if(dstDescPtr->dataType == RpptDataType::U8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64u))); - else if(dstDescPtr->dataType == RpptDataType::F16) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::F32) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp32f))); - else if(dstDescPtr->dataType == RpptDataType::I8) - reductionFuncResultArr = static_cast(calloc(reductionFuncResultArrLength, sizeof(Rpp64s))); - } - // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) @@ -818,6 +815,36 @@ int main(int argc, char **argv) break; } + case 45: + { + testCaseName = "color_temperature"; + + Rpp8s adjustment[batchSize]; + for (i = 0; i < batchSize; i++) + adjustment[i] = 70; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_color_temperature_host(input, srcDescPtr, output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 61: + { + testCaseName = "magnitude"; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_magnitude_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 63: { testCaseName = "phase"; @@ -1032,6 +1059,40 @@ int main(int argc, char **argv) break; } + case 88: + { + testCaseName = "tensor_min"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 89: + { + testCaseName = "tensor_max"; + + if(srcDescPtr->c == 1) + reductionFuncResultArrLength = srcDescPtr->n; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } default: missingFuncFlag = 1; break; @@ -1064,33 +1125,41 @@ int main(int argc, char **argv) if(srcDescPtr->c == 3) printf("\nReduction result (Batch of 3 
channel images produces 4 results per image in batch): "); else if(srcDescPtr->c == 1) + { printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + reductionFuncResultArrLength = srcDescPtr->n; + } - if(dstDescPtr->dataType == RpptDataType::U8) + // print reduction functions output array based on different bit depths, and precision desired + int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0; + if (dstDescPtr->dataType == RpptDataType::U8) { - Rpp64u *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %llu ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F16) + else if (dstDescPtr->dataType == RpptDataType::F16) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::F32) + else if (dstDescPtr->dataType == RpptDataType::F32) { - Rpp32f *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %0.3f ", (float)reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - else if(dstDescPtr->dataType == RpptDataType::I8) + else if (dstDescPtr->dataType == RpptDataType::I8) { - Rpp64s *reductionOutPtr = static_cast(reductionFuncResultArr); - for (int i = 0; i < reductionFuncResultArrLength; i++) - printf(" %lld ", reductionOutPtr[i]); + if (testCase == 87) + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); + else + print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); /*Compare the output of the function with golden outputs only if @@ -1098,7 +1167,12 @@ int main(int argc, char **argv) 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) - compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + { + if (testCase == 87) + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + else + compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); + } } else { @@ -1181,4 +1255,4 @@ int main(int argc, char **argv) if(reductionTypeCase) free(reductionFuncResultArr); return 0; -} \ No newline at end of file +} diff --git a/utilities/test_suite/HOST/Tensor_host_audio.cpp b/utilities/test_suite/HOST/Tensor_host_audio.cpp index 139e7e97e..fe6fa1246 100644 --- a/utilities/test_suite/HOST/Tensor_host_audio.cpp +++ b/utilities/test_suite/HOST/Tensor_host_audio.cpp @@ -197,6 +197,25 @@ int 
main(int argc, char **argv) break; } + case 3: + { + testCaseName = "down_mixing"; + bool normalizeWeights = false; + Rpp32s srcDimsTensor[batchSize * 2]; + + for (int i = 0, j = 0; i < batchSize; i++, j += 2) + { + srcDimsTensor[j] = srcLengthTensor[i]; + srcDimsTensor[j + 1] = channelsTensor[i]; + dstDims[i].height = srcLengthTensor[i]; + dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_down_mixing_host(inputf32, srcDescPtr, outputf32, dstDescPtr, srcDimsTensor, normalizeWeights, handle); + + break; + } default: { missingFuncFlag = 1; @@ -263,4 +282,4 @@ int main(int argc, char **argv) free(inputf32); free(outputf32); return 0; -} +} \ No newline at end of file diff --git a/utilities/test_suite/HOST/Tensor_voxel_host.cpp b/utilities/test_suite/HOST/Tensor_voxel_host.cpp index 15cdbedd3..ebaaaf639 100644 --- a/utilities/test_suite/HOST/Tensor_voxel_host.cpp +++ b/utilities/test_suite/HOST/Tensor_voxel_host.cpp @@ -55,7 +55,10 @@ int main(int argc, char * argv[]) fprintf(stdout, "\nUsage: %s
\n", argv[0]); exit(1); } +<<<<<<< HEAD +======= +>>>>>>> abishek_rpp/develop if(batchSize > MAX_BATCH_SIZE) { std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!"; @@ -252,6 +255,38 @@ int main(int argc, char * argv[]) break; } + case 2: + { + testCaseName = "add_scalar"; + Rpp32f addTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + addTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if(inputBitDepth == 2) + rppt_add_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 3: + { + testCaseName = "subtract_scalar"; + Rpp32f subtractTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + subtractTensor[i] = 40; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_subtract_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 4: { testCaseName = "flip_voxel"; @@ -267,10 +302,28 @@ int main(int argc, char * argv[]) } startWallTime = omp_get_wtime(); - if(inputBitDepth == 0) + if (inputBitDepth == 0) rppt_flip_voxel_host(inputU8, descriptorPtr3D, outputU8, descriptorPtr3D, horizontalTensor, verticalTensor, depthTensor, roiGenericSrcPtr, roiTypeSrc, handle); - else + else if(inputBitDepth == 2) rppt_flip_voxel_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, horizontalTensor, verticalTensor, depthTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 5: + { + testCaseName = "multiply_scalar"; + Rpp32f mulTensor[batchSize]; + + for (int i = 0; i < batchSize; i++) + mulTensor[i] = 80; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 2) + rppt_multiply_scalar_host(inputF32, descriptorPtr3D, outputF32, descriptorPtr3D, mulTensor, roiGenericSrcPtr, roiTypeSrc, handle); + else + missingFuncFlag = 1; break; } diff --git a/utilities/test_suite/HOST/runAudioTests.py b/utilities/test_suite/HOST/runAudioTests.py index c05a7a011..70ec00026 100644 --- a/utilities/test_suite/HOST/runAudioTests.py +++ b/utilities/test_suite/HOST/runAudioTests.py @@ -37,7 +37,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 2 +caseMax = 3 # Checks if the folder path is empty, or is it a root folder, or if it exists, and remove its contents def validate_and_remove_files(path): @@ -235,13 +235,31 @@ def rpp_test_suite_parser_and_validator(): exit(0) for case in caseList: + if "--input_path" not in sys.argv: + if case == "3": + srcPath = scriptPath + "/../TEST_AUDIO_FILES/three_sample_multi_channel_src1" + else: + srcPath = inFilePath + if int(case) < 0 or int(case) > 3: + print(f"Invalid case number {case}. Case number must be 0-3 range!") + continue + run_unit_test(srcPath, case, numRuns, testType, batchSize, outFilePath) else: for case in caseList: + if "--input_path" not in sys.argv: + if case == "3": + srcPath = scriptPath + "/../TEST_AUDIO_FILES/three_sample_multi_channel_src1" + else: + srcPath = inFilePath + if int(case) < 0 or int(case) > 3: + print(f"Invalid case number {case}. 
Case number must be 0-3 range!") + continue + run_performance_test(loggingFolder, srcPath, case, numRuns, testType, batchSize, outFilePath) # print the results of qa tests -supportedCaseList = ['0', '1', '2'] +supportedCaseList = ['0', '1', '2', '3'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if testType == 0: diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index bd938e218..b08c4d5e8 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -28,6 +28,7 @@ import sys import datetime import shutil +import pandas as pd # Set the timestamp timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -37,6 +38,7 @@ inFilePath2 = scriptPath + "/../TEST_IMAGES/three_images_mixed_src2" ricapInFilePath = scriptPath + "/../TEST_IMAGES/three_images_150x150_src1" qaInputFile = scriptPath + "/../TEST_IMAGES/three_images_mixed_src1" +perfQaInputFile = scriptPath + "/../TEST_IMAGES/eight_images_mixed_src1" outFolderPath = os.getcwd() buildFolderPath = os.getcwd() @@ -113,12 +115,14 @@ def get_log_file_list(preserveOutput): # Functionality group finder def func_group_finder(case_number): - if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 31: + if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 31 or case_number == 45: return "color_augmentations" elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84: return "effects_augmentations" elif case_number < 40: return "geometric_augmentations" + elif case_number == 61: + return "arithmetic_operations" elif case_number < 87: return "data_exchange_operations" elif case_number < 88: @@ -126,7 +130,7 @@ def func_group_finder(case_number): else: return "miscellaneous" - # Generate a directory name based on certain parameters +# Generate a directory name based on certain parameters def directory_name_generator(qaMode, affinity, layoutType, case, path): if qaMode == 0: functionality_group = func_group_finder(int(case)) @@ -155,7 +159,11 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo print("--------------------------------") print("Running a New Functionality...") print("--------------------------------") - for bitDepth in range(7): + if qaMode: + maxBitdepth = 1 + else: + maxBitdepth = 7 + for bitDepth in range(maxBitdepth): print("\n\n\nRunning New Bit Depth...\n-------------------------\n\n") for outputFormatToggle in range(2): @@ -183,6 +191,16 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): + if qaMode == 1: + with open("{}/BatchPD_host_{}_raw_performance_log.txt".format(loggingFolder, log_file_layout), "a") as log_file: + process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + log_file_layout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + while True: + output = process.stdout.readline() + if not output and process.poll() is not None: + break + print(output.strip()) + log_file.write(output) + with 
open("{}/Tensor_host_{}_raw_performance_log.txt".format(loggingFolder, log_file_layout), "a") as log_file: print(f"./Tensor_host {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ") process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec @@ -198,8 +216,11 @@ def run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dst print("--------------------------------") print("Running a New Functionality...") print("--------------------------------") - - for bitDepth in range(7): + if qaMode: + maxBitdepth = 1 + else: + maxBitdepth = 7 + for bitDepth in range(maxBitdepth): print("\n\n\nRunning New Bit Depth...\n-------------------------\n\n") for outputFormatToggle in range(2): @@ -223,8 +244,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1") parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2") - parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)") - parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)") + parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:89)") + parser.add_argument("--case_end", type = int, default = 89, help = "Testing range ending case # - (0:89)") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -239,10 +260,11 @@ def rpp_test_suite_parser_and_validator(): validate_path(args.input_path1) validate_path(args.input_path2) validate_path(qaInputFile) + validate_path(perfQaInputFile) # validate the parameters passed by user - if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)): - print("Starting case# and Ending case# must be in the 0:87 range. Aborting!") + if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)): + print("Starting case# and Ending case# must be in the 0:89 range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -256,7 +278,7 @@ def rpp_test_suite_parser_and_validator(): elif args.decoder_type < 0 or args.decoder_type > 1: print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting") exit(0) - elif args.case_list is not None and args.case_start > 0 and args.case_end < 87: + elif args.case_list is not None and args.case_start > 0 and args.case_end < 89: print("Invalid input! 
Please provide only 1 option between case_list, case_start and case_end") exit(0) elif args.num_runs <= 0: @@ -280,8 +302,8 @@ def rpp_test_suite_parser_and_validator(): args.case_list = [str(x) for x in args.case_list] else: for case in args.case_list: - if int(case) < 0 or int(case) > 87: - print("The case# must be in the 0:87 range!") + if int(case) < 0 or int(case) > 89: + print("The case# must be in the 0:89 range!") exit(0) return args @@ -300,10 +322,14 @@ def rpp_test_suite_parser_and_validator(): batchSize = args.batch_size roiList = ['0', '0', '0', '0'] if args.roi is None else args.roi -if qaMode and batchSize != 3: +if qaMode and testType == 0 and batchSize != 3: print("QA mode can only run with a batch size of 3.") exit(0) +if qaMode and testType == 1 and batchSize != 8: + print("Performance QA mode can only run with a batch size of 8.") + exit(0) + # set the output folders and number of runs based on type of test (unit test / performance test) if(testType == 0): if qaMode: @@ -355,8 +381,8 @@ def rpp_test_suite_parser_and_validator(): if qaMode == 1 and case != "82": srcPath1 = inFilePath1 srcPath2 = inFilePath2 - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!") continue for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) @@ -371,19 +397,22 @@ def rpp_test_suite_parser_and_validator(): create_layout_directories(dstPath, layoutDict) else: for case in caseList: - if int(case) < 0 or int(case) > 87: - print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!") + if int(case) < 0 or int(case) > 89: + print(f"Invalid case number {case}. 
Case number must be in the range of 0 to 89!") continue + # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs + if qaMode == 1 and case != "82": + srcPath1 = inFilePath1 + srcPath2 = inFilePath2 if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv: - srcPath1 = ricapInFilePath - srcPath2 = ricapInFilePath + srcPath1 = ricapInFilePath + srcPath2 = ricapInFilePath for layout in range(3): dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath) - run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '54', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87'] +supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89'] nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: @@ -415,7 +444,146 @@ def rpp_test_suite_parser_and_validator(): if testType == 0 and qaMode == 0: create_layout_directories(dstPath, layoutDict) # Performance tests -elif (testType == 1): +elif (testType == 1 and qaMode == 1): + columns = ['BatchPD_Augmentation_Type', 'Tensor_Augmentation_Type', 'Performance Speedup (%)', 'Test_Result'] + tensorAugVariations = [] + batchPDAugVariations = [] + achievedPerf = [] + status = [] + df = pd.DataFrame(columns=columns) + tensorLogFileList = get_log_file_list(preserveOutput) + batchpdLogFileList = [sub.replace("Tensor_host", "BatchPD_host") for sub in tensorLogFileList] # will be needed only in qa mode + + stats = [] + tensorVal = [] + batchpdVal = [] + functions = [] + functionsBatchPD = [] + funcCount = 0 + performanceNoise = 10 + perfQASupportCaseList = ["resize", "color_twist", "phase"] + for i in range(3): + tensorLogFile = tensorLogFileList[i] + batchpdLogFile = batchpdLogFileList[i] + # Opening log file + try: + tensorFile = open(tensorLogFile,"r") + except IOError: + print("Skipping file -> "+ tensorLogFile) + continue + + # Opening log file + try: + batchpdFile = open(batchpdLogFile,"r") + except IOError: + print("Skipping file -> "+ batchpdLogFile) + continue + + prevLine = "" + # Loop over each line + for line in tensorFile: + if "max,min,avg wall times in ms/batch" in line and "u8_Tensor" in prevLine: + layoutCheck = "PKD3_toPKD3" in prevLine or "PLN3_toPLN3" in prevLine or "PLN1_toPLN1" in prevLine + interpolationCheck = "interpolationType" not in prevLine or "interpolationTypeBilinear" in prevLine + if layoutCheck and interpolationCheck: + splitWordStart = "Running " + splitWordEnd = " " + str(numRuns) + prevLine = prevLine.partition(splitWordStart)[2].partition(splitWordEnd)[0] + splitWordStart = "max,min,avg wall times in ms/batch = " + splitWordEnd = "\n" + if prevLine not in functions: + functions.append(prevLine) + stats = line.partition(splitWordStart)[2].partition(splitWordEnd)[0].split(",") + tensorVal.append(float(stats[2])) + funcCount += 1 + + if line != "\n": + prevLine = line + + # Closing log file + tensorFile.close() + + stats = [] + prevLine = "" + for line in batchpdFile: + if "max,min,avg" in line and "u8_BatchPD" in prevLine: + if "PKD3_toPKD3" in prevLine or 
"PLN3_toPLN3" in prevLine or "PLN1_toPLN1" in prevLine: + splitWordStart = "Running " + splitWordEnd = " " + str(numRuns) + prevLine = prevLine.partition(splitWordStart)[2].partition(splitWordEnd)[0] + splitWordStart = "max,min,avg" + splitWordEnd = "\n" + if prevLine not in functionsBatchPD: + functionsBatchPD.append(prevLine) + stats = line.partition(splitWordStart)[2].partition(splitWordEnd)[0].split(",") + batchpdVal.append(float(stats[2]) * float(1000.0)) + + if line != "\n": + prevLine = line + + # Closing log file + batchpdFile.close() + + print("---------------------------------- Results of QA Test - Tensor_host ----------------------------------\n") + qaFilePath = os.path.join(outFilePath, "QA_results.txt") + excelFilePath = os.path.join(outFilePath, "performance_qa_results.xlsx") + f = open(qaFilePath, 'w') + numLines = 0 + numPassed = 0 + removalList = ["_HOST", "_toPKD3", "_toPLN3", "_toPLN1"] + for i in range(len(functions)): + perfImprovement = int(((batchpdVal[i] - tensorVal[i]) / batchpdVal[i]) * 100) + numLines += 1 + funcName = functions[i] + caseName = funcName.split("_u8_")[0] + for string in removalList: + funcName = funcName.replace(string, "") + if caseName not in perfQASupportCaseList: + print("Error! QA mode is not yet available for variant: " + funcName) + continue + achievedPerf.append(perfImprovement) + tensorAugVariations.append(funcName) + if perfImprovement > -performanceNoise: + numPassed += 1 + status.append("PASSED") + print(funcName + ": PASSED") + else: + status.append("FAILED") + print(funcName + ": FAILED") + + resultsInfo = "\n\nFinal Results of Tests:" + resultsInfo += "\n - Total test cases including all subvariants REQUESTED = " + str(numLines) + resultsInfo += "\n - Total test cases including all subvariants PASSED = " + str(numPassed) + f.write(resultsInfo) + batchPDAugVariations = [s.replace('Tensor', 'BatchPD') for s in tensorAugVariations] + df['Tensor_Augmentation_Type'] = tensorAugVariations + df['BatchPD_Augmentation_Type'] = batchPDAugVariations + df['Performance Speedup (%)'] = achievedPerf + df['Test_Result'] = status + # Calculate the number of cases passed and failed + passedCases = df['Test_Result'].eq('PASSED').sum() + failedCases = df['Test_Result'].eq('FAILED').sum() + + summary_row = {'BatchPD_Augmentation_Type': pd.NA, + 'Tensor_Augmentation_Type': pd.NA, + 'Performance Speedup (%)': pd.NA, + 'Test_Result': f'Final Results of Tests: Passed: {passedCases}, Failed: {failedCases}'} + + print("\n", df.to_markdown()) + + # Append the summary row to the DataFrame + # Convert the dictionary to a DataFrame + summary_row = pd.DataFrame([summary_row]) + df = pd.concat([df, summary_row], ignore_index=True) + + df.to_excel(excelFilePath, index=False) + print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") + print("\nIMPORTANT NOTE:") + print("- The following performance comparison shows Performance Speedup percentages between times measured on previous generation RPP-BatchPD APIs against current generation RPP-Tensor APIs.") + print(f"- All APIs have been improved for performance ranging from {0}% (almost same) to {100}% faster.") + print("- Random observations of negative speedups might always occur due to current test machine temperature/load variances or other CPU/GPU state-dependent conditions.") + print("\n-------------------------------------------------------------------\n") +elif (testType == 1 and qaMode == 0): 
log_file_list = get_log_file_list(preserveOutput) functionality_group_list = [ @@ -423,6 +591,8 @@ "data_exchange_operations", "effects_augmentations", "geometric_augmentations", + "arithmetic_operations", + "statistical_operations", ] for log_file in log_file_list: @@ -485,4 +655,4 @@ print("No variants under this category") # Closing log file - f.close() \ No newline at end of file + f.close() diff --git a/utilities/test_suite/HOST/runTests_voxel.py b/utilities/test_suite/HOST/runTests_voxel.py index 97b6eb3d7..93318d8c7 100644 --- a/utilities/test_suite/HOST/runTests_voxel.py +++ b/utilities/test_suite/HOST/runTests_voxel.py @@ -39,7 +39,7 @@ outFolderPath = os.getcwd() buildFolderPath = os.getcwd() caseMin = 0 -caseMax = 4 +caseMax = 5 # Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents def validate_and_remove_contents(path): @@ -185,8 +185,8 @@ def rpp_test_suite_parser_and_validator(): parser = argparse.ArgumentParser() parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header") parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file") - parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing range starting case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") - parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing range ending case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing end case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]") parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)") parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False) parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? 
Output images from tests will be compared with golden outputs - (0 / 1)", required = False) @@ -232,8 +232,8 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print("The case# must be in the 0:1 range!") - exit(0) + print("The case# must be in [" + str(caseMin) + ":" + str(caseMax) + "]") + exit(0) # if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs if args.qa_mode: @@ -321,7 +321,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize) # print the results of qa tests -supportedCaseList = ['0', '1', '4'] +supportedCaseList = ['0', '1', '2', '3', '4', '5'] nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md index cc4c662f0..067bedb1d 100644 --- a/utilities/test_suite/README.md +++ b/utilities/test_suite/README.md @@ -80,7 +80,12 @@ This repository contains three test suites for the AMD ROCm Performance Primitiv sudo make -j$nproc install ``` -## RPP Image Test Suite +* Openpyxl + ``` + pip install openpyxl + ``` + +## RPP Image Test Suite The image test suite can be executed under 2 backend scenarios - (HOST/HIP): - HOST backend - (On a CPU with HOST backend) - HIP backend - (On a GPU with HIP backend) @@ -89,8 +94,8 @@ The image test suite accepts the following command line arguments: - input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1 - input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2 -- case_start: The starting case number for the test range (0-87). Default is 0 -- case_end: The ending case number for the test range (0-87). Default is 87 +- case_start: The starting case number for the test range (0-89). Default is 0 +- case_end: The ending case number for the test range (0-89). Default is 89 - test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0 - case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type - profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend @@ -112,23 +117,27 @@ The test suite can be run with the following command: python runTests.py --input_path1 --input_path2 --case_start --case_end --test_type --profiling ``` -### Modes of operation (RPP Image Test Suite) -- QA mode - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. +## Modes of operation (RPP Image Test Suite) +- QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3. Note: QA mode is not supported for case 84 due to run-to-run variation of outputs. 
``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 1 --batch_size 3 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3 +``` +- QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement +``` python +python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8 --num_runs 100 ``` - Unit test mode - Unit tests allowing users to pass a path to a folder containing images, to execute the desired functionality and variant once, report RPP execution wall time, save and view output images Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 0 +python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0 ``` - Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants. Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1 RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1 ``` python -python runTests.py --case_start 0 --case_end 87 --test_type 1 +python runTests.py --case_start 0 --case_end 89 --test_type 1 ``` To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below diff --git a/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin new file mode 100644 index 000000000..696f0daa5 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/color_temperature/color_temperature_u8_Tensor.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin new file mode 100644 index 000000000..fbf86994b Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/magnitude/magnitude_u8_Tensor.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin deleted file mode 100644 index dacf51e6e..000000000 Binary files a/utilities/test_suite/REFERENCE_OUTPUT/tensor_sum/tensor_sum_u8_Tensor.bin and /dev/null differ diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin new file mode 100644 index 000000000..cb7c8bb84 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin new file mode 100644 index 000000000..628d3785b Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/add_scalar/add_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin 
b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin new file mode 100644 index 000000000..aae9ff96c Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/multiply_scalar/multiply_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin new file mode 100644 index 000000000..9b9328536 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT_VOXEL/subtract_scalar/subtract_scalar_nifti_output.bin differ diff --git a/utilities/test_suite/TEST_AUDIO_FILES/single_sample_multi_channel_src1/sample.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav similarity index 100% rename from utilities/test_suite/TEST_AUDIO_FILES/single_sample_multi_channel_src1/sample.wav rename to utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav new file mode 100644 index 000000000..4847f78cd Binary files /dev/null and b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav differ diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav new file mode 100644 index 000000000..a506e1762 Binary files /dev/null and b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav differ diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1024x768.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1024x768.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1024x768.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1024x768.jpg diff --git a/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg new file mode 100644 index 000000000..8995dbbb6 Binary files /dev/null and b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1280x720.jpg differ diff --git a/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg new file mode 100644 index 000000000..a283d2472 Binary files /dev/null and b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img150x150.jpg differ diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1920x1080.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1920x1080.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img1920x1080.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img1920x1080.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img224x224.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img224x224.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img224x224.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img224x224.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img256x256.jpg 
b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img256x256.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img256x256.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img256x256.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img300x300.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img300x300.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img300x300.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img300x300.jpg diff --git a/utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img3840x2160.jpg b/utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img3840x2160.jpg similarity index 100% rename from utilities/rpp-unittests/TEST_IMAGES/six_images_mixed_src1/img3840x2160.jpg rename to utilities/test_suite/TEST_IMAGES/eight_images_mixed_src1/img3840x2160.jpg diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h index 2ac174042..ec962270a 100644 --- a/utilities/test_suite/rpp_test_suite_audio.h +++ b/utilities/test_suite/rpp_test_suite_audio.h @@ -39,6 +39,7 @@ std::map audioAugmentationMap = {0, "non_silent_region_detection"}, {1, "to_decibels"}, {2, "pre_emphasis_filter"}, + {3, "down_mixing"}, }; // Golden outputs for Non Silent Region Detection @@ -137,7 +138,7 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst // read data from golden outputs Rpp64u oBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride; Rpp32f *refOutput = static_cast(malloc(oBufferSize * sizeof(float))); - string outFile = scriptPath + testCase + "/" + testCase + ".bin"; + string outFile = scriptPath + "/../REFERENCE_OUTPUTS_AUDIO/" + testCase + "/" + testCase + ".bin"; std::fstream fin(outFile, std::ios::in | std::ios::binary); if(fin.is_open()) { diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index c227567b5..58fee0c5d 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -86,8 +86,10 @@ std::map augmentationMap = {37, "crop"}, {38, "crop_mirror_normalize"}, {39, "resize_crop_mirror"}, + {45, "color_temperature"}, {49, "box_filter"}, {54, "gaussian_filter"}, + {61, "magnitude"}, {63, "phase"}, {70, "copy"}, {80, "resize_mirror_normalize"}, @@ -97,7 +99,30 @@ std::map augmentationMap = {84, "spatter"}, {85, "swap_channels"}, {86, "color_to_greyscale"}, - {87, "tensor_sum"} + {87, "tensor_sum"}, + {88, "tensor_min"}, + {89, "tensor_max"}, +}; + +// Golden outputs for Tensor min Kernel +std::map> TensorMinReferenceOutputs = +{ + {1, {1, 1, 7}}, + {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}} +}; + +// Golden outputs for Tensor max Kernel +std::map> TensorMaxReferenceOutputs = +{ + {1, {239, 245, 255}}, + {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}} +}; + +// Golden outputs for Tensor sum Kernel +std::map> TensorSumReferenceOutputs = +{ + {1, {334225, 813471, 2631125}}, + {3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}} }; template @@ -1050,8 +1075,6 @@ inline void compare_output(T* output, string funcName, RpptDescPtr srcDescPtr, R binFile += "_noiseType" + noiseTypeName; } refFile = scriptPath + "/../REFERENCE_OUTPUT/" + funcName + "/"+ binFile + ".bin"; - string line,word; - int index = 0; int fileMatch = 0; Rpp8u *binaryContent = (Rpp8u 
*)malloc(binOutputSize * sizeof(Rpp8u)); @@ -1088,17 +1111,14 @@ inline void compare_output(T* output, string funcName, RpptDescPtr srcDescPtr, R free(binaryContent); } -inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPtr srcDescPtr, int testCase, string dst, string scriptPath) +// compares reduction type functions outputs +template +inline void compare_reduction_output(T* output, string funcName, RpptDescPtr srcDescPtr, int testCase, string dst, string scriptPath) { string func = funcName; - string refFile = ""; - int pln1RefStride = srcDescPtr->n * 4; - Rpp64u binaryOutputSize = srcDescPtr->n * 5; - string dataType[4] = {"_u8_", "_f16_", "_f32_", "_i8_"}; func += dataType[srcDescPtr->dataType]; - std::string binFile = func + "Tensor"; if(srcDescPtr->layout == RpptLayout::NHWC) func += "Tensor_PKD3"; @@ -1110,21 +1130,29 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt func += "Tensor_PLN1"; } - refFile = scriptPath + "/../REFERENCE_OUTPUT/" + funcName + "/"+ binFile + ".bin"; - - string line,word; - int index = 0; int fileMatch = 0; int matched_values = 0; - Rpp64u *binaryContent = (Rpp64u *)malloc(binaryOutputSize * sizeof(Rpp64u)); - read_bin_file(refFile, binaryContent); + + T *refOutput; + refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T)); + int numChannels = (srcDescPtr->c == 1) ? 1 : 3; + int numOutputs = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4; + std::vector ref; + if(testCase == 88) + ref = TensorMinReferenceOutputs[numChannels]; + else if(testCase == 89) + ref = TensorMaxReferenceOutputs[numChannels]; + else if(testCase == 87) + ref = TensorSumReferenceOutputs[numChannels]; + + for (int i = 0; i < numOutputs; i++) + refOutput[i] = (T)ref[i]; if(srcDescPtr->c == 1) { - binaryContent += pln1RefStride; for(int i = 0; i < srcDescPtr->n; i++) { - int diff = output[i] - binaryContent[i]; + int diff = abs(static_cast(output[i] - refOutput[i])); if(diff <= CUTOFF) fileMatch++; } @@ -1136,7 +1164,7 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt matched_values = 0; for(int j = 0; j < 4; j++) { - int diff = output[(i * 4) + j] - binaryContent[(i * 4) + j]; + int diff = abs(static_cast(output[(i * 4) + j] - refOutput[(i * 4) + j])); if(diff <= CUTOFF) matched_values++; } @@ -1144,6 +1172,7 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt fileMatch++; } } + free(refOutput); std::cout << std::endl << "Results for " << func << " :" << std::endl; std::string status = func + ": "; @@ -1166,7 +1195,14 @@ inline void compare_reduction_output(Rpp64u* output, string funcName, RpptDescPt qaResults << status << std::endl; qaResults.close(); } - free(binaryContent); +} + +// print array of any bit depth for specified length +template +inline void print_array(T *src, Rpp32u length, Rpp32u precision) +{ + for (int i = 0; i < length; i++) + std::cout << " " << std::fixed << std::setprecision(precision) << static_cast(src[i]) << " "; } // Used to randomly swap values present in array of size n @@ -1255,4 +1291,4 @@ void inline init_ricap(int width, int height, int batchSize, Rpp32u *permutation roiPtrInputCropRegion[1].xywhROI = {randrange(0, part0Width - 8), randrange(0, height - part0Height), width - part0Width, part0Height}; roiPtrInputCropRegion[2].xywhROI = {randrange(0, width - part0Width - 8), randrange(0, part0Height), part0Width, height - part0Height}; roiPtrInputCropRegion[3].xywhROI = {randrange(0, part0Width - 8), randrange(0, 
part0Height), width - part0Width, height - part0Height}; -} +} \ No newline at end of file diff --git a/utilities/test_suite/rpp_test_suite_voxel.h b/utilities/test_suite/rpp_test_suite_voxel.h index 8ed011ad6..d72a959d7 100644 --- a/utilities/test_suite/rpp_test_suite_voxel.h +++ b/utilities/test_suite/rpp_test_suite_voxel.h @@ -61,7 +61,10 @@ std::map augmentationMap = { {0, "fused_multiply_add_scalar"}, {1, "slice"}, + {2, "add_scalar"}, + {3, "subtract_scalar"}, {4, "flip_voxel"}, + {5, "multiply_scalar"} }; void replicate_last_file_to_fill_batch(const string& lastFilePath, vector& filePathVector, vector& fileNamesVector, const string& lastFileName, int noOfFiles, int batchCount)
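
The performance QA mode added to utilities/test_suite/HOST/runTests.py above (test_type 1 with qa_mode 1) parses the average wall time from each Tensor_host log and its matching BatchPD_host log, then passes a variant when the measured slowdown stays within the 10% noise margin (performanceNoise). The snippet below is a minimal, illustrative sketch of that pass/fail rule only; the function and variable names are not the script's own.

``` python
# Simplified sketch of the speedup check applied by the performance QA mode in
# utilities/test_suite/HOST/runTests.py (names here are illustrative only).
PERFORMANCE_NOISE = 10  # tolerated slowdown, in percent, before a variant is failed

def speedup_status(batchpd_avg_ms: float, tensor_avg_ms: float):
    # Positive speedup means the Tensor API is faster than the legacy BatchPD API
    speedup_percent = int(((batchpd_avg_ms - tensor_avg_ms) / batchpd_avg_ms) * 100)
    return speedup_percent, "PASSED" if speedup_percent > -PERFORMANCE_NOISE else "FAILED"

# Example: BatchPD at 2.0 ms/batch vs Tensor at 1.5 ms/batch -> (25, 'PASSED')
print(speedup_status(2.0, 1.5))
```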