diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 7be3d2fd4..916a0a0ad 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,8 +1,8 @@
-# Documentation files
-docs/* @saadrahim @LisaDelaney
-*.md @saadrahim @LisaDelaney
-*.rst @saadrahim @LisaDelaney
-# Header directory
-library/include/* @saadrahim @LisaDelaney @kiritigowda @rrawther
# Source code
* @kiritigowda @rrawther
+# Documentation files
+docs/* @ROCm/rocm-documentation
+*.md @ROCm/rocm-documentation
+*.rst @ROCm/rocm-documentation
+# Header directory
+library/include/* @ROCm/rocm-documentation @kiritigowda @rrawther
diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy
index 663d3c085..0d7834e2b 100644
--- a/.jenkins/precheckin.groovy
+++ b/.jenkins/precheckin.groovy
@@ -47,7 +47,7 @@ ci: {
def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
propertyList = auxiliary.appendPropertyList(propertyList)
- def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu22:['gfx908'], ubuntu20:['gfx906'], centos8:['gfx908']])]
+ def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['gfx90a'], ubuntu22:['gfx1101'], sles15sp1:['gfx908'], rhel8:['gfx1030'], rhel9:['gfx908']])]
jobNameList = auxiliary.appendJobNameList(jobNameList)
propertyList.each
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34ce6fcac..224125b36 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -346,6 +346,7 @@ install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h
# install Test
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_suite/ DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/utilities/rpp-performancetests DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test)
# set license information
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 0c9b63672..8ecbd3663 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1 +1 @@
-rocm-docs-core[api_reference]==0.33.0
+rocm-docs-core[api_reference]==0.35.0
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index f7bc7e2c1..ea1c7619a 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -112,7 +112,7 @@ requests==2.28.2
# via
# pygithub
# sphinx
-rocm-docs-core[api-reference]==0.33.0
+rocm-docs-core[api-reference]==0.35.0
# via
# -r requirements.in
# rocm-docs-core
diff --git a/include/rppdefs.h b/include/rppdefs.h
index 2beafbc0c..b0baf7d34 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -116,8 +116,8 @@ typedef enum
RPP_ERROR_NOT_ENOUGH_MEMORY = -16,
/*! \brief Out of bound source ROI \ingroup group_rppdefs */
RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17,
- /*! \brief src and dst layout mismatch \ingroup group_rppdefs */
- RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18
+ /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_INVALID_CHANNELS = -18
} RppStatus;
/*! \brief RPP rppStatus_t type enums
diff --git a/include/rppi_arithmetic_operations.h b/include/rppi_arithmetic_operations.h
index 0fb79dbf6..17aef722d 100644
--- a/include/rppi_arithmetic_operations.h
+++ b/include/rppi_arithmetic_operations.h
@@ -320,4 +320,4 @@ RppStatus rppi_tensor_multiply_u8_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RppPtr
}
#endif
-#endif
\ No newline at end of file
+#endif
diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index 0a247f886..51705eefc 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -30,7 +30,7 @@ SOFTWARE.
* \brief RPPT Tensor Arithmetic operation Functions.
*
* \defgroup group_tensor_arithmetic Operations: AMD RPP Tensor Arithmetic Operations
- * \brief Tensor Color Augmentations.
+ * \brief Tensor Arithmetic Operations.
*/
#include "rpp.h"
@@ -39,53 +39,221 @@ SOFTWARE.
extern "C" {
#endif
-/*! \brief Fmadd augmentation HOST
+/*!
+ * \file
+ * \brief RPPT Tensor Operations - Arithmetic Operations.
+ * \defgroup group_rppt_tensor_arithmetic_operations RPPT Tensor Operations - Arithmetic Operations.
+ * \brief RPPT Tensor Operations - Arithmetic Operations.
+ */
+
+/*! \addtogroup group_rppt_tensor_arithmetic_operations
+ * @{
+ */
+
+/*! \brief Fused multiply add scalar augmentation on HOST backend
* \details This function performs the fmadd operation on a batch of 4D tensors.
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \param [in] srcPtr source tensor memory
+ * \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor memory
+ * \param[out] dstPtr destination tensor in HOST memory
* \param[in] dstGenericDescPtr destination tensor descriptor
* \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
* \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
- * \param [in] rppHandle Host-handle
- * \return RppStatus enum.
- * \returns RPP_SUCCESS \ref RppStatus on successful completion.
- * Else return RPP_ERROR
- * \ingroup group_tensor_arithmetic
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
-
-/*! \brief Fmadd augmentation GPU
+#ifdef GPU_SUPPORT
+/*! \brief Fused multiply add scalar augmentation on HIP backend
* \details This function performs the fmadd operation on a batch of 4D tensors.
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \param [in] srcPtr source tensor memory
+ * \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor memory
+ * \param[out] dstPtr destination tensor in HIP memory
* \param[in] dstGenericDescPtr destination tensor descriptor
* \param[in] mulTensor mul values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] addTensor add values for fmadd calculation (1D tensor of batchSize Rpp32f values)
* \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
* \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
- * \param [in] rppHandle Hip-handle
- * \return RppStatus enum.
- * \returns RPP_SUCCESS \ref RppStatus on successful completion.
- * Else return RPP_ERROR
- * \ingroup group_tensor_arithmetic
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
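+
+// Illustrative usage (editor's sketch, not part of this patch): calling the fmadd HOST
+// API on an f32 batch, assuming srcPtr/dstPtr, the generic descriptors, the ROI array,
+// and rppHandle have been prepared as described above.
+//
+//     Rpp32f mulTensor[BATCH_SIZE], addTensor[BATCH_SIZE];   // one scalar pair per image
+//     RppStatus status = rppt_fused_multiply_add_scalar_host(srcPtr, srcGenericDescPtr,
+//                                                            dstPtr, dstGenericDescPtr,
+//                                                            mulTensor, addTensor, roiGenericPtrSrc,
+//                                                            RpptRoi3DType::XYZWHD, rppHandle);
+//     // for image n, each output element is src * mulTensor[n] + addTensor[n]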
+
+/*! \brief Add scalar augmentation on HOST backend
+ * \details This function performs the addition operation on a batch of 4D tensors.
+ * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
*/
+RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
-RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+/*! \brief Add scalar augmentation on HIP backend
+ * \details This function performs the addition operation on a batch of 4D tensors.
+ * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HIP memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] addTensor add values used for addition (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *addTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Subtract scalar augmentation on HOST backend
+ * \details This function performs the subtraction operation on a batch of 4D tensors.
+ * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Subtract scalar augmentation on HIP backend
+ * \details This function performs the subtraction operation on a batch of 4D tensors.
+ * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HIP memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] subtractTensor subtract values used for subtraction (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Multiply scalar augmentation on HOST backend
+ * \details This function performs the multiplication operation on a batch of 4D tensors.
+ * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] mulTensor multiplier values used for multiplication (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Multiply scalar augmentation on HIP backend
+ * \details This function performs the multiplication operation on a batch of 4D tensors.
+ * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor.
+ * Support added for f32 -> f32 datatype.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param[in] srcGenericDescPtr source tensor descriptor
+ * \param[out] dstPtr destination tensor in HIP memory
+ * \param[in] dstGenericDescPtr destination tensor descriptor
+ * \param[in] mulTensor multiplier values used for multiplication (1D tensor of batchSize Rpp32f values)
+ * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
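+
+// Illustrative usage (editor's sketch, not part of this patch): the add, subtract, and
+// multiply scalar APIs above share one call shape; with tensors, descriptors, ROIs, and
+// the handle prepared as for fmadd, an add-scalar call looks like:
+//
+//     Rpp32f addTensor[BATCH_SIZE];   // one scalar per image in the batch
+//     RppStatus status = rppt_add_scalar_host(srcPtr, srcGenericDescPtr, dstPtr,
+//                                             dstGenericDescPtr, addTensor, roiGenericPtrSrc,
+//                                             RpptRoi3DType::XYZWHD, rppHandle);
+//     // rppt_subtract_scalar_* and rppt_multiply_scalar_* differ only in the per-image
+//     // scalar tensor they consume
+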
+/*! \brief Magnitude computation on HOST backend for a NCHW/NHWC layout tensor
+ * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input1
+ * \image html img150x150_2.jpg Sample Input2
+ * \image html magnitude_operation_img150x150.jpg Sample Output
+ * \param [in] srcPtr1 source1 tensor in HOST memory
+ * \param [in] srcPtr2 source2 tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Magnitude computation on HIP backend for a NCHW/NHWC layout tensor
+ * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input1
+ * \image html img150x150_2.jpg Sample Input2
+ * \image html magnitude_operation_img150x150.jpg Sample Output
+ * \param [in] srcPtr1 source1 tensor in HIP memory
+ * \param [in] srcPtr2 source2 tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
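+
+// Editor's note (sketch, not from this patch): magnitude here reads as the usual
+// element-wise sqrt(src1^2 + src2^2), computed per pixel pair and saturated back to the
+// source depth; e.g. for u8 inputs 3 and 4:
+//
+//     float out = sqrtf((3.0f * 3.0f) + (4.0f * 4.0f));   // = 5.0f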
+
+/*! @}
+ */
+
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
+#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
\ No newline at end of file
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 138b3baa8..31bb34eff 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -95,7 +95,22 @@ RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
*/
RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, Rpp32f *coeffTensor, RpptAudioBorderType borderType, rppHandle_t rppHandle);
+/*! \brief Down Mixing augmentation on HOST backend
+* \details Down Mixing augmentation for audio data
+* \param[in] srcPtr source tensor in HOST memory
+* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param[out] dstPtr destination tensor in HOST memory
+* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
+* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed
+* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+* \return A \ref RppStatus enumeration.
+* \retval RPP_SUCCESS Successful completion.
+* \retval RPP_ERROR* Unsuccessful completion.
+*/
+RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle);
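+
+// Illustrative layout (editor's sketch with hypothetical values): srcDimsTensor packs a
+// (samples, channels) pair per batch element, so for a batch of two clips:
+//
+//     Rpp32s srcDimsTensor[4] = {44100, 2,    // clip 0: 44100 samples, stereo
+//                                22050, 1};   // clip 1: 22050 samples, mono
+//     RppStatus status = rppt_down_mixing_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
+//                                              srcDimsTensor, false /*normalizeWeights*/, rppHandle);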
+
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H
diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h
index deabd885d..99909cb42 100644
--- a/include/rppt_tensor_color_augmentations.h
+++ b/include/rppt_tensor_color_augmentations.h
@@ -417,6 +417,48 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RppPtr_t lutPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Color Temperature augmentation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The color temperature augmentation does an image temperature adjustment operation, taking a pixel adjustment value as an argument for each image in a batch of RGB(3 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input
+ * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The color temperature augmentation does an image temperature adjustment operation, taking a pixel adjustment value as an argument for each image in a batch of RGB(3 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input
+ * \image html color_augmentations_color_temperature_img150x150.jpg Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
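+
+// Editor's note (inferred from the compute_color_temperature_* helpers added to
+// rpp_cpu_common.hpp in this same change): the adjustment warms or cools an image by
+// adding the value to the R channel and subtracting it from the B channel, leaving G as is:
+//
+//     R' = R + adjustment;
+//     G' = G;
+//     B' = B - adjustment;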
+
/*! @}
*/
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 181b1c565..3cb49a82b 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -24,6 +24,7 @@ SOFTWARE.
#ifndef RPPT_TENSOR_STATISTICAL_OPERATIONS_H
#define RPPT_TENSOR_STATISTICAL_OPERATIONS_H
+
#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
@@ -77,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - minArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] minArr destination array in HOST memory
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - minArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] minArr destination array in HIP memory
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, Rpp32u imageMinArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - maxArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] maxArr destination array in HOST memory
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - maxArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] maxArr destination array in HIP memory
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
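+
+// Editor's note (layout inferred from the length restrictions above): a 3-channel image
+// contributes four results, so for image n the destination array is expected to hold
+//
+//     minArr[n * 4 + 0..3] = { Rmin, Gmin, Bmin, overallMin };   // analogous for maxArr
+//
+// while 1-channel inputs write a single overall value per image at index n.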
+
/*! @}
*/
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 67c34de70..1e748cc86 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -2431,6 +2431,24 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP
// Compute Functions for RPP Tensor API
+inline void compute_multiply_16_host(__m256 *p, __m256 *pMulParam)
+{
+ p[0] = _mm256_mul_ps(p[0], pMulParam[0]); // multiply adjustment
+ p[1] = _mm256_mul_ps(p[1], pMulParam[0]); // multiply adjustment
+}
+
+inline void compute_subtract_16_host(__m256 *p, __m256 *pSubtractParam)
+{
+ p[0] = _mm256_sub_ps(p[0], pSubtractParam[0]); // subtract adjustment
+ p[1] = _mm256_sub_ps(p[1], pSubtractParam[0]); // subtract adjustment
+}
+
+inline void compute_add_16_host(__m256 *p, __m256 *pAddParam)
+{
+ p[0] = _mm256_add_ps(p[0], pAddParam[0]); // add adjustment
+ p[1] = _mm256_add_ps(p[1], pAddParam[0]); // add adjustment
+}
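+
+// Editor's sketch: each *_16_host helper above treats 16 packed floats as two __m256
+// registers and applies one broadcast parameter to both halves, e.g.
+//
+//     __m256 p[2] = {_mm256_set1_ps(3.0f), _mm256_set1_ps(5.0f)};
+//     __m256 pAdd[1] = {_mm256_set1_ps(1.0f)};
+//     compute_add_16_host(p, pAdd);   // p[0] = eight 4.0f, p[1] = eight 6.0f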
+
inline void compute_rmn_24_host(__m256 *p, __m256 *pRMNParams)
{
p[0] = _mm256_mul_ps(_mm256_sub_ps(p[0], pRMNParams[0]), pRMNParams[1]);
@@ -3032,6 +3050,22 @@ inline void compute_color_cast_12_host(__m128 *p, __m128 pMul, __m128 *pAdd)
p[2] = _mm_fmadd_ps(_mm_sub_ps(p[2], pAdd[2]), pMul, pAdd[2]); // color_cast adjustment Rs
}
+inline void compute_color_temperature_48_host(__m256 *p, __m256 pAdj)
+{
+ p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs
+ p[1] = _mm256_add_ps(p[1], pAdj); // color_temperature adjustment Rs
+ // no color_temperature adjustment Gs
+ p[4] = _mm256_sub_ps(p[4], pAdj); // color_temperature adjustment Bs
+ p[5] = _mm256_sub_ps(p[5], pAdj); // color_temperature adjustment Bs
+}
+
+inline void compute_color_temperature_24_host(__m256 *p, __m256 pAdj)
+{
+ p[0] = _mm256_add_ps(p[0], pAdj); // color_temperature adjustment Rs
+ // no color_temperature adjustment Gs
+ p[2] = _mm256_sub_ps(p[2], pAdj); // color_temperature adjustment Bs
+}
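+
+// Editor's sketch: in the planar 24-value variant, p[0]/p[1]/p[2] hold 8 R/G/B values
+// respectively, so a +10 adjustment behaves as:
+//
+//     __m256 p[3] = {pR, pG, pB};                                // assumed pre-loaded planes
+//     compute_color_temperature_24_host(p, _mm256_set1_ps(10.0f));
+//     // p[0] = pR + 10, p[1] = pG (unchanged), p[2] = pB - 10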
+
inline void compute_xywh_from_ltrb_host(RpptROIPtr roiPtrInput, RpptROIPtr roiPtrImage)
{
roiPtrImage->xywhROI.xy.x = roiPtrInput->ltrbROI.lt.x;
@@ -5962,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2
pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4
}
-#endif //RPP_CPU_COMMON_H
\ No newline at end of file
+inline void reduce_min_32_host(__m256i *pMin, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px[0] = _mm256_castsi256_si128(pMin[0]);
+ px[1] = _mm256_extracti128_si256(pMin[0], 1);
+ px[0] = _mm_min_epu8(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi8(zero, px[0]);
+ px[0] = _mm_unpackhi_epi8(zero, px[0]);
+ px[0] = _mm_min_epu8(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi16(zero, px[0]);
+ px[0] = _mm_unpackhi_epi16(zero, px[0]);
+ px[0] = _mm_min_epu16(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi32(zero, px[0]);
+ px[0] = _mm_unpackhi_epi32(zero, px[0]);
+ px[0] = _mm_min_epu32(px[0], px[1]);
+ result[0] = _mm_shuffle_epi8(px[0], mask);
+}
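+
+// Editor's note: the unpack-with-zero ladder above progressively halves the number of
+// live u8 candidates; its scalar equivalent is
+//
+//     // Rpp8u overallMin = 255;
+//     // for (int i = 0; i < 32; i++) overallMin = std::min<Rpp8u>(overallMin, src[i]);
+//
+// with the final _mm_shuffle_epi8 repositioning the surviving minimum within result[0].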
+
+inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB)
+{
+ pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min
+ pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min
+ pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min
+}
+
+inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result)
+{
+ __m128i px[4];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1));
+ px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB)
+{
+ pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min
+ pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min
+ pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min
+}
+
+inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void reduce_max_32_host(__m256i *pMax, __m128i *result)
+{
+ __m128i px;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1));
+ px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px));
+ px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px));
+ px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px));
+ result[0] = _mm_shuffle_epi8(px, mask);
+}
+
+inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB)
+{
+ pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max
+ pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max
+ pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max
+}
+
+inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result)
+{
+ __m128i px[4];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1));
+ px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1));
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB)
+{
+ pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max
+ pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max
+ pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max
+}
+
+inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+    px[1] = _mm_max_epu8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0]));
+    px[0] = _mm_max_epu8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero));
+    px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+    px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+    result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_float8_host(__m256 *p1, __m256 *pMin)
+{
+ pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min
+}
+
+inline void reduce_min_float8_host(__m256 *pMin, __m128 *result)
+{
+ __m128 px;
+ px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1));
+ px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px));
+ result[0] = _mm_shuffle_ps(px, px, 39);
+}
+
+inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB)
+{
+ pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min
+ pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min
+ pMinB[0] = _mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min
+}
+
+inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // TO CHANGE
+{
+ __m128 px[2];
+ px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1));
+ px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1));
+ px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1]));
+ px[0] = _mm_permute_ps(px[0], 0b11011000);
+ result[0] = _mm256_castps128_ps256(px[0]);
+ px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1));
+ px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0));
+ px[0] = _mm_shuffle_ps(px[1], px[1], 34);
+ result[0] = _mm256_insertf128_ps(result[0], px[0], 1);
+}
+
+inline void compute_max_float8_host(__m256 *p1, __m256 *pMax)
+{
+    pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global max
+}
+
+inline void reduce_max_float8_host(__m256 *pMax, __m128 *result)
+{
+ __m128 px;
+ px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1));
+ px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px));
+ result[0] = _mm_shuffle_ps(px, px, 39);
+}
+
+inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB)
+{
+    pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global max
+    pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global max
+    pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global max
+}
+
+inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result)
+{
+ __m128 px[2];
+ px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1));
+ px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1));
+ px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1]));
+ px[0] = _mm_permute_ps(px[0], 0b11011000);
+ result[0] = _mm256_castps128_ps256(px[0]);
+ px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1));
+ px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0));
+ px[0] = _mm_shuffle_ps(px[1], px[1], 34);
+ result[0] = _mm256_insertf128_ps(result[0], px[0], 1);
+}
+
+inline void reduce_min_i32_host(__m256i *pMin, __m128i *result)
+{
+ __m128i px;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1));
+ px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px));
+ px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px));
+ px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px));
+ result[0] = _mm_shuffle_epi8(px, mask);
+}
+
+inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB)
+{
+ pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min
+ pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min
+ pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min
+}
+
+inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result)
+{
+ __m128i px[4];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1));
+ px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB)
+{
+ pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min
+ pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min
+ pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min
+}
+
+inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void reduce_max_i32_host(__m256i *pMax, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi8(zero, px[0]), _mm_unpackhi_epi8(zero, px[0]));
+ px[0] = _mm_max_epi16(_mm_unpacklo_epi16(zero, px[0]), _mm_unpackhi_epi16(zero, px[0]));
+ px[0] = _mm_max_epi32(_mm_unpacklo_epi32(zero, px[0]), _mm_unpackhi_epi32(zero, px[0]));
+ result[0] = _mm_shuffle_epi8(px[0], mask);
+}
+
+inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB)
+{
+ pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max
+ pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max
+ pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max
+}
+
+inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result)
+{
+ __m128i px[4];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1));
+ px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB)
+{
+ pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max
+ pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max
+ pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max
+}
+
+inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result)
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+#endif //RPP_CPU_COMMON_H
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index 84c898b90..d03ec0e79 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -75,7 +75,7 @@ typedef union
#define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name)
-const __m128 xmm_p0 = _mm_set1_ps(0.0f);
+const __m128 xmm_p0 = _mm_setzero_ps();
const __m128 xmm_p1 = _mm_set1_ps(1.0f);
const __m128 xmm_p2 = _mm_set1_ps(2.0f);
const __m128 xmm_pm2 = _mm_set1_ps(-2.0f);
@@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray)
printf("\n");
for (int ct = 0; ct < 32; ct++)
{
- printf("%d ", printArray[ct]);
+ printf("%d ", (unsigned char)printArray[ct]);
}
}
@@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p)
p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */
}
+inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p)
+{
+ p[0] = _mm256_loadu_si256((__m256i *)srcPtrR);
+ p[1] = _mm256_loadu_si256((__m256i *)srcPtrG);
+ p[2] = _mm256_loadu_si256((__m256i *)srcPtrB);
+}
+
+inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p)
+{
+ p[0] = _mm256_load_si256((__m256i *)srcPtrR);
+ p[1] = _mm256_load_si256((__m256i *)srcPtrG);
+ p[2] = _mm256_load_si256((__m256i *)srcPtrB);
+}
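+
+// Editor's note: rpp_load96_u8_avx above uses unaligned loads (_mm256_loadu_si256),
+// while this i8 variant uses _mm256_load_si256, which requires the source pointers to
+// be 32-byte aligned; callers on the i8 path must guarantee that alignment.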
+
inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p)
{
__m128 p128[8];
@@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p)
_mm256_storeu_pd(dstPtr, p[0]);
}
+inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p)
+{
+ _mm_storeu_si128((__m128i *)dstPtr, p[0]);
+}
+
+inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p)
+{
+ _mm_store_si128((__m128i *)dstPtr, p[0]);
+}
+
inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p)
{
__m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
@@ -2438,6 +2462,29 @@ static inline __m128 log_ps(__m128 x)
return x;
}
+inline Rpp32f rpp_hsum_ps(__m128 x)
+{
+ __m128 shuf = _mm_movehdup_ps(x); // broadcast elements 3,1 to 2,0
+ __m128 sums = _mm_add_ps(x, shuf);
+ shuf = _mm_movehl_ps(shuf, sums); // high half -> low half
+ sums = _mm_add_ss(sums, shuf);
+ return _mm_cvtss_f32(sums);
+}
+
+inline Rpp32f rpp_hsum_ps(__m256 x)
+{
+ __m128 p0 = _mm256_extractf128_ps(x, 1); // Contains x7, x6, x5, x4
+ __m128 p1 = _mm256_castps256_ps128(x); // Contains x3, x2, x1, x0
+ __m128 sum = _mm_add_ps(p0, p1); // Contains x3 + x7, x2 + x6, x1 + x5, x0 + x4
+ p0 = sum; // Contains -, -, x1 + x5, x0 + x4
+ p1 = _mm_movehl_ps(sum, sum); // Contains -, -, x3 + x7, x2 + x6
+ sum = _mm_add_ps(p0, p1); // Contains -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6
+ p0 = sum; // Contains -, -, -, x0 + x2 + x4 + x6
+ p1 = _mm_shuffle_ps(sum, sum, 0x1); // Contains -, -, -, x1 + x3 + x5 + x7
+ sum = _mm_add_ss(p0, p1); // Contains -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7
+ return _mm_cvtss_f32(sum);
+}
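+
+// Quick check (editor's sketch): both overloads horizontally reduce a register to the
+// scalar sum of its lanes.
+//
+//     __m256 v = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+//     Rpp32f s = rpp_hsum_ps(v);   // 36.0f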
+
static inline void fast_matmul4x4_sse(float *A, float *B, float *C)
{
__m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index a7412aa2d..d9c0ce02d 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k
}
}
+// Retrieve Min and Max given a datatype
+
+inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); }
+
/******************** DEVICE FUNCTIONS ********************/
// -------------------- Set 0 - Range checks and Range adjustment --------------------
@@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr,
// /******************** DEVICE MATH HELPER FUNCTIONS ********************/
+// float8 min
+
+__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr)
+{
+ *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]);
+}
+
+// float8 max
+
+__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr)
+{
+ *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]);
+}
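+
+// Illustrative: for a d_float8 holding {3, 1, 4, 1, 5, 9, 2, 6},
+// rpp_hip_math_min8 writes 1 and rpp_hip_math_max8 writes 9 to *dstPtr.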
+
// d_float16 floor
__device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16)
diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
index 96553489d..b98145be0 100644
--- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp
+++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
@@ -26,5 +26,9 @@ SOFTWARE.
#define HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
#include "kernel/fused_multiply_add_scalar.hpp"
+#include "kernel/add_scalar.hpp"
+#include "kernel/subtract_scalar.hpp"
+#include "kernel/multiply_scalar.hpp"
+#include "kernel/magnitude.hpp"
-#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
\ No newline at end of file
+#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp
index 7737b38c3..e2edb1afc 100644
--- a/src/modules/cpu/host_tensor_audio_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp
@@ -28,5 +28,6 @@ SOFTWARE.
#include "kernel/non_silent_region_detection.hpp"
#include "kernel/to_decibels.hpp"
#include "kernel/pre_emphasis_filter.hpp"
+#include "kernel/down_mixing.hpp"
#endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/host_tensor_color_augmentations.hpp b/src/modules/cpu/host_tensor_color_augmentations.hpp
index 19e0b471c..aba3b8158 100644
--- a/src/modules/cpu/host_tensor_color_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_color_augmentations.hpp
@@ -34,5 +34,6 @@ SOFTWARE.
#include "kernel/exposure.hpp"
#include "kernel/contrast.hpp"
#include "kernel/lut.hpp"
+#include "kernel/color_temperature.hpp"
#endif // HOST_TENSOR_COLOR_AUGMENTATIONS_HPP
diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp
index dae3e6236..32b8b62b5 100644
--- a/src/modules/cpu/host_tensor_statistical_operations.hpp
+++ b/src/modules/cpu/host_tensor_statistical_operations.hpp
@@ -26,5 +26,7 @@ SOFTWARE.
#define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
#include "kernel/tensor_sum.hpp"
+#include "kernel/tensor_min.hpp"
+#include "kernel/tensor_max.hpp"
#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/add_scalar.hpp b/src/modules/cpu/kernel/add_scalar.hpp
new file mode 100644
index 000000000..d0179d4e1
--- /dev/null
+++ b/src/modules/cpu/kernel/add_scalar.hpp
@@ -0,0 +1,152 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus add_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI3D roiDefault;
+ if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
+ else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
+ Rpp32u numThreads = handle.GetNumThreads();
+
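+    // Disable dynamic team sizing so OpenMP honors the requested thread count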
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ RpptROI3D roi;
+ RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
+ compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ Rpp32f addParam = addTensor[batchCount];
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u vectorIncrement = 16;
+ Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+ __m256 pAddParam = _mm256_set1_ps(addParam);
+
+ // Add without fused output-layout toggle (NCDHW -> NCDHW)
+ if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_add_16_host(p, &pAddParam); // add adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ + addParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+            dstPtrChannel += dstGenericDescPtr->strides[1];
+ }
+ }
+ // Add without fused output-layout toggle (NDHWC -> NDHWC)
+ else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ Rpp32f *srcPtrDepth = srcPtrChannel;
+ Rpp32f *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_add_16_host(p, &pAddParam); // add adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ + addParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp
new file mode 100644
index 000000000..1358ac800
--- /dev/null
+++ b/src/modules/cpu/kernel/color_temperature.hpp
@@ -0,0 +1,1035 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp8s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(dstDescPtr->n)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f adjustmentValue = adjustmentValueTensor[batchCount];
+
+ Rpp8u *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ __m256 pAdj = _mm256_set1_ps(adjustmentValue);
+
+ Rpp8u *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
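+        // The adjustment warms or cools the image: R += adjustmentValue, G unchanged, B -= adjustmentValue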
+ // Color Temperature with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8u *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 48)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTemp += 48;
+ dstPtrTempR += 16;
+ dstPtrTempG += 16;
+ dstPtrTempB += 16;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue);
+ *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]);
+ *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTempR += 16;
+ srcPtrTempG += 16;
+ srcPtrTempB += 16;
+ dstPtrTemp += 48;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue);
+ dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG);
+ dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue);
+
+ dstPtrTemp += 3;
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8u *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 48)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTemp += 48;
+ dstPtrTemp += 48;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ dstPtrTemp[0] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[0] + adjustmentValue);
+ dstPtrTemp[1] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[1]);
+ dstPtrTemp[2] = (Rpp8u) RPPPIXELCHECK(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ dstPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTempR += 16;
+ srcPtrTempG += 16;
+ srcPtrTempB += 16;
+ dstPtrTempR += 16;
+ dstPtrTempG += 16;
+ dstPtrTempB += 16;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempR + adjustmentValue);
+ *dstPtrTempG++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempG);
+ *dstPtrTempB++ = (Rpp8u) RPPPIXELCHECK(*srcPtrTempB - adjustmentValue);
+
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp8s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(dstDescPtr->n)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255;
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ __m256 pAdj = _mm256_set1_ps(adjustmentValue);
+
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp32f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 24)
+ {
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTemp += 24;
+ dstPtrTempR += 8;
+ dstPtrTempG += 8;
+ dstPtrTempB += 8;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue);
+ *dstPtrTempG++ = RPPPIXELCHECKF32(srcPtrTemp[1]);
+ *dstPtrTempB++ = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTempR += 8;
+ srcPtrTempG += 8;
+ srcPtrTempB += 8;
+ dstPtrTemp += 24;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue);
+ dstPtrTemp[1] = RPPPIXELCHECKF32(*srcPtrTempG);
+ dstPtrTemp[2] = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue);
+
+ dstPtrTemp += 3;
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 24)
+ {
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTemp += 24;
+ dstPtrTemp += 24;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue);
+ dstPtrTemp[1] = RPPPIXELCHECKF32(srcPtrTemp[1]);
+ dstPtrTemp[2] = RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ dstPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTempR += 8;
+ srcPtrTempG += 8;
+ srcPtrTempB += 8;
+ dstPtrTempR += 8;
+ dstPtrTempG += 8;
+ dstPtrTempB += 8;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue);
+ *dstPtrTempG++ = RPPPIXELCHECKF32(*srcPtrTempG);
+ *dstPtrTempB++ = RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue);
+
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp8s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(dstDescPtr->n)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f adjustmentValue = adjustmentValueTensor[batchCount] * ONE_OVER_255;
+
+ Rpp16f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ __m256 pAdj = _mm256_set1_ps(adjustmentValue);
+
+ Rpp16f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+ Rpp32u vectorIncrement = 24;
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+
+ Rpp16f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8];
+
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores
+
+ for(int cnt = 0; cnt < 8; cnt++)
+ {
+ dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt];
+ dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt];
+ dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt];
+ }
+
+ srcPtrTemp += 24;
+ dstPtrTempR += 8;
+ dstPtrTempG += 8;
+ dstPtrTempB += 8;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue);
+ *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]);
+ *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ Rpp32f dstPtrTemp_ps[25];
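+                    // one spare element: the pkd3 SIMD store below may write one float past the 24 packed outputs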
+
+ for(int cnt = 0; cnt < 8; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores
+
+ for(int cnt = 0; cnt < 24; cnt++)
+ dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+
+ srcPtrTempR += 8;
+ srcPtrTempG += 8;
+ srcPtrTempB += 8;
+ dstPtrTemp += 24;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue);
+ dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG);
+ dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue);
+
+ dstPtrTemp += 3;
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp16f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 24)
+ {
+ Rpp32f srcPtrTemp_ps[24], dstPtrTemp_ps[25];
+
+ for(int cnt = 0; cnt < 24; cnt++)
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores
+
+ for(int cnt = 0; cnt < 24; cnt++)
+ dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+
+ srcPtrTemp += 24;
+ dstPtrTemp += 24;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ dstPtrTemp[0] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[0] + adjustmentValue);
+ dstPtrTemp[1] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[1]);
+ dstPtrTemp[2] = (Rpp16f) RPPPIXELCHECKF32(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ dstPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8];
+
+ for(int cnt = 0; cnt < 8; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+
+ __m256 p[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads
+ compute_color_temperature_24_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores
+
+ for(int cnt = 0; cnt < 8; cnt++)
+ {
+ dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt];
+ dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt];
+ dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt];
+ }
+
+ srcPtrTempR += 8;
+ srcPtrTempG += 8;
+ srcPtrTempB += 8;
+ dstPtrTempR += 8;
+ dstPtrTempG += 8;
+ dstPtrTempB += 8;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempR + adjustmentValue);
+ *dstPtrTempG++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempG);
+ *dstPtrTempB++ = (Rpp16f) RPPPIXELCHECKF32(*srcPtrTempB - adjustmentValue);
+
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp8s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(dstDescPtr->n)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f adjustmentValue = adjustmentValueTensor[batchCount];
+
+ Rpp8s *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ __m256 pAdj = _mm256_set1_ps(adjustmentValue);
+
+ Rpp8s *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8s *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 48)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTemp += 48;
+ dstPtrTempR += 16;
+ dstPtrTempG += 16;
+ dstPtrTempB += 16;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue);
+ *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]);
+ *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTempR += 16;
+ srcPtrTempG += 16;
+ srcPtrTempB += 16;
+ dstPtrTemp += 48;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue);
+ dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG);
+ dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue);
+
+ dstPtrTemp += 3;
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8s *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 48)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores
+
+ srcPtrTemp += 48;
+ dstPtrTemp += 48;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ dstPtrTemp[0] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[0] + adjustmentValue);
+ dstPtrTemp[1] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[1]);
+ dstPtrTemp[2] = (Rpp8s) RPPPIXELCHECKI8(srcPtrTemp[2] - adjustmentValue);
+
+ srcPtrTemp += 3;
+ dstPtrTemp += 3;
+ }
+
+ srcPtrRow += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Color Temperature with fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
+ compute_color_temperature_48_host(p, pAdj); // color_temperature adjustment
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
+
+ srcPtrTempR += 16;
+ srcPtrTempG += 16;
+ srcPtrTempB += 16;
+ dstPtrTempR += 16;
+ dstPtrTempG += 16;
+ dstPtrTempB += 16;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempR + adjustmentValue);
+ *dstPtrTempG++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempG);
+ *dstPtrTempB++ = (Rpp8s) RPPPIXELCHECKI8(*srcPtrTempB - adjustmentValue);
+
+ srcPtrTempR++;
+ srcPtrTempG++;
+ srcPtrTempB++;
+ }
+
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/down_mixing.hpp b/src/modules/cpu/kernel/down_mixing.hpp
new file mode 100644
index 000000000..9cefc64a2
--- /dev/null
+++ b/src/modules/cpu/kernel/down_mixing.hpp
@@ -0,0 +1,122 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include <omp.h>
+
+RppStatus down_mixing_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcDimsTensor,
+ bool normalizeWeights,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32s samples = srcDimsTensor[batchCount * 2];
+ Rpp32s channels = srcDimsTensor[batchCount * 2 + 1];
+        bool flagAVX = false;
+
+ if(channels == 1)
+ {
+            // No downmixing needed; copy the single channel straight through
+ memcpy(dstPtrTemp, srcPtrTemp, (size_t)(samples * sizeof(Rpp32f)));
+ }
+ else
+ {
+ Rpp32f *weights = handle.GetInitHandle()->mem.mcpu.tempFloatmem + batchCount * channels;
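+            // Start from uniform weights (a plain average across channels)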
+ std::fill(weights, weights + channels, 1.f / channels);
+
+ if(normalizeWeights)
+ {
+ // Compute sum of the weights
+ Rpp32f sum = 0.0;
+ for(int i = 0; i < channels; i++)
+ sum += weights[i];
+
+ // Normalize the weights
+ Rpp32f invSum = 1.0 / sum;
+ for(int i = 0; i < channels; i++)
+ weights[i] *= invSum;
+ }
+
+ Rpp32s channelIncrement = 4;
+ Rpp32s alignedChannels = (channels / 4) * 4;
+ if(channels > 7)
+ {
+                flagAVX = true;
+ channelIncrement = 8;
+ alignedChannels = (channels / 8) * 8;
+ }
+
+            // Apply the weights to downmix all channels to mono
+ for(int64_t dstIdx = 0; dstIdx < samples; dstIdx++)
+ {
+ Rpp32s channelLoopCount = 0;
+                // If the number of channels is 8 or more, use the AVX implementation
+ if(flagAVX)
+ {
+ __m256 pDst = avx_p0;
+ for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement)
+ {
+ __m256 pSrc, pWeights;
+ pWeights = _mm256_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3],
+ weights[channelLoopCount + 4], weights[channelLoopCount + 5], weights[channelLoopCount + 6], weights[channelLoopCount + 7]);
+ pSrc = _mm256_loadu_ps(srcPtrTemp);
+ pSrc = _mm256_mul_ps(pSrc, pWeights);
+ pDst = _mm256_add_ps(pDst, pSrc);
+ srcPtrTemp += channelIncrement;
+ }
+ dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst);
+ }
+ else
+ {
+ __m128 pDst = xmm_p0;
+ for(; channelLoopCount < alignedChannels; channelLoopCount += channelIncrement)
+ {
+ __m128 pSrc, pWeights;
+ pWeights = _mm_setr_ps(weights[channelLoopCount], weights[channelLoopCount + 1], weights[channelLoopCount + 2], weights[channelLoopCount + 3]);
+ pSrc = _mm_loadu_ps(srcPtrTemp);
+ pSrc = _mm_mul_ps(pSrc, pWeights);
+ pDst = _mm_add_ps(pDst, pSrc);
+ srcPtrTemp += channelIncrement;
+ }
+ dstPtrTemp[dstIdx] = rpp_hsum_ps(pDst);
+ }
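+                // Accumulate any remaining channels (count not a multiple of the SIMD width) scalar-wise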
+ for(; channelLoopCount < channels; channelLoopCount++)
+ dstPtrTemp[dstIdx] += ((*srcPtrTemp++) * weights[channelLoopCount]);
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/magnitude.hpp b/src/modules/cpu/kernel/magnitude.hpp
new file mode 100644
index 000000000..6eaf4f236
--- /dev/null
+++ b/src/modules/cpu/kernel/magnitude.hpp
@@ -0,0 +1,1001 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus magnitude_u8_u8_host_tensor(Rpp8u *srcPtr1,
+ Rpp8u *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+#endif
+
+ // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[6];
+
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
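+                    // Per lane: dst = sqrt(p1*p1 + p2*p2), with fmadd fusing the multiply-accumulate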
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation
+ p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation
+ p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(srcPtr1Temp[0]);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(srcPtr1Temp[1]);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(srcPtr1Temp[2]);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(srcPtr2Temp[0]);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(srcPtr2Temp[1]);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(srcPtr2Temp[2]);
+                    *dstPtrTempR++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    *dstPtrTempG++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    *dstPtrTempB++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[6];
+
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation
+ p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation
+ p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+                    Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(*srcPtr1TempR);
+                    Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(*srcPtr1TempG);
+                    Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(*srcPtr1TempB);
+                    Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(*srcPtr2TempR);
+                    Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(*srcPtr2TempG);
+                    Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(*srcPtr2TempB);
+                    dstPtrTemp[0] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)))));
+                    dstPtrTemp[1] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)))));
+                    dstPtrTemp[2] = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)))));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~15;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[2], p2[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ rpp_simd_store(rpp_store16_f32_to_u8_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp32f srcPtr1TempValue = static_cast<Rpp32f>(*srcPtr1Temp);
+ Rpp32f srcPtr2TempValue = static_cast<Rpp32f>(*srcPtr2Temp);
+ *dstPtrTemp++ = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)))));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
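+
+// A scalar reference for the per-pixel magnitude computed above (a minimal
+// sketch; assumes RPPPIXELCHECK clamps to the U8 range [0, 255]): for each
+// pixel pair (a, b),
+//
+//     Rpp32f af = static_cast<Rpp32f>(a), bf = static_cast<Rpp32f>(b);
+//     Rpp8u out = static_cast<Rpp8u>(round(RPPPIXELCHECK(sqrt(af * af + bf * bf))));
+//
+// The AVX2 body evaluates the same expression for 16 pixels per channel per
+// iteration via _mm256_sqrt_ps(_mm256_fmadd_ps(a, a, b * b)).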
+
+RppStatus magnitude_f32_f32_host_tensor(Rpp32f *srcPtr1,
+ Rpp32f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0])));
+ *dstPtrTempG++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1])));
+ *dstPtrTempB++ = RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2])));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * *srcPtr2TempR)));
+ dstPtrTemp[1] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG)));
+ dstPtrTemp[2] = RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB)));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp)));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
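+
+// A single-register sketch of the vector step used above, assuming AVX2 with
+// FMA support; a*a + b*b is folded into one fused multiply-add before the
+// square root (srcA, srcB, dst are hypothetical float pointers):
+//
+//     __m256 a = _mm256_loadu_ps(srcA);                                        // 8 floats from input 1
+//     __m256 b = _mm256_loadu_ps(srcB);                                        // 8 floats from input 2
+//     __m256 mag = _mm256_sqrt_ps(_mm256_fmadd_ps(a, a, _mm256_mul_ps(b, b))); // sqrt(a*a + b*b)
+//     _mm256_storeu_ps(dst, mag);                                              // 8 magnitudes out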
+
+RppStatus magnitude_f16_f16_host_tensor(Rpp16f *srcPtr1,
+ Rpp16f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[0] * srcPtr1Temp[0]) + (srcPtr2Temp[0] * srcPtr2Temp[0]))));
+ *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[1] * srcPtr1Temp[1]) + (srcPtr2Temp[1] * srcPtr2Temp[1]))));
+ *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((srcPtr1Temp[2] * srcPtr1Temp[2]) + (srcPtr2Temp[2] * srcPtr2Temp[2]))));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]);
+ srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]);
+ srcPtr1Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr1TempB[cnt]);
+
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2TempR[cnt]);
+ srcPtr2Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr2TempG[cnt]);
+ srcPtr2Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr2TempB[cnt]);
+ }
+
+ __m256 p1[4], p2[4];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((*srcPtr1TempR * *srcPtr1TempR) + (*srcPtr2TempR * *srcPtr2TempR))));
+ dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((*srcPtr1TempG * *srcPtr1TempG) + (*srcPtr2TempG * *srcPtr2TempG))));
+ dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((*srcPtr1TempB * *srcPtr1TempB) + (*srcPtr2TempB * *srcPtr2TempB))));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(sqrt((*srcPtr1Temp * *srcPtr1Temp) + (*srcPtr2Temp * *srcPtr2Temp))));
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
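+
+// Note: with no native F16 SIMD arithmetic here, the F16 kernel widens each
+// group of lanes into a small Rpp32f scratch array before the SIMD loads, and
+// the f16 store helpers narrow the results back; the scalar tails likewise
+// promote through float arithmetic before the final Rpp16f cast.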
+
+RppStatus magnitude_i8_i8_host_tensor(Rpp8s *srcPtr1,
+ Rpp8s *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+#endif
+
+ // Magnitude with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[6];
+
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation
+ p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation
+ p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(srcPtr1Temp[0] + 128);
+ Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(srcPtr1Temp[1] + 128);
+ Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(srcPtr1Temp[2] + 128);
+ Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(srcPtr2Temp[0] + 128);
+ Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(srcPtr2Temp[1] + 128);
+ Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(srcPtr2Temp[2] + 128);
+ *dstPtrTempR++ = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128)));
+ *dstPtrTempG++ = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128)));
+ *dstPtrTempB++ = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128)));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[6];
+
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ p1[2] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[2], p1[2], _mm256_mul_ps(p2[2], p2[2]))); // magnitude computation
+ p1[3] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[3], p1[3], _mm256_mul_ps(p2[3], p2[3]))); // magnitude computation
+ p1[4] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[4], p1[4], _mm256_mul_ps(p2[4], p2[4]))); // magnitude computation
+ p1[5] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[5], p1[5], _mm256_mul_ps(p2[5], p2[5]))); // magnitude computation
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp32f srcPtr1TempValue0 = static_cast<Rpp32f>(*srcPtr1TempR + 128);
+ Rpp32f srcPtr1TempValue1 = static_cast<Rpp32f>(*srcPtr1TempG + 128);
+ Rpp32f srcPtr1TempValue2 = static_cast<Rpp32f>(*srcPtr1TempB + 128);
+ Rpp32f srcPtr2TempValue0 = static_cast<Rpp32f>(*srcPtr2TempR + 128);
+ Rpp32f srcPtr2TempValue1 = static_cast<Rpp32f>(*srcPtr2TempG + 128);
+ Rpp32f srcPtr2TempValue2 = static_cast<Rpp32f>(*srcPtr2TempB + 128);
+ dstPtrTemp[0] = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue0 * srcPtr1TempValue0) + (srcPtr2TempValue0 * srcPtr2TempValue0)) - 128)));
+ dstPtrTemp[1] = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue1 * srcPtr1TempValue1) + (srcPtr2TempValue1 * srcPtr2TempValue1)) - 128)));
+ dstPtrTemp[2] = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue2 * srcPtr1TempValue2) + (srcPtr2TempValue2 * srcPtr2TempValue2)) - 128)));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Magnitude without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~15;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[2], p2[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[0], p1[0], _mm256_mul_ps(p2[0], p2[0]))); // magnitude computation
+ p1[1] = _mm256_sqrt_ps(_mm256_fmadd_ps(p1[1], p1[1], _mm256_mul_ps(p2[1], p2[1]))); // magnitude computation
+ rpp_simd_store(rpp_store16_f32_to_i8_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ Rpp32f srcPtr1TempValue = static_cast<Rpp32f>(*srcPtr1Temp + 128);
+ Rpp32f srcPtr2TempValue = static_cast<Rpp32f>(*srcPtr2Temp + 128);
+ *dstPtrTemp++ = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt((srcPtr1TempValue * srcPtr1TempValue) + (srcPtr2TempValue * srcPtr2TempValue)) - 128)));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
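+
+// A scalar reference for the I8 path above (a minimal sketch; assumes
+// RPPPIXELCHECKI8 clamps to [-128, 127]): inputs are biased by +128 into U8
+// range, the magnitude is taken there, and the bias is removed before the
+// clamp, e.g. for one pixel pair (a, b):
+//
+//     Rpp32f af = static_cast<Rpp32f>(a + 128), bf = static_cast<Rpp32f>(b + 128);
+//     Rpp8s out = static_cast<Rpp8s>(round(RPPPIXELCHECKI8(sqrt(af * af + bf * bf) - 128)));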
diff --git a/src/modules/cpu/kernel/multiply_scalar.hpp b/src/modules/cpu/kernel/multiply_scalar.hpp
new file mode 100644
index 000000000..a27782bcc
--- /dev/null
+++ b/src/modules/cpu/kernel/multiply_scalar.hpp
@@ -0,0 +1,152 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus multiply_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *mulTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI3D roiDefault;
+ if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
+ else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ RpptROI3D roi;
+ RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
+ compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+ Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f mulParam = mulTensor[batchCount];
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrImage;
+#if __AVX2__
+ Rpp32u vectorIncrement = 16;
+ __m256 pMulParam = _mm256_set1_ps(mulParam);
+ Rpp32u alignedLength = bufferLength & ~(vectorIncrement - 1);
+#endif
+ // multiply without fused output-layout toggle (NCDHW -> NCDHW)
+ if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_multiply_16_host(p, &pMulParam); // multiply adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ *dstPtrTemp++ = (*srcPtrTemp++ * mulParam);
+
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += dstGenericDescPtr->strides[1];
+ }
+ }
+ // multiply without fused output-layout toggle (NDHWC -> NDHWC)
+ else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ Rpp32f *srcPtrDepth = srcPtrChannel;
+ Rpp32f *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_multiply_16_host(p, &pMulParam); // multiply adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ *dstPtrTemp++ = (*srcPtrTemp++ * mulParam);
+
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
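+
+// Rough call-site sketch (buffer, descriptor, and ROI names are hypothetical;
+// one scale factor per batch image is expected in mulTensor):
+//
+//     Rpp32f mulVals[BATCH];
+//     std::fill_n(mulVals, BATCH, 2.0f);    // e.g. double every voxel
+//     multiply_scalar_f32_f32_host_tensor(srcBuf, srcGenericDesc,
+//                                         dstBuf, dstGenericDesc,
+//                                         mulVals, rois3D, RpptRoi3DType::XYZWHD,
+//                                         layoutParams, handle);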
diff --git a/src/modules/cpu/kernel/subtract_scalar.hpp b/src/modules/cpu/kernel/subtract_scalar.hpp
new file mode 100644
index 000000000..a40e6219f
--- /dev/null
+++ b/src/modules/cpu/kernel/subtract_scalar.hpp
@@ -0,0 +1,152 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus subtract_scalar_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI3D roiDefault;
+ if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
+ else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
+ roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ RpptROI3D roi;
+ RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
+ compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ Rpp32f subtractParam = subtractTensor[batchCount];
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+#if __AVX2__
+ Rpp32u vectorIncrement = 16;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+ __m256 pSubtractParam = _mm256_set1_ps(subtractParam);
+#endif
+
+ // Subtract without fused output-layout toggle (NCDHW -> NCDHW)
+ if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ - subtractParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += dstGenericDescPtr->strides[1];
+ }
+ }
+ // Subtract without fused output-layout toggle (NDHWC -> NDHWC)
+ else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ {
+ srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
+ Rpp32f *srcPtrDepth = srcPtrChannel;
+ Rpp32f *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtrRow;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_subtract_16_host(p, &pSubtractParam); // subtract adjustment
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtrTemp++ - subtractParam;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
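+
+// subtract_scalar mirrors multiply_scalar above: identical NCDHW and NDHWC
+// walks and tail loops, with compute_subtract_16_host substituted for the
+// multiply, i.e. per element dst = src - subtractParam.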
diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp
new file mode 100644
index 000000000..0380f4ef6
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_max.hpp
@@ -0,0 +1,847 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8u max = 0;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMax = _mm256_setzero_si256();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+ pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_32_host(&pMax, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ max = std::max(resultAvx[0], max);
+#endif
+ maxArr[batchCount] = max;
+ }
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMaxR = _mm256_setzero_si256();
+ __m256i pMaxG = pMaxR;
+ __m256i pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m128i pMaxR = _mm_setzero_si128();
+ __m128i pMaxG = pMaxR;
+ __m128i pMaxB = pMaxR;
+#endif
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p);
+ compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
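+
+// Reduction pattern used above (sketch): each row loop folds 32 U8 values per
+// iteration into a running vector max with _mm256_max_epu8, the scalar tail
+// catches the remainder with std::max, and a final horizontal reduce
+// (reduce_max_32_host / reduce_max_96_host) collapses the vector accumulator
+// so the two partial maxima can be merged:
+//
+//     max = std::max(horizontalMaxOfVectorAccumulator, scalarTailMax);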
+
+RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f max = 0.0;
+ Rpp32f resultAvx[4];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMax = _mm256_setzero_ps();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1);
+ compute_max_float8_host(&p1, &pMax);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128 result;
+ reduce_max_float8_host(&pMax, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max);
+#endif
+ maxArr[batchCount] = max;
+ }
+
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
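+// F16 variant: half values are widened to F32 in small stack buffers so the same AVX2 float pipeline is reused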
+RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f max = 0.0;
+ Rpp32f resultAvx[4];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMax = _mm256_setzero_ps();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1);
+ compute_max_float8_host(&p1, &pMax);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max((Rpp32f)*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128 result;
+ reduce_max_float8_host(&pMax, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max);
+#endif
+ maxArr[batchCount] = (Rpp16f)max;
+ }
+
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max((Rpp32f)*srcPtrTempR++, maxR);
+ maxG = std::max((Rpp32f)*srcPtrTempG++, maxG);
+ maxB = std::max((Rpp32f)*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = (Rpp16f)maxR;
+ maxArr[maxArrIndex + 1] = (Rpp16f)maxG;
+ maxArr[maxArrIndex + 2] = (Rpp16f)maxB;
+ maxArr[maxArrIndex + 3] = (Rpp16f)maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max((Rpp32f)srcPtrTemp[0], maxR);
+ maxG = std::max((Rpp32f)srcPtrTemp[1], maxG);
+ maxB = std::max((Rpp32f)srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = (Rpp16f)maxR;
+ maxArr[maxArrIndex + 1] = (Rpp16f)maxG;
+ maxArr[maxArrIndex + 2] = (Rpp16f)maxB;
+ maxArr[maxArrIndex + 3] = (Rpp16f)maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
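+// I8 variant: runs directly on signed 8-bit lanes via _mm256_max_epi8, with accumulators seeded to INT8_MIN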
+RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8s max = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMax = _mm256_set1_epi8(INT8_MIN);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+                    __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+                    pMax = _mm256_max_epi8(p1, pMax); // fold 32 i8 values into the running SIMD max
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i32_host(&pMax, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ max = std::max(resultAvx[0], max);
+#endif
+ maxArr[batchCount] = max;
+ }
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+                Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMaxR = _mm256_set1_epi8(INT8_MIN);
+ __m256i pMaxG = pMaxR;
+ __m256i pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+                __m128i pMaxR = _mm_set1_epi8(INT8_MIN);
+                __m128i pMaxG = pMaxR;
+                __m128i pMaxB = pMaxR;
+#endif
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p);
+ compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp
new file mode 100644
index 000000000..15b9b77ba
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_min.hpp
@@ -0,0 +1,845 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8u min = 255;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMin = _mm256_set1_epi8((char)255);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+ pMin = _mm256_min_epu8(p1, pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min(*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_32_host(&pMin, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp8u minC = 255, minR = 255, minG = 255, minB = 255;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMinR = _mm256_set1_epi8((char)255);
+ __m256i pMinG = pMinR;
+ __m256i pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_96_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8u minC = 255, minR = 255, minG = 255, minB = 255;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+                __m128i pMinR = _mm_set1_epi8((char)255);
+                __m128i pMinG = pMinR;
+                __m128i pMinB = pMinR;
+#endif
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                    {
+                        __m128i p[3];
+                        rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p);
+                        compute_min_48_host(p, &pMinR, &pMinG, &pMinB);
+
+                        srcPtrTemp += vectorIncrement;
+                    }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+                __m128i result;
+                reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result);
+                rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+                minR = std::min(resultAvx[0], minR);
+                minG = std::min(resultAvx[1], minG);
+                minB = std::min(resultAvx[2], minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
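+// F32 variant: running minima are seeded to 255.0, an upper bound for both 0-255 and normalized 0-1 pixel data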
+RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f min = 255.0;
+ Rpp32f resultAvx[4];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMin = _mm256_set1_ps(255.0);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1);
+ compute_min_float8_host(&p1, &pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min(*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128 result;
+ reduce_min_float8_host(&pMin, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
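+// F16 variant: half values are widened to F32 on load and reuse the F32 SIMD min pipeline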
+RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f min = 255.0;
+ Rpp32f resultAvx[4];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMin = _mm256_set1_ps(255.0);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1);
+ compute_min_float8_host(&p1, &pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min((Rpp32f)*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128 result;
+ reduce_min_float8_host(&pMin, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = (Rpp16f) min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min((Rpp32f)*srcPtrTempR++, minR);
+ minG = std::min((Rpp32f)*srcPtrTempG++, minG);
+ minB = std::min((Rpp32f)*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = (Rpp16f) minR;
+ minArr[minArrIndex + 1] = (Rpp16f) minG;
+ minArr[minArrIndex + 2] = (Rpp16f) minB;
+ minArr[minArrIndex + 3] = (Rpp16f) minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min((Rpp32f)srcPtrTemp[0], minR);
+ minG = std::min((Rpp32f)srcPtrTemp[1], minG);
+ minB = std::min((Rpp32f)srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = (Rpp16f) minR;
+ minArr[minArrIndex + 1] = (Rpp16f) minG;
+ minArr[minArrIndex + 2] = (Rpp16f) minB;
+ minArr[minArrIndex + 3] = (Rpp16f) minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
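+// I8 variant: runs directly on signed 8-bit lanes via _mm256_min_epi8, with accumulators seeded to 127 (INT8_MAX)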
+RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8s min = 127;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMin = _mm256_set1_epi8((char)127);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+                    __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+                    pMin = _mm256_min_epi8(p1, pMin); // fold 32 i8 values into the running SIMD min
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min((*srcPtrTemp++), min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128i result;
+ reduce_min_i32_host(&pMin, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp8s minC = 127, minR = 127, minG = 127, minB = 127;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMinR = _mm256_set1_epi8((char)127);
+ __m256i pMinG = pMinR;
+ __m256i pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_i96_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8s minC = 127, minR = 127, minG = 127, minB = 127;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+ __m128i pMinR = _mm_set1_epi8((char)127);
+ __m128i pMinG = pMinR;
+ __m128i pMinB = pMinR;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p);
+ compute_min_i48_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
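+
+// Illustrative usage sketch (comments only; host-side helper names assumed from the RPP host-layer convention):
+//     RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);   // PLN1 / PLN3 / PKD3 params
+//     tensor_min_u8_u8_host(srcPtr, srcDescPtr, minArr, srcDescPtr->n * 4,                   // 4 outputs per image for 3-channel inputs
+//                           roiTensorPtrSrc, RpptRoiType::XYWH, layoutParams);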
diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
index 55fbb7832..0345171fc 100644
--- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp
+++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -26,5 +26,9 @@ SOFTWARE.
#define HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
#include "kernel/fused_multiply_add_scalar.hpp"
+#include "kernel/add_scalar.hpp"
+#include "kernel/subtract_scalar.hpp"
+#include "kernel/multiply_scalar.hpp"
+#include "kernel/magnitude.hpp"
#endif // HIP_TENSOR_ARITHMEETIC_OPERATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_color_augmentations.hpp b/src/modules/hip/hip_tensor_color_augmentations.hpp
index 873f06b97..c5610dbcb 100644
--- a/src/modules/hip/hip_tensor_color_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_color_augmentations.hpp
@@ -33,5 +33,6 @@ SOFTWARE.
#include "kernel/exposure.hpp"
#include "kernel/contrast.hpp"
#include "kernel/lut.hpp"
+#include "kernel/color_temperature.hpp"
#endif // HIP_TENSOR_COLOR_AUGMENTATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp
index 328a232a1..c79e0a951 100644
--- a/src/modules/hip/hip_tensor_statistical_operations.hpp
+++ b/src/modules/hip/hip_tensor_statistical_operations.hpp
@@ -23,8 +23,9 @@ SOFTWARE.
*/
#ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
#define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
#include "kernel/tensor_sum.hpp"
+#include "kernel/tensor_min.hpp"
+#include "kernel/tensor_max.hpp"
-#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
+#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/add_scalar.hpp b/src/modules/hip/kernel/add_scalar.hpp
new file mode 100644
index 000000000..709337c9d
--- /dev/null
+++ b/src/modules/hip/kernel/add_scalar.hpp
@@ -0,0 +1,114 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+
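+// add_scalar: adds one scalar per image (addTensor[batch]) to every element inside the 3D ROI.
+// Each thread processes 8 consecutive floats along the innermost dimension.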
+__global__ void add_scalar_ncdhw_hip_tensor(float *srcPtr,
+ uint3 srcStridesCDH,
+ float *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ float addParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x);
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+
+ d_float8 val_f8;
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8);
+        rpp_hip_math_add8_const(&val_f8, &val_f8, static_cast<float4>(addParam));
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ srcIdx += srcStridesCDH.x;
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+__global__ void add_scalar_ndhwc_hip_tensor(float *srcPtr,
+ uint2 srcStridesDH,
+ float *dstPtr,
+ uint2 dstStridesDH,
+ float addParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+
+ d_float24 val_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
+    rpp_hip_math_add24_const(&val_f24, &val_f24, static_cast<float4>(addParam));
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+RppStatus hip_exec_add_scalar_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32f *addTensor,
+ rpp::Handle& handle)
+{
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(add_scalar_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ addTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(add_scalar_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ addTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/color_temperature.hpp b/src/modules/hip/kernel/color_temperature.hpp
new file mode 100644
index 000000000..ad8adc32a
--- /dev/null
+++ b/src/modules/hip/kernel/color_temperature.hpp
@@ -0,0 +1,223 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
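+// color_temperature: warms or cools an image by adding the per-image adjustment to the R plane
+// and subtracting it from the B plane; G is left untouched.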
+template <typename T>
+__device__ void color_temperature_hip_compute(T *srcPtr, d_float24 *pix_f24, float4 *adjustmentValue_f4)
+{
+ float4 adjustment_f4;
+    if constexpr ((std::is_same<T, float>::value) || (std::is_same<T, half>::value))
+ {
+ adjustment_f4 = *adjustmentValue_f4 * (float4) ONE_OVER_255;
+ rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4);
+ rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4);
+ }
+    else if constexpr (std::is_same<T, schar>::value)
+ {
+ adjustment_f4 = *adjustmentValue_f4;
+ rpp_hip_math_add24_const(pix_f24, pix_f24, (float4)128);
+ rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], adjustment_f4);
+ rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], adjustment_f4);
+ rpp_hip_pixel_check_0to255(pix_f24);
+ rpp_hip_math_subtract24_const(pix_f24, pix_f24, (float4)128);
+ }
+ else
+ {
+ rpp_hip_math_add8_const(&pix_f24->f8[0], &pix_f24->f8[0], *adjustmentValue_f4);
+ rpp_hip_math_subtract8_const(&pix_f24->f8[2], &pix_f24->f8[2], *adjustmentValue_f4);
+ }
+}
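+// Note: F16/F32 tensors are assumed normalized to 0-1 (RPP convention), so the integer adjustment is
+// rescaled by 1/255; I8 data is shifted into the 0-255 domain, clamped, then shifted back.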
+
+template <typename T>
+__global__ void color_temperature_pkd_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ int *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]);
+
+ d_float24 pix_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24);
+ color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24);
+}
+
+template <typename T>
+__global__ void color_temperature_pln_hip_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]);
+
+ d_float24 pix_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24);
+ color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24);
+}
+
+template <typename T>
+__global__ void color_temperature_pkd3_pln3_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]);
+
+ d_float24 pix_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &pix_f24);
+ color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &pix_f24);
+}
+
+template <typename T>
+__global__ void color_temperature_pln3_pkd3_hip_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ int *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ float4 adjustmentValue_f4 = (float4)((float)adjustmentValueTensor[id_z]);
+
+ d_float24 pix_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &pix_f24);
+ color_temperature_hip_compute(srcPtr, &pix_f24, &adjustmentValue_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &pix_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_color_temperature_tensor(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(color_temperature_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ handle.GetInitHandle()->mem.mgpu.intArr[0].intmem,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(color_temperature_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ handle.GetInitHandle()->mem.mgpu.intArr[0].intmem,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(color_temperature_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ handle.GetInitHandle()->mem.mgpu.intArr[0].intmem,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(color_temperature_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ handle.GetInitHandle()->mem.mgpu.intArr[0].intmem,
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/magnitude.hpp b/src/modules/hip/kernel/magnitude.hpp
new file mode 100644
index 000000000..902d27bde
--- /dev/null
+++ b/src/modules/hip/kernel/magnitude.hpp
@@ -0,0 +1,244 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
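+// magnitude: elementwise sqrt(src1^2 + src2^2) over two tensors of identical shape and layout.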
+template <typename T>
+__device__ void magnitude_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+    if constexpr (std::is_same<T, schar>::value)
+ {
+ rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128);
+ rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128);
+ }
+
+ d_float8 src1Sq_f8, src2Sq_f8, sum_f8;
+ rpp_hip_math_multiply8(src1_f8, src1_f8, &src1Sq_f8);
+ rpp_hip_math_multiply8(src2_f8, src2_f8, &src2Sq_f8);
+ rpp_hip_math_add8(&src1Sq_f8, &src2Sq_f8, &sum_f8);
+ rpp_hip_math_sqrt8(&sum_f8, dst_f8);
+
+    if constexpr (std::is_same<T, schar>::value)
+ {
+ dst_f8->f4[0] = rpp_hip_pixel_check_0to255(dst_f8->f4[0]) - (float4)128;
+ dst_f8->f4[1] = rpp_hip_pixel_check_0to255(dst_f8->f4[1]) - (float4)128;
+ }
+}
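+// Note: I8 inputs are mapped to their unsigned 0-255 representation before squaring, then re-centered after clamping.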
+
+template <typename T>
+__global__ void magnitude_pkd_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void magnitude_pln_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float8 src1_f8, src2_f8, dst_f8;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ magnitude_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void magnitude_pkd3_pln3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void magnitude_pln3_pkd3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24);
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ magnitude_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_magnitude_tensor(T *srcPtr1,
+ T *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(magnitude_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(magnitude_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(magnitude_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
+ hipLaunchKernelGGL(magnitude_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
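
For verification, the per-pixel contract of magnitude_hip_compute is sqrt(a^2 + b^2), with signed 8-bit inputs shifted by +128 into the 0..255 range first and the clamped result shifted back. A minimal scalar sketch; the schar handling is my reading of the if constexpr branches above, not a library-confirmed API:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <type_traits>

    // Scalar reference for one pixel pair; T = signed char mirrors the i8 branch.
    template <typename T>
    float magnitude_ref(float a, float b)
    {
        if (std::is_same<T, signed char>::value) { a += 128.0f; b += 128.0f; }  // i8 -> 0..255
        float out = std::sqrt(a * a + b * b);
        if (std::is_same<T, signed char>::value)
            out = std::min(std::max(out, 0.0f), 255.0f) - 128.0f;  // clamp, shift back to -128..127
        return out;
    }

    int main()
    {
        printf("%f\n", magnitude_ref<unsigned char>(3.0f, 4.0f));     // 5.0
        printf("%f\n", magnitude_ref<signed char>(-128.0f, -128.0f)); // -128.0 (both shift to 0)
        return 0;
    }
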
diff --git a/src/modules/hip/kernel/multiply_scalar.hpp b/src/modules/hip/kernel/multiply_scalar.hpp
new file mode 100644
index 000000000..e0816576a
--- /dev/null
+++ b/src/modules/hip/kernel/multiply_scalar.hpp
@@ -0,0 +1,114 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+
+__global__ void multiply_scalar_ncdhw_hip_tensor(float *srcPtr,
+ uint3 srcStridesCDH,
+ float *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ float mulParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x);
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+
+ d_float8 val_f8;
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8);
+ rpp_hip_math_multiply8_const(&val_f8, &val_f8, static_cast<float4>(mulParam));
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ srcIdx += srcStridesCDH.x;
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+__global__ void multiply_scalar_ndhwc_hip_tensor(float *srcPtr,
+ uint2 srcStridesDH,
+ float *dstPtr,
+ uint2 dstStridesDH,
+ float mulParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+
+ d_float24 val_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
+ rpp_hip_math_multiply24_const(&val_f24, &val_f24, static_cast<float4>(mulParam));
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+RppStatus hip_exec_multiply_scalar_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32f *mulTensor,
+ rpp::Handle& handle)
+{
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(multiply_scalar_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ mulTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(multiply_scalar_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ mulTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
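
Unlike the 2D ops, the 3D scalar kernels are launched once per batch item, with the executor passing suffix-product strides for an NCDHW tensor of dims [N, C, D, H, W]. A sketch of that stride computation (the array layout here is illustrative, not the actual RpptGenericDesc fields):

    #include <cstdio>

    int main()
    {
        int dims[5] = {2, 3, 16, 64, 64};  // hypothetical N, C, D, H, W
        long strides[5];
        strides[4] = 1;                    // W is contiguous
        for (int i = 3; i >= 0; i--)
            strides[i] = strides[i + 1] * dims[i + 1];
        // strides[1], strides[2], strides[3] are what the executor packs into srcStridesCDH.
        printf("n=%ld c=%ld d=%ld h=%ld\n", strides[0], strides[1], strides[2], strides[3]);
        return 0;
    }
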
diff --git a/src/modules/hip/kernel/subtract_scalar.hpp b/src/modules/hip/kernel/subtract_scalar.hpp
new file mode 100644
index 000000000..7ee128709
--- /dev/null
+++ b/src/modules/hip/kernel/subtract_scalar.hpp
@@ -0,0 +1,114 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+
+__global__ void subtract_scalar_ncdhw_hip_tensor(float *srcPtr,
+ uint3 srcStridesCDH,
+ float *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ float subtractParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x);
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+
+ d_float8 val_f8;
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &val_f8);
+ rpp_hip_math_subtract8_const(&val_f8, &val_f8, static_cast<float4>(subtractParam));
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ srcIdx += srcStridesCDH.x;
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+__global__ void subtract_scalar_ndhwc_hip_tensor(float *srcPtr,
+ uint2 srcStridesDH,
+ float *dstPtr,
+ uint2 dstStridesDH,
+ float subtractParam,
+ RpptROI3DPtr roiGenericPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= roiGenericPtrSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericPtrSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericPtrSrc->xyzwhdROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = ((id_z + roiGenericPtrSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericPtrSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericPtrSrc->xyzwhdROI.xyz.x) * 3;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+
+ d_float24 val_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
+ rpp_hip_math_subtract24_const(&val_f24, &val_f24, static_cast<float4>(subtractParam));
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+RppStatus hip_exec_subtract_scalar_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32f *subtractTensor,
+ rpp::Handle& handle)
+{
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(subtract_scalar_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ subtractTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(subtract_scalar_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ subtractTensor[batchCount],
+ &roiGenericPtrSrc[batchCount]);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
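
In the NDHWC branches above, strides[2] is the row (H) stride, i.e. W * C elements, so dividing by 3 recovers W before the 8-pixel vectorization; each thread then covers 8 packed RGB pixels (24 floats). A quick check of that arithmetic:

    #include <cstdio>

    int main()
    {
        int W = 100, C = 3;                           // hypothetical packed-RGB row
        int hStride = W * C;                          // 300 elements per row in NDHWC
        int globalThreads_x = (hStride / 3 + 7) >> 3; // 13 == ceil(100 / 8)
        printf("threads in x per row: %d\n", globalThreads_x);
        return 0;
    }
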
diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp
new file mode 100644
index 000000000..b47fce024
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_max.hpp
@@ -0,0 +1,400 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+
+template <typename T>
+__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength) * 3;
+ partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads
+ partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads
+ partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x * 3;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]);
+ partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]);
+ partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = hipBlockIdx_z * 4;
+ dstPtr[dstIdx] = (T) partialRMax_smem[0];
+ dstPtr[dstIdx + 1] = (T) partialGMax_smem[0];
+ dstPtr[dstIdx + 2] = (T) partialBMax_smem[0];
+ dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0]));
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_grid_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength);
+ partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]);
+}
+
+
+// -------------------- Set 1 - Reduction Stage 1 --------------------
+
+template <typename T>
+__global__ void tensor_max_pkd3_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel
+ __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel
+ __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel
+
+ float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel
+ float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel
+ float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel
+ uint srcIdx = (id_z * srcStridesNH.x);
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]);
+ partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]);
+ partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ maxArr[idx] = partialRMaxRowPtr_smem[0];
+ maxArr[idx + 1] = partialGMaxRowPtr_smem[0];
+ maxArr[idx + 2] = partialBMaxRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_pln3_hip(T *srcPtr,
+ uint3 srcStridesNCH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ d_float24 src_f24;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24);
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]);
+ partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]);
+ partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ maxArr[idx] = partialRMaxRowPtr_smem[0];
+ maxArr[idx + 1] = partialGMaxRowPtr_smem[0];
+ maxArr[idx + 2] = partialBMaxRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_pln1_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+
+ rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0];
+ }
+}
+
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+template <typename T, typename U>
+RppStatus hip_exec_tensor_max(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ U *maxArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+ float2 bitDepthMinMax_f2;
+ getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2);
+ float minimum = bitDepthMinMax_f2.x;
+
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
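
The two-stage reduction above first collapses each 16 x 16 tile to one partial per block, then a single 256-thread block folds the partials. The core is a standard shared-memory tree reduction; a CPU analogue of the halving loop (assuming a power-of-two slot count, as the kernels do):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        float partial[256];
        for (int i = 0; i < 256; i++)
            partial[i] = (float)((i * 37) % 251);       // arbitrary test data; true max is 250
        // Mirrors the threadMax = 128, 64, ..., 1 loop in the kernels above;
        // the inner loop plays the role of the surviving threads at each step.
        for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
            for (int t = 0; t < threadMax; t++)
                partial[t] = std::max(partial[t], partial[t + threadMax]);
        printf("max = %f\n", partial[0]);               // 250.0
        return 0;
    }
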
diff --git a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp
new file mode 100644
index 000000000..a883c4f3b
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_min.hpp
@@ -0,0 +1,410 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+
+template <typename T>
+__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength) * 3;
+ partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads
+ partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads
+ partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x * 3;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]);
+ partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]);
+ partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = hipBlockIdx_z * 4;
+ dstPtr[dstIdx] = (T) partialRMin_smem[0];
+ dstPtr[dstIdx + 1] = (T) partialGMin_smem[0];
+ dstPtr[dstIdx + 2] = (T) partialBMin_smem[0];
+ dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0]));
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_grid_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength);
+ partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]);
+}
+
+
+// -------------------- Set 1 - Reduction Stage 1 --------------------
+
+template <typename T>
+__global__ void tensor_min_pkd3_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel
+ __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel
+ __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel
+
+ float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel
+ float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel
+ float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3;
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]);
+ partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]);
+ partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ minArr[idx] = partialRMinRowPtr_smem[0];
+ minArr[idx + 1] = partialGMinRowPtr_smem[0];
+ minArr[idx + 2] = partialBMinRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_pln3_hip(T *srcPtr,
+ uint3 srcStridesNCH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth);
+
+ d_float24 src_f24;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24);
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]);
+ partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]);
+ partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ minArr[idx] = partialRMinRowPtr_smem[0];
+ minArr[idx + 1] = partialGMinRowPtr_smem[0];
+ minArr[idx + 2] = partialBMinRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_pln1_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ minArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0];
+ }
+}
+
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+template <typename T, typename U>
+RppStatus hip_exec_tensor_min(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ U *minArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle &handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+ float2 bitDepthMinMax_f2;
+ getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2);
+ float maximum = bitDepthMinMax_f2.y;
+
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
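
One detail worth flagging: the tensor_min stage-1 kernels rewind the last vector load so it ends exactly at the ROI edge (the id_x + 8 > roiWidth adjustment), keeping out-of-ROI pixels from polluting the minimum; the tensor_max kernels in this diff do not carry the equivalent guard. A sketch of the rewind arithmetic:

    #include <cstdio>

    int main()
    {
        int roiWidth = 100;
        for (int id_x = 88; id_x <= 96; id_x += 8)    // the last two 8-wide loads in a row
        {
            int srcOffset = id_x;
            if (id_x + 8 > roiWidth)
                srcOffset -= (id_x + 8 - roiWidth);   // rewind so the load ends at pixel 99
            printf("id_x=%d loads pixels [%d..%d]\n", id_x, srcOffset, srcOffset + 7);
        }
        return 0;
    }
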
diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp
index daf0479ee..8f88ba90f 100644
--- a/src/modules/rppt_tensor_arithmetic_operations.cpp
+++ b/src/modules/rppt_tensor_arithmetic_operations.cpp
@@ -73,6 +73,188 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** add_scalar ********************/
+
+RppStatus rppt_add_scalar_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ add_scalar_f32_f32_host_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ addTensor,
+ roiGenericPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** subtract_scalar ********************/
+
+RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ subtract_scalar_f32_f32_host_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ subtractTensor,
+ roiGenericPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** multiply_scalar ********************/
+
+RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *mulTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ multiply_scalar_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ mulTensor,
+ roiGenericPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** magnitude ********************/
+
+RppStatus rppt_magnitude_host(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ magnitude_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ magnitude_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ magnitude_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ magnitude_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -113,4 +295,163 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** add_scalar ********************/
+
+RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *addTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ hip_exec_add_scalar_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiGenericPtrSrc,
+ addTensor,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** subtract_scalar ********************/
+
+RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *subtractTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ hip_exec_subtract_scalar_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiGenericPtrSrc,
+ subtractTensor,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** multiply_scalar ********************/
+
+RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *mulTensor,
+ RpptROI3DPtr roiGenericPtrSrc,
+ RpptRoi3DType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if (dstGenericDescPtr->dataType != RpptDataType::F32) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+
+ hip_exec_multiply_scalar_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiGenericPtrSrc,
+ mulTensor,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** magnitude ********************/
+
+RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_magnitude_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_magnitude_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_magnitude_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_magnitude_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
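The four arithmetic entry points above share one calling pattern: require F32 data, require matching NCDHW/NDHWC layouts on both descriptors, then dispatch to the layout-aware kernel. A minimal host-side caller follows as a sketch only; the 1x1x4x8x8 tensor shape, ROI extents, and abbreviated handle setup are assumptions for illustration, not code from this patch.

    #include <rpp.h>
    #include <vector>

    int main()
    {
        // Hypothetical 1x1x4x8x8 NCDHW F32 voxel tensor; strides derived from dims.
        RpptGenericDesc desc;
        desc.dataType = RpptDataType::F32;
        desc.layout = RpptLayout::NCDHW;
        desc.offsetInBytes = 0;
        desc.numDims = 5;
        desc.dims[0] = 1; desc.dims[1] = 1; desc.dims[2] = 4; desc.dims[3] = 8; desc.dims[4] = 8;
        desc.strides[4] = 1;
        for (int i = 3; i >= 0; i--)
            desc.strides[i] = desc.strides[i + 1] * desc.dims[i + 1];

        std::vector<Rpp32f> src(4 * 8 * 8, 1.0f), dst(src.size());
        Rpp32f addTensor[1] = {40.0f};     // one scalar per image in the batch

        RpptROI3D roi;                     // full-extent XYZWHD ROI
        roi.xyzwhdROI.xyz.x = 0; roi.xyzwhdROI.xyz.y = 0; roi.xyzwhdROI.xyz.z = 0;
        roi.xyzwhdROI.roiWidth = 8; roi.xyzwhdROI.roiHeight = 8; roi.xyzwhdROI.roiDepth = 4;

        rppHandle_t handle;
        rppCreateWithBatchSize(&handle, 1, 1);   // batch of 1, single host thread
        rppt_add_scalar_host(src.data(), &desc, dst.data(), &desc, addTensor,
                             &roi, RpptRoi3DType::XYZWHD, handle);
        rppDestroyHost(handle);
        return 0;
    }

rppt_subtract_scalar_host and rppt_multiply_scalar_host take the same argument shape; only the per-image scalar array differs.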
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index 23b52bc44..d78b8890a 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -126,3 +126,31 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr,
return RPP_ERROR_NOT_IMPLEMENTED;
}
}
+
+/******************** down_mixing ********************/
+
+RppStatus rppt_down_mixing_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcDimsTensor,
+ bool normalizeWeights,
+ rppHandle_t rppHandle)
+{
+ if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ down_mixing_host_tensor(static_cast<Rpp32f*>(srcPtr),
+ srcDescPtr,
+ static_cast<Rpp32f*>(dstPtr),
+ dstDescPtr,
+ srcDimsTensor,
+ normalizeWeights,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+}
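rppt_down_mixing_host is F32-only and mixes all input channels down to a single output channel, with srcDimsTensor carrying a {samples, channels} pair per batch element. A hedged fragment, assuming srcF32/dstF32 buffers, descriptors, and an RPP handle already set up along the lines of the audio test suite:

    // Hypothetical fragment: down-mix one interleaved stereo clip to mono.
    // srcDimsTensor packs {sample count, channel count} per batch element.
    Rpp32s srcDims[2] = {44100, 2};    // 1 s of 44.1 kHz stereo (assumed input)
    rppt_down_mixing_host(srcF32, srcDescPtr, dstF32, dstDescPtr,
                          srcDims, false /*normalizeWeights*/, handle);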
diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp
index be61b6da1..3023973fc 100644
--- a/src/modules/rppt_tensor_color_augmentations.cpp
+++ b/src/modules/rppt_tensor_color_augmentations.cpp
@@ -411,7 +411,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr,
{
if (srcDescPtr->c != 3)
{
- return RPP_ERROR_INVALID_ARGUMENTS;
+ return RPP_ERROR_INVALID_CHANNELS;
}
RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
@@ -671,6 +671,72 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** color_temperature ********************/
+
+RppStatus rppt_color_temperature_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp8s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c != 3)
+ {
+ return RPP_ERROR_INVALID_CHANNELS;
+ }
+
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ color_temperature_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ adjustmentValueTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ color_temperature_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ adjustmentValueTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ color_temperature_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ adjustmentValueTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ color_temperature_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ adjustmentValueTensor,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -887,7 +953,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr,
#ifdef HIP_COMPILE
if (srcDescPtr->c != 3)
{
- return RPP_ERROR_INVALID_ARGUMENTS;
+ return RPP_ERROR_INVALID_CHANNELS;
}
Rpp32u paramIndex = 0;
@@ -958,7 +1024,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr,
#ifdef HIP_COMPILE
if (srcDescPtr->c != 3)
{
- return RPP_ERROR_INVALID_ARGUMENTS;
+ return RPP_ERROR_INVALID_CHANNELS;
}
Rpp32u paramIndex = 0;
@@ -1204,4 +1270,71 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** color_temperature ********************/
+
+RppStatus rppt_color_temperature_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *adjustmentValueTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcDescPtr->c != 3)
+ {
+ return RPP_ERROR_INVALID_CHANNELS;
+ }
+
+ Rpp32u paramIndex = 0;
+ copy_param_int(adjustmentValueTensor, rpp::deref(rppHandle), paramIndex++);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_color_temperature_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_color_temperature_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_color_temperature_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_color_temperature_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
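Both color_temperature entry points reject non-3-channel inputs with the new RPP_ERROR_INVALID_CHANNELS and take one adjustment value per image; note the asymmetry that the host API takes Rpp8s adjustments while the HIP API takes Rpp32s. A hedged host-side fragment mirroring the test-suite call (buffer and descriptor names assumed):

    // Hypothetical fragment: apply the same warm shift to every image in the batch.
    Rpp8s adjustment[batchSize];
    for (int i = 0; i < batchSize; i++)
        adjustment[i] = 70;            // sign/range convention assumed from the tests
    rppt_color_temperature_host(input, srcDescPtr, output, dstDescPtr,
                                adjustment, roiTensorPtrSrc, roiTypeSrc, handle);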
diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp
index da1036256..fff62d085 100644
--- a/src/modules/rppt_tensor_geometric_augmentations.cpp
+++ b/src/modules/rppt_tensor_geometric_augmentations.cpp
@@ -1036,11 +1036,11 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr,
else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH;
if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
@@ -1823,11 +1823,11 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr,
rppHandle_t rppHandle)
{
#ifdef HIP_COMPILE
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_SRC_DST_LAYOUT_MISMATCH;
if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp
index f17028e5e..28313a88f 100644
--- a/src/modules/rppt_tensor_statistical_operations.cpp
+++ b/src/modules/rppt_tensor_statistical_operations.cpp
@@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** tensor_min ********************/
+
+RppStatus rppt_tensor_min_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (minArrLength < srcDescPtr->n) // 1 min for each image
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_min_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_min_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp16f*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_min_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_min_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** tensor_max ********************/
+
+RppStatus rppt_tensor_max_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (maxArrLength < srcDescPtr->n) // 1 max for each image
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (maxArrLength < srcDescPtr->n * 4) // max of each channel, and max of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_max_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_max_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp16f*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_max_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_max_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
@@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+
+/******************** tensor_min ********************/
+
+RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t imageMinArr,
+ Rpp32u imageMinArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (imageMinArrLength < srcDescPtr->n) // min of single channel
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_min(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_min((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<half*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_min((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_min(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** tensor_max ********************/
+
+RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t imageMaxArr,
+ Rpp32u imageMaxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (imageMaxArrLength < srcDescPtr->n) // max of single channel
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_max(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_max((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<half*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_max((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_max(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
#endif // backend
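Across all four min/max entry points the caller sizes the result array by channel count: one value per image for single-channel batches, four per image (three per-channel results plus the overall result) for 3-channel batches. A short sketch of sizing and calling the U8 host path (buffer names illustrative):

    // Result layout per 3-channel image: [ch0, ch1, ch2, overall]; U8 in gives U8 out.
    Rpp32u resultLength = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4;
    std::vector<Rpp8u> minArr(resultLength);
    rppt_tensor_min_host(input, srcDescPtr, minArr.data(), resultLength,
                         roiTensorPtrSrc, roiTypeSrc, handle);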
diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp
index 250ceadfc..e298ebd99 100644
--- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp
+++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp
@@ -1356,8 +1356,8 @@ int main(int argc, char **argv)
for (i = 0; i < images; i++)
{
- dstSize[i].height = srcSize[i].height / 3;
- dstSize[i].width = srcSize[i].width / 1.1;
+ dstSize[i].height = srcSize[i].height / 2;
+ dstSize[i].width = srcSize[i].width / 2;
if (maxDstHeight < dstSize[i].height)
maxDstHeight = dstSize[i].height;
if (maxDstWidth < dstSize[i].width)
diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp
index fbffdbe68..dc8679e5d 100644
--- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp
+++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp
@@ -1357,8 +1357,8 @@ int main(int argc, char **argv)
for (i = 0; i < images; i++)
{
- dstSize[i].height = srcSize[i].height / 3;
- dstSize[i].width = srcSize[i].width / 1.1;
+ dstSize[i].height = srcSize[i].height / 2;
+ dstSize[i].width = srcSize[i].width / 2;
if (maxDstHeight < dstSize[i].height)
maxDstHeight = dstSize[i].height;
if (maxDstWidth < dstSize[i].width)
diff --git a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp
index ed1e7751b..271ed3d1c 100644
--- a/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp
+++ b/utilities/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp
@@ -1459,8 +1459,8 @@ int main(int argc, char **argv)
for (i = 0; i < images; i++)
{
- dstSize[i].height = srcSize[i].height / 3;
- dstSize[i].width = srcSize[i].width / 1.1;
+ dstSize[i].height = srcSize[i].height / 2;
+ dstSize[i].width = srcSize[i].width / 2;
if (maxDstHeight < dstSize[i].height)
maxDstHeight = dstSize[i].height;
if (maxDstWidth < dstSize[i].width)
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 04831ddf4..7bd46b39e 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -65,12 +65,12 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
- bool reductionTypeCase = (testCase == 87);
unsigned int verbosity = atoi(argv[11]);
unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1;
@@ -104,7 +104,7 @@ int main(int argc, char **argv)
if (layoutType == 2)
{
- if(testCase == 36 || testCase == 31 || testCase == 86)
+ if(testCase == 36 || testCase == 31 || testCase == 45 || testCase == 86)
{
printf("\ncase %d does not exist for PLN1 layout\n", testCase);
return -1;
@@ -323,35 +323,20 @@ int main(int argc, char **argv)
double wallTime;
string testCaseName;
- if(testCase == 82 && imagesMixed)
- {
- std::cerr<<"\n RICAP only works with same dimension images";
- exit(0);
- }
-
- if(testCase == 82 && batchSize < 2)
- {
- std::cerr<<"\n RICAP only works with BatchSize > 1";
- exit(0);
- }
-
- // Initialize buffers for any reductionType functions
+ // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums)
void *reductionFuncResultArr;
Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
-
- if(reductionTypeCase)
+ if (reductionTypeCase)
{
- if(dstDescPtr->dataType == RpptDataType::U8)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u)));
- else if(dstDescPtr->dataType == RpptDataType::F16)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::F32)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::I8)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s)));
+ int bitDepthByteSize = 0;
+ if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
+ bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
+ else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
+ CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
}
- //Allocate hip memory for src/dst
+ // Allocate hip memory for src/dst
CHECK(hipMalloc(&d_input, inputBufferSize));
CHECK(hipMalloc(&d_output, outputBufferSize));
if(dualInputCase)
@@ -827,6 +812,22 @@ int main(int argc, char **argv)
break;
}
+ case 45:
+ {
+ testCaseName = "color_temperature";
+
+ Rpp32s adjustment[batchSize];
+ for (i = 0; i < batchSize; i++)
+ adjustment[i] = 70;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_color_temperature_gpu(d_input, srcDescPtr, d_output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 49:
{
testCaseName = "box_filter";
@@ -859,6 +860,18 @@ int main(int argc, char **argv)
break;
}
+ case 61:
+ {
+ testCaseName = "magnitude";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_magnitude_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 63:
{
testCaseName = "phase";
@@ -1028,6 +1041,30 @@ int main(int argc, char **argv)
break;
}
+ case 88:
+ {
+ testCaseName = "tensor_min";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 89:
+ {
+ testCaseName = "tensor_max";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1055,33 +1092,41 @@ int main(int argc, char **argv)
if(srcDescPtr->c == 3)
printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): ");
else if(srcDescPtr->c == 1)
+ {
printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): ");
+ reductionFuncResultArrLength = srcDescPtr->n;
+ }
- if(dstDescPtr->dataType == RpptDataType::U8)
+ // print reduction functions output array based on different bit depths, and precision desired
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::U8)
{
- Rpp64u *reductionOutPtr = static_cast<Rpp64u*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %llu ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F16)
+ else if (dstDescPtr->dataType == RpptDataType::F16)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp16f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F32)
+ else if (dstDescPtr->dataType == RpptDataType::F32)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::I8)
+ else if (dstDescPtr->dataType == RpptDataType::I8)
{
- Rpp64s *reductionOutPtr = static_cast<Rpp64s*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %lld ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
-
printf("\n");
/*Compare the output of the function with golden outputs only if
@@ -1089,7 +1134,12 @@ int main(int argc, char **argv)
2.input bit depth 0 (U8)
3.source and destination layout are the same*/
if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase))
- compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ {
+ if (testCase == 87)
+ compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ else
+ compare_reduction_output(static_cast<Rpp8u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ }
}
else
{
@@ -1175,4 +1225,4 @@ int main(int argc, char **argv)
CHECK(hipFree(d_input_second));
CHECK(hipFree(d_output));
return 0;
-}
\ No newline at end of file
+}
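The test harness above now derives the reduction result element size from both bit depth and case number: tensor_sum (case 87) needs 64-bit accumulators for 8-bit inputs, while tensor_min/max (cases 88/89) return results in the input's own depth, and 16f/32f inputs both report 32f results. The same rule, restated as a hypothetical helper:

    // Sketch only: element size for the reduction result buffer.
    static size_t reductionResultByteSize(RpptDataType dataType, int testCase)
    {
        if ((dataType == RpptDataType::U8) || (dataType == RpptDataType::I8))
            return (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
        return sizeof(Rpp32f);    // 32f results cover both 16f and 32f inputs
    }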
diff --git a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
index f4741ad78..e8dc4e365 100644
--- a/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_voxel_hip.cpp
@@ -55,8 +55,6 @@ int main(int argc, char * argv[])
fprintf(stdout, "\nUsage: %s \n", argv[0]);
exit(1);
}
-
-
if(batchSize > MAX_BATCH_SIZE)
{
std::cout << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!";
@@ -268,6 +266,38 @@ int main(int argc, char * argv[])
break;
}
+ case 2:
+ {
+ testCaseName = "add_scalar";
+ Rpp32f addTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ addTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_add_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, addTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 3:
+ {
+ testCaseName = "subtract_scalar";
+ Rpp32f subtractTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ subtractTensor[i] = 40;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_subtract_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, subtractTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 4:
{
testCaseName = "flip_voxel";
@@ -292,6 +322,22 @@ int main(int argc, char * argv[])
break;
}
+ case 5:
+ {
+ testCaseName = "multiply_scalar";
+ Rpp32f mulTensor[batchSize];
+
+ for (int i = 0; i < batchSize; i++)
+ mulTensor[i] = 80;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 2)
+ rppt_multiply_scalar_gpu(d_inputF32, descriptorPtr3D, d_outputF32, descriptorPtr3D, mulTensor, roiGenericSrcPtr, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
{
missingFuncFlag = 1;
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 6150ad97c..2e8054332 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -153,7 +153,7 @@ def get_log_file_list(preserveOutput):
# Functionality group finder
def func_group_finder(case_number):
- if case_number < 5 or case_number == 13 or case_number == 36:
+ if case_number < 5 or case_number == 13 or case_number == 36 or case_number == 45:
return "color_augmentations"
elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84:
return "effects_augmentations"
@@ -165,6 +165,8 @@ def func_group_finder(case_number):
return "filter_augmentations"
elif case_number < 40:
return "geometric_augmentations"
+ elif case_number == 61:
+ return "arithmetic_operations"
elif case_number < 87:
return "data_exchange_operations"
elif case_number < 88:
@@ -313,11 +315,11 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1")
parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2")
- parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)")
- parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)")
- parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
- parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
- parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False)
+ parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)")
+ parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)")
+ parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)")
+ parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False)
+ parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False)
parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False)
parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)")
parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests")
@@ -332,8 +334,8 @@ def rpp_test_suite_parser_and_validator():
validate_path(qaInputFile)
# validate the parameters passed by user
- if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)):
- print("Starting case# and Ending case# must be in the 0:87 range. Aborting!")
+ if ((args.case_start < 0 or args.case_start > 90) or (args.case_end < 0 or args.case_end > 90)):
+ print("Starting case# and Ending case# must be in the 0:90 range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -347,7 +349,7 @@ def rpp_test_suite_parser_and_validator():
elif args.decoder_type < 0 or args.decoder_type > 1:
print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting")
exit(0)
- elif args.case_list is not None and args.case_start > 0 and args.case_end < 87:
+ elif args.case_list is not None and args.case_start > 0 and args.case_end < 90:
print("Invalid input! Please provide only 1 option between case_list, case_start and case_end")
exit(0)
elif args.num_runs <= 0:
@@ -374,8 +376,8 @@ def rpp_test_suite_parser_and_validator():
args.case_list = [str(x) for x in args.case_list]
else:
for case in args.case_list:
- if int(case) < 0 or int(case) > 87:
- print("The case# must be in the 0:87 range!")
+ if int(case) < 0 or int(case) > 90:
+ print("The case# must be in the 0:90 range!")
exit(0)
return args
@@ -456,8 +458,8 @@ def rpp_test_suite_parser_and_validator():
if qaMode == 1 and case != "82":
srcPath1 = inFilePath1
srcPath2 = inFilePath2
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
for layout in range(3):
dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath)
@@ -474,8 +476,8 @@ def rpp_test_suite_parser_and_validator():
else:
if (testType == 1 and profilingOption == "NO"):
for case in caseList:
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv:
srcPath1 = ricapInFilePath
@@ -489,8 +491,8 @@ def rpp_test_suite_parser_and_validator():
NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69]
for case in caseList:
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv:
srcPath1 = ricapInFilePath
@@ -627,7 +629,9 @@ def rpp_test_suite_parser_and_validator():
"effects_augmentations",
"filter_augmentations",
"geometric_augmentations",
- "morphological_operations"
+ "morphological_operations",
+ "arithmetic_operations",
+ "statistical_operations"
]
for log_file in log_file_list:
# Opening log file
@@ -692,7 +696,7 @@ def rpp_test_suite_parser_and_validator():
f.close()
# print the results of qa tests
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '54', '63', '70', '80', '82', '83', '84', '85', '86', '87']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
@@ -717,4 +721,4 @@ def rpp_test_suite_parser_and_validator():
resultsInfo += "\n - Total augmentations with golden output QA test support = " + str(len(supportedCaseList) - len(nonQACaseList))
resultsInfo += "\n - Total augmentations without golden ouput QA test support (due to randomization involved) = " + str(len(nonQACaseList))
f.write(resultsInfo)
- print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------")
\ No newline at end of file
+ print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------")
diff --git a/utilities/test_suite/HIP/runTests_voxel.py b/utilities/test_suite/HIP/runTests_voxel.py
index 2f007ecaa..b6648affb 100644
--- a/utilities/test_suite/HIP/runTests_voxel.py
+++ b/utilities/test_suite/HIP/runTests_voxel.py
@@ -39,7 +39,7 @@
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
caseMin = 0
-caseMax = 4
+caseMax = 5
# Check if folder path is empty, if it is the root folder, or if it exists, and remove its contents
def validate_and_remove_contents(path):
@@ -258,8 +258,8 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--header_path", type = str, default = headerFilePath, help = "Path to the nii header")
parser.add_argument("--data_path", type = str, default = dataFilePath, help = "Path to the nii data file")
- parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing range starting case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
- parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing range ending case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
+ parser.add_argument("--case_start", type = int, default = caseMin, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
+ parser.add_argument("--case_end", type = int, default = caseMax, help = "Testing start case # - Range must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False)
@@ -309,8 +309,8 @@ def rpp_test_suite_parser_and_validator():
else:
for case in args.case_list:
if int(case) < caseMin or int(case) > caseMax:
- print("The case# must be in the 0:1 range!")
- exit(0)
+ print("The case# must be in [" + str(caseMin) + ":" + str(caseMax) + "]")
+ exit(0)
# if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs
if args.qa_mode:
@@ -470,7 +470,7 @@ def rpp_test_suite_parser_and_validator():
print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv")
# print the results of qa tests
-supportedCaseList = ['0', '1', '4']
+supportedCaseList = ['0', '1', '2', '3', '4', '5']
nonQACaseList = [] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
diff --git a/utilities/test_suite/HOST/CMakeLists.txt b/utilities/test_suite/HOST/CMakeLists.txt
index 6adc461b3..b7abf5d77 100644
--- a/utilities/test_suite/HOST/CMakeLists.txt
+++ b/utilities/test_suite/HOST/CMakeLists.txt
@@ -82,8 +82,15 @@ if (OpenCV_FOUND)
link_directories(${ROCM_PATH}/lib /usr/local/lib)
add_executable(Tensor_host Tensor_host.cpp)
+ add_executable(BatchPD_host_pkd3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pkd3.cpp)
+ add_executable(BatchPD_host_pln1 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln1.cpp)
+ add_executable(BatchPD_host_pln3 ${ROCM_PATH}/share/rpp/test/rpp-performancetests/HOST_NEW/BatchPD_host_pln3.cpp)
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17")
target_link_libraries(Tensor_host ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST})
+ target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST})
+ target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST})
+ target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lturbojpeg -lrpp pthread ${LINK_LIBRARY_LIST})
else()
message("-- ${Red}Error: OpenCV must be installed to install ${PROJECT_NAME} successfully!${ColourReset}")
endif()
@@ -102,7 +109,7 @@ else()
endif()
if(NOT libsnd_LIBS)
- message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_voxel_host successfully!${ColourReset}")
+ message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_host successfully!${ColourReset}")
else()
message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}")
include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include)
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 1e416ed52..b698a2def 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -65,14 +65,15 @@ int main(int argc, char **argv)
int batchSize = atoi(argv[14]);
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 63);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
bool randomOutputCase = (testCase == 84);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
+
unsigned int verbosity = atoi(argv[11]);
unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1;
- bool reductionTypeCase = (testCase == 87);
int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])};
string scriptPath = argv[19];
@@ -102,7 +103,7 @@ int main(int argc, char **argv)
if (layoutType == 2)
{
- if(testCase == 36 || testCase == 31 || testCase == 86)
+ if(testCase == 31 || testCase == 36 || testCase == 45 || testCase == 86)
{
printf("\ncase %d does not exist for PLN1 layout\n", testCase);
return -1;
@@ -140,6 +141,11 @@ int main(int argc, char **argv)
std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!";
exit(0);
}
+ else if(testCase == 82 && batchSize < 2)
+ {
+ std::cerr<<"\n RICAP only works with BatchSize > 1";
+ exit(0);
+ }
// Get function name
string funcName = augmentationMap[testCase];
@@ -310,6 +316,24 @@ int main(int argc, char **argv)
input_second = static_cast<Rpp8u*>(calloc(inputBufferSize, 1));
output = static_cast<Rpp8u*>(calloc(outputBufferSize, 1));
+ // Initialize buffers for any reductionType functions (testCase 87 - tensor_sum alone cannot return final sum as 8u/8s due to overflow. 8u inputs return 64u sums, 8s inputs return 64s sums)
+ void *reductionFuncResultArr;
+ Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
+ if (reductionTypeCase)
+ {
+ int bitDepthByteSize = 0;
+ if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
+ reductionFuncResultArr = static_cast<void*>(calloc(reductionFuncResultArrLength, bitDepthByteSize));
+ }
+ else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
+ reductionFuncResultArr = static_cast<void*>(calloc(reductionFuncResultArrLength, bitDepthByteSize));
+ }
+ }
+
// Set the number of threads to be used by OpenMP pragma for RPP batch processing on host.
// If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size
Rpp32u numThreads = 0;
@@ -321,33 +345,6 @@ int main(int argc, char **argv)
double cpuTime, wallTime;
string testCaseName;
- if(testCase == 82 && imagesMixed)
- {
- std::cerr<<"\n RICAP only works with same dimension images";
- exit(0);
- }
-
- if(testCase == 82 && batchSize < 2)
- {
- std::cerr<<"\n RICAP only works with BatchSize > 1";
- exit(0);
- }
-
- // Initialize buffers for any reductionType functions
- void *reductionFuncResultArr;
- Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
- if(reductionTypeCase)
- {
- if(dstDescPtr->dataType == RpptDataType::U8)
- reductionFuncResultArr = static_cast<Rpp64u*>(calloc(reductionFuncResultArrLength, sizeof(Rpp64u)));
- else if(dstDescPtr->dataType == RpptDataType::F16)
- reductionFuncResultArr = static_cast<Rpp32f*>(calloc(reductionFuncResultArrLength, sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::F32)
- reductionFuncResultArr = static_cast<Rpp32f*>(calloc(reductionFuncResultArrLength, sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::I8)
- reductionFuncResultArr = static_cast<Rpp64s*>(calloc(reductionFuncResultArrLength, sizeof(Rpp64s)));
- }
-
// case-wise RPP API and measure time script for Unit and Performance test
printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize);
for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++)
@@ -818,6 +815,36 @@ int main(int argc, char **argv)
break;
}
+ case 45:
+ {
+ testCaseName = "color_temperature";
+
+ Rpp8s adjustment[batchSize];
+ for (i = 0; i < batchSize; i++)
+ adjustment[i] = 70;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_color_temperature_host(input, srcDescPtr, output, dstDescPtr, adjustment, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 61:
+ {
+ testCaseName = "magnitude";
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_magnitude_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 63:
{
testCaseName = "phase";
@@ -1032,6 +1059,40 @@ int main(int argc, char **argv)
break;
}
+ case 88:
+ {
+ testCaseName = "tensor_min";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 89:
+ {
+ testCaseName = "tensor_max";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1064,33 +1125,41 @@ int main(int argc, char **argv)
if(srcDescPtr->c == 3)
printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): ");
else if(srcDescPtr->c == 1)
+ {
printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): ");
+ reductionFuncResultArrLength = srcDescPtr->n;
+ }
- if(dstDescPtr->dataType == RpptDataType::U8)
+ // print reduction functions output array based on different bit depths, and precision desired
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::U8)
{
- Rpp64u *reductionOutPtr = static_cast<Rpp64u*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %llu ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F16)
+ else if (dstDescPtr->dataType == RpptDataType::F16)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp16f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F32)
+ else if (dstDescPtr->dataType == RpptDataType::F32)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ print_array(static_cast