diff --git a/.Doxyfile b/.Doxyfile index 77e66d9e9..066a53c02 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -967,8 +967,9 @@ INPUT = README.md \ include/rppt_tensor_geometric_augmentations.h \ include/rppt_tensor_morphological_operations.h \ include/rppt_tensor_statistical_operations.h \ - include/rppt_tensor_arithmetic_operations.h - include/rppt_tensor_audio_augmentations.h + include/rppt_tensor_arithmetic_operations.h \ + include/rppt_tensor_audio_augmentations.h \ + include/rppt_tensor_logical_operations.h # This tag can be used to specify the character encoding of the source files diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index f6cfda25f..bfc270b6a 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -966,7 +966,8 @@ INPUT = ../../README.md \ ../../include/rppt_tensor_filter_augmentations.h \ ../../include/rppt_tensor_geometric_augmentations.h \ ../../include/rppt_tensor_morphological_operations.h \ - ../../include/rppt_tensor_statistical_operations.h + ../../include/rppt_tensor_statistical_operations.h \ + ../../include/rppt_tensor_logical_operations.h # This tag can be used to specify the character encoding of the source files diff --git a/include/rppt.h b/include/rppt.h index 0a20921d8..b466fa373 100644 --- a/include/rppt.h +++ b/include/rppt.h @@ -46,6 +46,7 @@ extern "C" { #include "rppt_tensor_arithmetic_operations.h" #include "rppt_tensor_statistical_operations.h" #include "rppt_tensor_audio_augmentations.h" +#include "rppt_tensor_logical_operations.h" #ifdef __cplusplus } diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index 51705eefc..81ed9cc17 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -228,21 +228,21 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT -/*! \brief Magnitude computation on HOST backend for a NCHW/NHWC layout tensor +/*! \brief Magnitude computation on HIP backend for a NCHW/NHWC layout tensor * \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr.
* \image html img150x150.jpg Sample Input1 * \image html img150x150_2.jpg Sample Input2 * \image html magnitude_operation_img150x150.jpg Sample Output - * \param [in] srcPtr1 source1 tensor in HOST memory - * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) - * \param [out] dstPtr destination tensor in HOST memory + * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithStreamAndBatchSize() + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. @@ -256,4 +256,4 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr src #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h new file mode 100644 index 000000000..fb0049d2c --- /dev/null +++ b/include/rppt_tensor_logical_operations.h @@ -0,0 +1,139 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef RPPT_TENSOR_LOGICAL_OPERATIONS_H +#define RPPT_TENSOR_LOGICAL_OPERATIONS_H + +#include "rpp.h" +#include "rppdefs.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \file + * \brief RPPT Tensor Operations - Logical Operations. 
+ * \defgroup group_rppt_tensor_logical_operations RPPT Tensor Operations - Logical Operations.
+ * \brief RPPT Tensor Operations - Logical Operations.
+ */
+
+/*! \addtogroup group_rppt_tensor_logical_operations
+ * @{
+ */
+
+/*! \brief Bitwise AND computation on HOST backend for a NCHW/NHWC layout tensor
+ * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
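+ * For example, with U8 inputs each destination pixel inside the ROI is computed per channel as dst = src1 & src2.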
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html bitwise_and_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Bitwise AND computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html bitwise_and_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
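+ * For F16/F32 inputs the operation runs on a 0-255 range-translated copy of the 0-1 data (see the note in the bitwise AND kernel source, which these kernels mirror); U8/I8 inputs are operated on directly.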
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html bitwise_or_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Bitwise OR computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input1 + * \image html img150x150_2.jpg Sample Input2 + * \image html bitwise_or_operation_img150x150.jpg Sample Output + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! @} + */ + +#ifdef __cplusplus +} +#endif +#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H \ No newline at end of file diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index d03ec0e79..19121957b 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -719,6 +719,29 @@ inline void rpp_load48_i8pkd3_to_i8pln3(Rpp8s *srcPtr, __m128i *px) px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ } +inline void rpp_load48_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m128i *px) +{ + __m128i pxSrc[8]; + __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15); + __m128i pxMaskRGB = _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15); + + pxSrc[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + pxSrc[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + pxSrc[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */ + pxSrc[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */ + pxSrc[0] = _mm_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ + pxSrc[1] = _mm_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ + pxSrc[2] = _mm_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ + pxSrc[3] = _mm_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get 
[R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ + pxSrc[4] = _mm_unpacklo_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[5] = _mm_unpacklo_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */ + pxSrc[6] = _mm_unpackhi_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[7] = _mm_unpackhi_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */ + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 and add 128 to get u8 from i8 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpackhi_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 and add 128 to get u8 from i8 */ + px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 and add 128 to get u8 from i8 */ +} + inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px) { _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -726,6 +749,13 @@ inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s * _mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px) +{ + _mm_storeu_si128((__m128i *)dstPtrR, _mm_sub_epi8(px[0], xmm_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm_storeu_si128((__m128i *)dstPtrG, _mm_sub_epi8(px[1], xmm_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm_storeu_si128((__m128i *)dstPtrB, _mm_sub_epi8(px[2], xmm_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p) { __m128i pxSrc[8]; @@ -759,6 +789,13 @@ inline void rpp_load48_i8pln3_to_i8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m128i *px) +{ + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) { __m128i pxDst[4]; @@ -774,6 +811,21 @@ inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) +{ + 
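+    // re-interleave the u8-space planar channels to packed RGB, then subtract 128 (xmm_pxConvertI8) on store to return to i8 space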
__m128i pxDst[4]; + __m128i pxZero = _mm_setzero_si128(); + __m128i pxMaskRGBAtoRGB = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); + pxDst[0] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] and pxZero */ + pxDst[1] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] and pxZero */ + pxDst[2] = _mm_unpacklo_epi8(px[0], px[2]); /* unpack 8 lo-pixels of px[0] and px[2] */ + pxDst[3] = _mm_unpackhi_epi8(px[0], px[2]); /* unpack 8 hi-pixels of px[0] and px[2] */ + _mm_storeu_si128((__m128i *)dstPtr, _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ +} + inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) { __m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index d9c0ce02d..1c38846ea 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -1713,6 +1713,34 @@ __device__ __forceinline__ void rpp_hip_math_multiply24_const(d_float24 *src_f24 dst_f24->f4[5] = src_f24->f4[5] * multiplier_f4; } +// d_float8 bitwiseAND + +__device__ __forceinline__ void rpp_hip_math_bitwiseAnd8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) & (uchar)(src2_f8->f1[0])); + dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) & (uchar)(src2_f8->f1[1])); + dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) & (uchar)(src2_f8->f1[2])); + dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) & (uchar)(src2_f8->f1[3])); + dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) & (uchar)(src2_f8->f1[4])); + dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) & (uchar)(src2_f8->f1[5])); + dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) & (uchar)(src2_f8->f1[6])); + dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) & (uchar)(src2_f8->f1[7])); +} + +// d_float8 bitwiseOR + +__device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) | (uchar)(src2_f8->f1[0])); + dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) | (uchar)(src2_f8->f1[1])); + dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) | (uchar)(src2_f8->f1[2])); + dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) | (uchar)(src2_f8->f1[3])); + dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) | (uchar)(src2_f8->f1[4])); + dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) | (uchar)(src2_f8->f1[5])); + dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) | (uchar)(src2_f8->f1[6])); + dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) | (uchar)(src2_f8->f1[7])); +} + __device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x) { float xHalf = 0.5f * x; diff --git 
a/src/modules/cpu/host_tensor_logical_operations.hpp b/src/modules/cpu/host_tensor_logical_operations.hpp new file mode 100644 index 000000000..0fb3fe5eb --- /dev/null +++ b/src/modules/cpu/host_tensor_logical_operations.hpp @@ -0,0 +1,31 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef HOST_TENSOR_LOGICAL_OPERATIONS_HPP +#define HOST_TENSOR_LOGICAL_OPERATIONS_HPP + +#include "kernel/bitwise_and.hpp" +#include "kernel/bitwise_or.hpp" + +#endif // HOST_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/bitwise_and.hpp b/src/modules/cpu/kernel/bitwise_and.hpp new file mode 100644 index 000000000..dea11e2c6 --- /dev/null +++ b/src/modules/cpu/kernel/bitwise_and.hpp @@ -0,0 +1,965 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus bitwise_and_u8_u8_host_tensor(Rpp8u *srcPtr1,
+                                        Rpp8u *srcPtr2,
+                                        RpptDescPtr srcDescPtr,
+                                        Rpp8u *dstPtr,
+                                        RpptDescPtr dstDescPtr,
+                                        RpptROIPtr roiTensorPtrSrc,
+                                        RpptRoiType roiType,
+                                        RppLayoutParams layoutParams,
+                                        rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+        Rpp32u alignedLength = (bufferLength / 48) * 48;
+        Rpp32u vectorIncrement = 48;
+        Rpp32u vectorIncrementPerChannel = 16;
+
+        // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    __m128i p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm_and_si128(p1[0], p2[0]);    // bitwise_and computation
+                    p1[1] = _mm_and_si128(p1[1], p2[1]);    // bitwise_and computation
+                    p1[2] = _mm_and_si128(p1[2], p2[2]);    // bitwise_and computation
+                    rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    *dstPtrTempR++ = static_cast<Rpp8u>(RPPPIXELCHECK((srcPtr1Temp[0] & srcPtr2Temp[0])));
+                    *dstPtrTempG++ = static_cast<Rpp8u>(RPPPIXELCHECK((srcPtr1Temp[1] & srcPtr2Temp[1])));
+                    *dstPtrTempB++ = static_cast<Rpp8u>(RPPPIXELCHECK((srcPtr1Temp[2] & srcPtr2Temp[2])));
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    __m128i p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2);    // simd loads
+                    p1[0] = _mm_and_si128(p1[0], p2[0]);    // bitwise_and computation
+                    p1[1] = _mm_and_si128(p1[1], p2[1]);    // bitwise_and computation
+                    p1[2] = _mm_and_si128(p1[2], p2[2]);    // bitwise_and computation
+                    rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    dstPtrTemp[0] = static_cast<Rpp8u>(RPPPIXELCHECK((*srcPtr1TempR & *srcPtr2TempR)));
+                    dstPtrTemp[1] = static_cast<Rpp8u>(RPPPIXELCHECK((*srcPtr1TempG & *srcPtr2TempG)));
+                    dstPtrTemp[2] = static_cast<Rpp8u>(RPPPIXELCHECK((*srcPtr1TempB & *srcPtr2TempB)));
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            alignedLength = bufferLength & ~15;
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        __m128i p1, p2;
+
+                        p1 = _mm_loadu_si128((__m128i *)srcPtr1Temp);    // simd loads
+                        p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp);    // simd loads
+                        p1 = _mm_and_si128(p1, p2);    // bitwise_and computation
+                        _mm_storeu_si128((__m128i *)dstPtrTemp, p1);    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp++ = static_cast<Rpp8u>(RPPPIXELCHECK((*srcPtr1Temp & *srcPtr2Temp)));
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+/* BitwiseAND is a logical operation defined only on U8/I8 types.
+   For a Rpp32f precision image (pixel values from 0-1), the BitwiseAND is applied on a 0-255
+   range-translated approximation of the original 0-1 decimal-range image.
+   Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */
+RppStatus bitwise_and_f32_f32_host_tensor(Rpp32f *srcPtr1,
+                                          Rpp32f *srcPtr2,
+                                          RpptDescPtr srcDescPtr,
+                                          Rpp32f *dstPtr,
+                                          RpptDescPtr dstDescPtr,
+                                          RpptROIPtr roiTensorPtrSrc,
+                                          RpptRoiType roiType,
+                                          RppLayoutParams layoutParams,
+                                          rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+        Rpp32u alignedLength = (bufferLength / 24) * 24;
+        Rpp32u vectorIncrement = 24;
+        Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+        // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    __m256 p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                    p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255))));    // bitwise_and computation
+                    p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255))));    // bitwise_and computation
+                    p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                    p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+                    p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+                    rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) & (uint)(srcPtr2Temp[0] * 255)) / 255);
+                    *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) & (uint)(srcPtr2Temp[1] * 255)) / 255);
+                    *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) & (uint)(srcPtr2Temp[2] * 255)) / 255);
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    __m256 p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2);    // simd loads
+                    p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                    p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255))));    // bitwise_and computation
+                    p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255))));    // bitwise_and computation
+                    p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                    p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+                    p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+                    rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) & (uint)(*srcPtr2TempR * 255)) / 255);
+                    dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) & (uint)(*srcPtr2TempG * 255)) / 255);
+                    dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) & (uint)(*srcPtr2TempB * 255)) / 255);
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+#if __AVX2__
+            alignedLength = bufferLength & ~7;
+#endif
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+#if __AVX2__
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        __m256 p1[1], p2[1];
+
+                        rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1);    // simd loads
+                        rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2);    // simd loads
+                        p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                        p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                        rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1);    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+#endif
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) & (uint)(*srcPtr2Temp * 255)) / 255);
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
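+    // Reference note: each AVX2 block above is the vector form of the scalar fallback beside it, i.e.
+    //     dst = (float)((uint)(src1 * 255) & (uint)(src2 * 255)) / 255;   // src1, src2 in [0, 1]
+    // scale to 0-255, truncate to int (cvttps), AND, convert back to float, and rescale by 1/255.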
+
+    return RPP_SUCCESS;
+}
+
+RppStatus bitwise_and_f16_f16_host_tensor(Rpp16f *srcPtr1,
+                                          Rpp16f *srcPtr2,
+                                          RpptDescPtr srcDescPtr,
+                                          Rpp16f *dstPtr,
+                                          RpptDescPtr dstDescPtr,
+                                          RpptROIPtr roiTensorPtrSrc,
+                                          RpptRoiType roiType,
+                                          RppLayoutParams layoutParams,
+                                          rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+        Rpp32u alignedLength = (bufferLength / 24) * 24;
+        Rpp32u vectorIncrement = 24;
+        Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+        // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+                    for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                    {
+                        srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+                        srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+                    }
+
+                    __m256 p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2);    // simd loads
+                    p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                    p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255))));    // bitwise_and computation
+                    p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255))));    // bitwise_and computation
+                    p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                    p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+                    p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
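+                    // results are still f32 in p1 here; the store below narrows them back to f16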
+                    rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) & (uint)(srcPtr2Temp[0] * 255)) / 255));
+                    *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) & (uint)(srcPtr2Temp[1] * 255)) / 255));
+                    *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) & (uint)(srcPtr2Temp[2] * 255)) / 255));
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                    {
+                        srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]);
+                        srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]);
+                        srcPtr1Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr1TempB[cnt]);
+
+                        srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2TempR[cnt]);
+                        srcPtr2Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr2TempG[cnt]);
+                        srcPtr2Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr2TempB[cnt]);
+                    }
+
+                    __m256 p1[4], p2[4];
+
+                    rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1);    // simd loads
+                    rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2);    // simd loads
+                    p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                    p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255))));    // bitwise_and computation
+                    p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255))));    // bitwise_and computation
+                    p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                    p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+                    p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+                    rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+#endif
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) & (uint)(*srcPtr2TempR * 255)) / 255));
+                    dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) & (uint)(*srcPtr2TempG * 255)) / 255));
+                    dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) & (uint)(*srcPtr2TempB * 255)) / 255));
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+#if __AVX2__
+            alignedLength = bufferLength & ~7;
+#endif
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+#if __AVX2__
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8];
+
+                        for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                        {
+                            srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+                            srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+                        }
+
+                        __m256 p1[1], p2[1];
+
+                        rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1);    // simd loads
+                        rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2);    // simd loads
+                        p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255))));    // bitwise_and computation
+                        p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+                        rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1);    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+#endif
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) & (uint)(*srcPtr2Temp * 255)) / 255));
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+RppStatus bitwise_and_i8_i8_host_tensor(Rpp8s *srcPtr1,
+                                        Rpp8s *srcPtr2,
+                                        RpptDescPtr srcDescPtr,
+                                        Rpp8s *dstPtr,
+                                        RpptDescPtr dstDescPtr,
+                                        RpptROIPtr roiTensorPtrSrc,
+                                        RpptRoiType roiType,
+                                        RppLayoutParams layoutParams,
+                                        rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+        srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+        srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+        srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+        Rpp32u alignedLength = (bufferLength / 48) * 48;
+        Rpp32u vectorIncrement = 48;
+        Rpp32u vectorIncrementPerChannel = 16;
+
+        // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtr1Row = srcPtr1Channel;
+            srcPtr2Row = srcPtr2Channel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtr1Temp = srcPtr1Row;
+                srcPtr2Temp = srcPtr2Row;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+                {
+                    __m128i p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr1Temp, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr2Temp, p2);    // simd loads
+                    p1[0] = _mm_and_si128(p1[0], p2[0]);    // bitwise_and computation
+                    p1[1] = _mm_and_si128(p1[1], p2[1]);    // bitwise_and computation
+                    p1[2] = _mm_and_si128(p1[2], p2[2]);    // bitwise_and computation
+                    rpp_simd_store(rpp_store48_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);    // simd stores
+
+                    srcPtr1Temp += vectorIncrement;
+                    srcPtr2Temp += vectorIncrement;
+                    dstPtrTempR += vectorIncrementPerChannel;
+                    dstPtrTempG += vectorIncrementPerChannel;
+                    dstPtrTempB += vectorIncrementPerChannel;
+                }
+
+                for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+                {
+                    *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) & (srcPtr2Temp[0] + 128)) - 128));
+                    *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) & (srcPtr2Temp[1] + 128)) - 128));
+                    *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) & (srcPtr2Temp[2] + 128)) - 128));
+
+                    srcPtr1Temp += 3;
+                    srcPtr2Temp += 3;
+                }
+
+                srcPtr1Row += srcDescPtr->strides.hStride;
+                srcPtr2Row += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+            srcPtr1RowR = srcPtr1Channel;
+            srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+            srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+            srcPtr2RowR = srcPtr2Channel;
+            srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+            srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+            {
+                Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+                srcPtr1TempR = srcPtr1RowR;
+                srcPtr1TempG = srcPtr1RowG;
+                srcPtr1TempB = srcPtr1RowB;
+                srcPtr2TempR = srcPtr2RowR;
+                srcPtr2TempG = srcPtr2RowG;
+                srcPtr2TempB = srcPtr2RowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    __m128i p1[3], p2[3];
+
+                    rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1);    // simd loads
+                    rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2);    // simd loads
+                    p1[0] = _mm_and_si128(p1[0], p2[0]);    // bitwise_and computation
+                    p1[1] = _mm_and_si128(p1[1], p2[1]);    // bitwise_and computation
+                    p1[2] = _mm_and_si128(p1[2], p2[2]);    // bitwise_and computation
+                    rpp_simd_store(rpp_store48_u8pln3_to_i8pkd3, dstPtrTemp, p1);    // simd stores
+
+                    srcPtr1TempR += vectorIncrementPerChannel;
+                    srcPtr1TempG += vectorIncrementPerChannel;
+                    srcPtr1TempB += vectorIncrementPerChannel;
+                    srcPtr2TempR += vectorIncrementPerChannel;
+                    srcPtr2TempG += vectorIncrementPerChannel;
+                    srcPtr2TempB += vectorIncrementPerChannel;
+                    dstPtrTemp += vectorIncrement;
+                }
+
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8(((*srcPtr1TempR + 128) & (*srcPtr2TempR + 128)) - 128));
+                    dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8(((*srcPtr1TempG + 128) & (*srcPtr2TempG + 128)) - 128));
+                    dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8(((*srcPtr1TempB + 128) & (*srcPtr2TempB + 128)) - 128));
+
+                    srcPtr1TempR++;
+                    srcPtr1TempG++;
+                    srcPtr1TempB++;
+                    srcPtr2TempR++;
+                    srcPtr2TempG++;
+                    srcPtr2TempB++;
+                    dstPtrTemp += 3;
+                }
+
+                srcPtr1RowR += srcDescPtr->strides.hStride;
+                srcPtr1RowG += srcDescPtr->strides.hStride;
+                srcPtr1RowB += srcDescPtr->strides.hStride;
+                srcPtr2RowR += srcDescPtr->strides.hStride;
+                srcPtr2RowG += srcDescPtr->strides.hStride;
+                srcPtr2RowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            alignedLength = bufferLength & ~15;
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+                srcPtr1Row = srcPtr1Channel;
+                srcPtr2Row = srcPtr2Channel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+                {
+                    Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+                    srcPtr1Temp = srcPtr1Row;
+                    srcPtr2Temp = srcPtr2Row;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                    {
+                        __m128i p1, p2;
+
+                        p1 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr1Temp));    // simd loads
+                        p2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr2Temp));    // simd loads
+                        p1 = _mm_and_si128(p1, p2);    // bitwise_and computation
+                        _mm_storeu_si128((__m128i *)dstPtrTemp, _mm_sub_epi8(p1, xmm_pxConvertI8));    // simd stores
+
+                        srcPtr1Temp += vectorIncrementPerChannel;
+                        srcPtr2Temp += vectorIncrementPerChannel;
+                        dstPtrTemp += vectorIncrementPerChannel;
+                    }
+
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((*srcPtr1Temp + 128) & (*srcPtr2Temp + 128)) - 128));
+
+                        srcPtr1Temp++;
+                        srcPtr2Temp++;
+                    }
+
+                    srcPtr1Row += srcDescPtr->strides.hStride;
+                    srcPtr2Row += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtr1Channel += srcDescPtr->strides.cStride;
+                srcPtr2Channel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/bitwise_or.hpp b/src/modules/cpu/kernel/bitwise_or.hpp
new file mode 100644
index 000000000..21e3b9f5f
--- /dev/null
+++ b/src/modules/cpu/kernel/bitwise_or.hpp
@@ -0,0 +1,965 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus bitwise_or_u8_u8_host_tensor(Rpp8u *srcPtr1, + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + + // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation + p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation + p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation + rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECK((srcPtr1Temp[0] | srcPtr2Temp[0]))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECK((srcPtr1Temp[1] | srcPtr2Temp[1]))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECK((srcPtr1Temp[2] | srcPtr2Temp[2]))); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += 
srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation + p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation + p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation + rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECK((*srcPtr1TempR | *srcPtr2TempR))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECK((*srcPtr1TempG | *srcPtr2TempG))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECK((*srcPtr1TempB | *srcPtr2TempB))); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + alignedLength = bufferLength & ~15; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1, p2; + + p1 = 
_mm_loadu_si128((__m128i *)srcPtr1Temp); // simd loads + p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp); // simd loads + p1 = _mm_or_si128(p1, p2); // bitwise_or computation + _mm_storeu_si128((__m128i *)dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECK((*srcPtr1Temp | *srcPtr2Temp))); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +/* BitwiseOR is logical operation only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the BitwiseOR is applied on a 0-255 + range-translated approximation, of the original 0-1 decimal-range image. + Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */ +RppStatus bitwise_or_f32_f32_host_tensor(Rpp32f *srcPtr1, + Rpp32f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < 
alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) | (uint)(srcPtr2Temp[0] * 255)) / 255); + *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) | (uint)(srcPtr2Temp[1] * 255)) / 255); + *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) | (uint)(srcPtr2Temp[2] * 255)) / 255); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[1] = 
_mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) | (uint)(*srcPtr2TempR * 255)) / 255); + dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) | (uint)(*srcPtr2TempG * 255)) / 255); + dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) | (uint)(*srcPtr2TempB * 255)) / 255); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) | (uint)(*srcPtr2Temp * 255)) / 255); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + 
return RPP_SUCCESS; +} + +RppStatus bitwise_or_f16_f16_host_tensor(Rpp16f *srcPtr1, + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]); + } + + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) | (uint)(srcPtr2Temp[0] * 255)) / 255)); + *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) | (uint)(srcPtr2Temp[1] * 255)) / 255)); + *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) | (uint)(srcPtr2Temp[2] * 255)) / 255)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]); + srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]); + srcPtr1Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr1TempB[cnt]); + + srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2TempR[cnt]); + srcPtr2Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr2TempG[cnt]); + srcPtr2Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr2TempB[cnt]); + } + + __m256 p1[4], p2[4]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) | (uint)(*srcPtr2TempR * 255)) / 255)); + dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) | (uint)(*srcPtr2TempG * 255)) / 255)); + dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) | (uint)(*srcPtr2TempB * 255)) / 255)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]); + } + + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) | (uint)(*srcPtr2Temp * 255)) / 255)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus bitwise_or_i8_i8_host_tensor(Rpp8s *srcPtr1, + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + + // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation + p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation + p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation + rpp_simd_store(rpp_store48_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) | (srcPtr2Temp[0] + 128)) - 128)); + *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) | (srcPtr2Temp[1] + 128)) - 128)); + *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) | (srcPtr2Temp[2] + 128)) - 128)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation + p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation + p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation + rpp_simd_store(rpp_store48_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempR + 128) | static_cast<Rpp8u>(*srcPtr2TempR + 128)) - 128)); + dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempG + 128) | static_cast<Rpp8u>(*srcPtr2TempG + 128)) - 128)); + dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempB + 128) | static_cast<Rpp8u>(*srcPtr2TempB + 128)) - 128)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + alignedLength = bufferLength & ~15; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1, p2; + + p1 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr1Temp)); // simd loads + p2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr2Temp)); // simd loads + p1 = _mm_or_si128(p1, p2); // bitwise_or computation + _mm_storeu_si128((__m128i *)dstPtrTemp, _mm_sub_epi8(p1, xmm_pxConvertI8)); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1Temp + 128) | static_cast<Rpp8u>(*srcPtr2Temp + 128)) - 128)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_logical_operations.hpp b/src/modules/hip/hip_tensor_logical_operations.hpp new file mode 100644 index 000000000..636789246 --- /dev/null +++ b/src/modules/hip/hip_tensor_logical_operations.hpp @@ -0,0 +1,31 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef HIP_TENSOR_LOGICAL_OPERATIONS_HPP +#define HIP_TENSOR_LOGICAL_OPERATIONS_HPP + +#include "kernel/bitwise_and.hpp" +#include "kernel/bitwise_or.hpp" + +#endif // HIP_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/hip/kernel/bitwise_and.hpp b/src/modules/hip/kernel/bitwise_and.hpp new file mode 100644 index 000000000..ca9f30c11 --- /dev/null +++ b/src/modules/hip/kernel/bitwise_and.hpp @@ -0,0 +1,247 @@ +#include <hip/hip_runtime.h> +#include "rpp_hip_common.hpp" + +/* BitwiseAND is a logical operation defined only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the BitwiseAND is applied on a 0-255 + range-translated approximation of the original 0-1 decimal-range image.
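 For example, with truncation toward zero, src1 = 0.5f and src2 = 0.25f scale to 127 and 63; 127 & 63 = 63, so the stored result is 63/255 ~ 0.247f. + The scalar form of the same math, which the kernels below vectorize, is: dst = (float)((uint)(src1 * 255) & (uint)(src2 * 255)) / 255;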
+ Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */ +template <typename T> +__device__ void bitwise_and_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr ((std::is_same<T, Rpp32f>::value) || (std::is_same<T, Rpp16f>::value)) + { + rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255); + rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255); + rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255); + } + else if constexpr (std::is_same<T, Rpp8s>::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); + rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128); + } + else + rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8); +} + +template <typename T> +__global__ void bitwise_and_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +__global__ void bitwise_and_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
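+ + // advance by one more channel stride to the third plane and repeat the same 8-pixel load, bitwise AND, and store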
+ + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template <typename T> +__global__ void bitwise_and_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template <typename T> +__global__ void bitwise_and_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +RppStatus hip_exec_bitwise_and_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(bitwise_and_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(bitwise_and_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(bitwise_and_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(bitwise_and_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/bitwise_or.hpp b/src/modules/hip/kernel/bitwise_or.hpp new file mode 100644 index 000000000..ab0c962ef --- /dev/null +++ b/src/modules/hip/kernel/bitwise_or.hpp @@ -0,0 +1,247 @@ +#include <hip/hip_runtime.h> +#include "rpp_hip_common.hpp" + +/* BitwiseOR is a logical operation defined only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the BitwiseOR is applied on a 0-255 + range-translated approximation of the original 0-1 decimal-range image.
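 For example, 0.5f and 0.25f scale to 127 and 63; 127 | 63 = 127, so the stored result is 127/255 ~ 0.498f. + I8 inputs are instead offset by +128 into the 0-255 range, OR-ed, then offset back by -128, as in bitwise_or_hip_compute below.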
+ Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */ +template <typename T> +__device__ void bitwise_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr ((std::is_same<T, Rpp32f>::value) || (std::is_same<T, Rpp16f>::value)) + { + rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255); + rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255); + rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255); + } + else if constexpr (std::is_same<T, Rpp8s>::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); + rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128); + } + else + rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8); +} + +template <typename T> +__global__ void bitwise_or_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +__global__ void bitwise_or_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template <typename T> +__global__ void bitwise_or_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template <typename T> +__global__ void bitwise_or_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +RppStatus hip_exec_bitwise_or_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(bitwise_or_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X),
ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(bitwise_or_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(bitwise_or_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(bitwise_or_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp new file mode 100644 index 000000000..7d28fe96b --- /dev/null +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -0,0 +1,300 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
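
The compute helper above relies on the range-mapping trick from the linked Stack Overflow answer: bitwise operators are not defined for floating-point types, so normalized F16/F32 values are scaled onto the 0-255 integer range (and scaled back by 1/255 afterwards), while signed I8 values are shifted by +128 into unsigned range and shifted back. A minimal scalar sketch of the same idea, assuming normalized [0, 1] float inputs; the helper names here are hypothetical, not RPP APIs:

// Scalar illustration of the mapping used by bitwise_or_hip_compute above;
// bitwise_or_normalized / bitwise_or_signed are hypothetical helpers.
#include <cstdint>
#include <cmath>

// F16/F32 path: scale [0, 1] onto [0, 255], OR as 8-bit integers, scale back.
float bitwise_or_normalized(float a, float b)
{
    uint8_t ia = static_cast<uint8_t>(std::lround(a * 255.0f));
    uint8_t ib = static_cast<uint8_t>(std::lround(b * 255.0f));
    return static_cast<float>(ia | ib) * (1.0f / 255.0f);
}

// I8 path: shift [-128, 127] into [0, 255], OR, shift back,
// mirroring the add-128 / subtract-128 branch above.
int8_t bitwise_or_signed(int8_t a, int8_t b)
{
    int ua = a + 128;
    int ub = b + 128;
    return static_cast<int8_t>((ua | ub) - 128);
}
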
diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp
new file mode 100644
index 000000000..7d28fe96b
--- /dev/null
+++ b/src/modules/rppt_tensor_logical_operations.cpp
@@ -0,0 +1,300 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rppi_validate.hpp"
+#include "rppt_tensor_logical_operations.h"
+#include "cpu/host_tensor_logical_operations.hpp"
+
+#ifdef HIP_COMPILE
+    #include <hip/hip_fp16.h>
+    #include "hip/hip_tensor_logical_operations.hpp"
+#endif // HIP_COMPILE
+
+/******************** bitwise AND ********************/
+
+RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1,
+                                RppPtr_t srcPtr2,
+                                RpptDescPtr srcDescPtr,
+                                RppPtr_t dstPtr,
+                                RpptDescPtr dstDescPtr,
+                                RpptROIPtr roiTensorPtrSrc,
+                                RpptRoiType roiType,
+                                rppHandle_t rppHandle)
+{
+    RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+    if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+    {
+        bitwise_and_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                      static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                      srcDescPtr,
+                                      static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                      dstDescPtr,
+                                      roiTensorPtrSrc,
+                                      roiType,
+                                      layoutParams,
+                                      rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+    {
+        bitwise_and_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                        reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                        srcDescPtr,
+                                        reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                        dstDescPtr,
+                                        roiTensorPtrSrc,
+                                        roiType,
+                                        layoutParams,
+                                        rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+    {
+        bitwise_and_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                        reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                        srcDescPtr,
+                                        reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                        dstDescPtr,
+                                        roiTensorPtrSrc,
+                                        roiType,
+                                        layoutParams,
+                                        rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+    {
+        bitwise_and_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                      static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                      srcDescPtr,
+                                      static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                      dstDescPtr,
+                                      roiTensorPtrSrc,
+                                      roiType,
+                                      layoutParams,
+                                      rpp::deref(rppHandle));
+    }
+
+    return RPP_SUCCESS;
+}
+
+/******************** bitwise OR ********************/
+
+RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1,
+                               RppPtr_t srcPtr2,
+                               RpptDescPtr srcDescPtr,
+                               RppPtr_t dstPtr,
+                               RpptDescPtr dstDescPtr,
+                               RpptROIPtr roiTensorPtrSrc,
+                               RpptRoiType roiType,
+                               rppHandle_t rppHandle)
+{
+    RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+    if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+    {
+        bitwise_or_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                     static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                     srcDescPtr,
+                                     static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                     dstDescPtr,
+                                     roiTensorPtrSrc,
+                                     roiType,
+                                     layoutParams,
+                                     rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+    {
+        bitwise_or_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                       reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                       srcDescPtr,
+                                       reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                       dstDescPtr,
+                                       roiTensorPtrSrc,
+                                       roiType,
+                                       layoutParams,
+                                       rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+    {
+        bitwise_or_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                       reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                       srcDescPtr,
+                                       reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                       dstDescPtr,
+                                       roiTensorPtrSrc,
+                                       roiType,
+                                       layoutParams,
+                                       rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+    {
+        bitwise_or_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                     static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                     srcDescPtr,
+                                     static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                     dstDescPtr,
+                                     roiTensorPtrSrc,
+                                     roiType,
+                                     layoutParams,
+                                     rpp::deref(rppHandle));
+    }
+
+    return RPP_SUCCESS;
+}
+
+
+/********************************************************************************************************************/
+/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
+/********************************************************************************************************************/
+
+#ifdef GPU_SUPPORT
+
+/******************** bitwise AND ********************/
+
+RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1,
+                               RppPtr_t srcPtr2,
+                               RpptDescPtr srcDescPtr,
+                               RppPtr_t dstPtr,
+                               RpptDescPtr dstDescPtr,
+                               RpptROIPtr roiTensorPtrSrc,
+                               RpptRoiType roiType,
+                               rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+    if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+    {
+        hip_exec_bitwise_and_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                    static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                    srcDescPtr,
+                                    static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                    dstDescPtr,
+                                    roiTensorPtrSrc,
+                                    roiType,
+                                    rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+    {
+        hip_exec_bitwise_and_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                    reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                    srcDescPtr,
+                                    reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                    dstDescPtr,
+                                    roiTensorPtrSrc,
+                                    roiType,
+                                    rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+    {
+        hip_exec_bitwise_and_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                    srcDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                    dstDescPtr,
+                                    roiTensorPtrSrc,
+                                    roiType,
+                                    rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+    {
+        hip_exec_bitwise_and_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                    static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                    srcDescPtr,
+                                    static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                    dstDescPtr,
+                                    roiTensorPtrSrc,
+                                    roiType,
+                                    rpp::deref(rppHandle));
+    }
+
+    return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+    return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** bitwise OR ********************/
+
+RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1,
+                              RppPtr_t srcPtr2,
+                              RpptDescPtr srcDescPtr,
+                              RppPtr_t dstPtr,
+                              RpptDescPtr dstDescPtr,
+                              RpptROIPtr roiTensorPtrSrc,
+                              RpptRoiType roiType,
+                              rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+    if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+    {
+        hip_exec_bitwise_or_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                   static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                   srcDescPtr,
+                                   static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                   dstDescPtr,
+                                   roiTensorPtrSrc,
+                                   roiType,
+                                   rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+    {
+        hip_exec_bitwise_or_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                   reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                   dstDescPtr,
+                                   roiTensorPtrSrc,
+                                   roiType,
+                                   rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+    {
+        hip_exec_bitwise_or_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                   reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+                                   dstDescPtr,
+                                   roiTensorPtrSrc,
+                                   roiType,
+                                   rpp::deref(rppHandle));
+    }
+    else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+    {
+        hip_exec_bitwise_or_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                   static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                   srcDescPtr,
+                                   static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+                                   dstDescPtr,
+                                   roiTensorPtrSrc,
+                                   roiType,
+                                   rpp::deref(rppHandle));
+    }
+
+    return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+    return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+#endif // GPU_SUPPORT
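
One sharp edge in the two GPU wrappers above: if the source/destination data types match none of the four branches (for example, a mixed-depth tensor pair), execution falls through every `else if` and the functions still return RPP_SUCCESS. A stricter tail for the dispatch chain could look like the following sketch; it is not part of this change and simply reuses the error code the OCL path already returns:

    else
        return RPP_ERROR_NOT_IMPLEMENTED;  // unmatched U8/F16/F32/I8 combination; avoids silently reporting success
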
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 7bd46b39e..cc31d506d 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -65,7 +65,7 @@ int main(int argc, char **argv)
     bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
     bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
-    bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
+    bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
     bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54);
     bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
     bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
@@ -884,6 +884,30 @@ int main(int argc, char **argv)
             break;
         }
+        case 65:
+        {
+            testCaseName = "bitwise_and";
+
+            startWallTime = omp_get_wtime();
+            if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+                rppt_bitwise_and_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
+        case 68:
+        {
+            testCaseName = "bitwise_or";
+
+            startWallTime = omp_get_wtime();
+            if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+                rppt_bitwise_or_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
         case 70:
         {
             testCaseName = "copy";
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 2e8054332..0d16b779c 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -165,8 +165,10 @@ def func_group_finder(case_number):
         return "filter_augmentations"
     elif case_number < 40:
         return "geometric_augmentations"
-    elif case_number == 61:
+    elif case_number < 62:
         return "arithmetic_operations"
+    elif case_number < 69:
+        return "logical_operations"
     elif case_number < 87:
         return "data_exchange_operations"
     elif case_number < 88:
@@ -631,6 +633,7 @@ def rpp_test_suite_parser_and_validator():
         "geometric_augmentations",
         "morphological_operations",
         "arithmetic_operations",
+        "logical_operations",
         "statistical_operations"
     ]
     for log_file in log_file_list:
@@ -696,7 +699,7 @@ def rpp_test_suite_parser_and_validator():
     f.close()
 
 # print the results of qa tests
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
 nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
 if qaMode and testType == 0:
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index b698a2def..7d2c8db7f 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -65,7 +65,7 @@ int main(int argc, char **argv)
     int batchSize = atoi(argv[14]);
 
     bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24);
-    bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
+    bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
     bool randomOutputCase = (testCase == 84);
     bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
     bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
@@ -858,6 +858,32 @@ int main(int argc, char **argv)
             break;
         }
+        case 65:
+        {
+            testCaseName = "bitwise_and";
+
+            startWallTime = omp_get_wtime();
+            startCpuTime = clock();
+            if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+                rppt_bitwise_and_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
+        case 68:
+        {
+            testCaseName = "bitwise_or";
+
+            startWallTime = omp_get_wtime();
+            startCpuTime = clock();
+            if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+                rppt_bitwise_or_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1;
+
+            break;
+        }
         case 70:
         {
             testCaseName = "copy";
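
For reference, cases 65 and 68 above call the new entry points with two inputs of identical depth and geometry. A hedged sketch of a direct host-side call using the same buffer names as the test (allocation, descriptor population, and handle creation are elided and assumed to follow the usual RPP test-suite setup):

// Minimal call sketch for the new API; srcDesc/dstDesc must describe
// same-depth, same-size tensors, e.g. U8 NHWC, per the dispatch above.
RpptDesc srcDesc, dstDesc;
RpptDescPtr srcDescPtr = &srcDesc, dstDescPtr = &dstDesc;

RpptROI roi;                         // one ROI per image in the batch
roi.xywhROI.xy.x = 0;                // full-frame ROI
roi.xywhROI.xy.y = 0;
roi.xywhROI.roiWidth = srcDesc.w;
roi.xywhROI.roiHeight = srcDesc.h;

RppStatus status = rppt_bitwise_or_host(input, input_second, srcDescPtr,
                                        output, dstDescPtr,
                                        &roi, RpptRoiType::XYWH, handle);
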
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index b08c4d5e8..080e4bb3f 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -119,10 +119,12 @@ def func_group_finder(case_number):
         return "color_augmentations"
     elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84:
         return "effects_augmentations"
-    elif case_number < 40:
+    elif case_number < 40 or case_number == 63:
         return "geometric_augmentations"
-    elif case_number == 61:
+    elif case_number < 62:
         return "arithmetic_operations"
+    elif case_number < 69:
+        return "logical_operations"
     elif case_number < 87:
         return "data_exchange_operations"
     elif case_number < 88:
@@ -412,7 +414,7 @@ def rpp_test_suite_parser_and_validator():
     run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList)
 
 # print the results of qa tests
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
 nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
 if qaMode and testType == 0:
@@ -590,9 +592,12 @@ def rpp_test_suite_parser_and_validator():
         "color_augmentations",
         "data_exchange_operations",
         "effects_augmentations",
+        "filter_augmentations",
         "geometric_augmentations",
+        "morphological_operations",
         "arithmetic_operations",
-        "statistical_operations",
+        "logical_operations",
+        "statistical_operations"
     ]
     for log_file in log_file_list:
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin
new file mode 100644
index 000000000..952959dc5
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin
new file mode 100644
index 000000000..2d1076614
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin differ
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 58fee0c5d..01dc160b3 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -91,6 +91,8 @@ std::map<int, std::string> augmentationMap =
     {54, "gaussian_filter"},
     {61, "magnitude"},
     {63, "phase"},
+    {65, "bitwise_and"},
+    {68, "bitwise_or"},
     {70, "copy"},
     {80, "resize_mirror_normalize"},
     {81, "color_jitter"},
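
The inputBitDepth gate shared by cases 65 and 68, together with the dataType dispatch in rppt_tensor_logical_operations.cpp, pins the bitwise ops to same-depth tensor pairs only. A hypothetical predicate capturing that rule (the code-to-depth mapping 0/1/2/5 = U8/F16/F32/I8 is stated here as an assumption about the test suite's conventions, not something this diff spells out):

// True when the test suite's bit-depth code selects a same-depth
// tensor pair supported by rppt_bitwise_and/or on both backends.
bool bitwiseOpSupported(int inputBitDepth)
{
    return (inputBitDepth == 0) ||   // Rpp8u  -> Rpp8u
           (inputBitDepth == 1) ||   // Rpp16f -> Rpp16f
           (inputBitDepth == 2) ||   // Rpp32f -> Rpp32f
           (inputBitDepth == 5);     // Rpp8s  -> Rpp8s
}
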