diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c439d35d..16c4251f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,30 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/). -### RPP 1.5.0 (unreleased) +### RPP 1.8.0 (unreleased) + +### Changes + +* Prerequisites - ROCm install requires only --usecase=rocm +* Use pre-allocated common scratchBufferHip everywhere in Tensor code for scratch HIP memory +* Use CHECK_RETURN_STATUS everywhere to adhere to C++17 for hip +* RPP Tensor Audio support on HOST for Spectrogram +* RPP Tensor Audio support on HOST/HIP for Slice, by modifying voxel slice kernels to now accept anchor and shape params for a more generic version +* RPP Tensor Audio support on HOST for Mel Filter Bank +* RPP Tensor Normalize ND support on HOST and HIP + +### Tested configurations + +* Linux distribution + * Ubuntu - `20.04` / `22.04` + * CentOS - `7` + * RHEL - `8`/`9` +* ROCm: rocm-core - `6.1.0.60100` +* Clang - Version `5.0.1` +* CMake - Version `3.22.3` +* IEEE 754-based half-precision floating-point library - Version `1.12.0` + +### RPP 1.5.0 ### Changes @@ -376,4 +399,4 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r ### Known issues -* `CPU` backend is not enabled \ No newline at end of file +* `CPU` backend is not enabled diff --git a/CMakeLists.txt b/CMakeLists.txt index f5659a8e6..a53e2a0a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,6 +308,7 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) target_include_directories(${PROJECT_NAME} @@ -429,15 +430,12 @@ set(CPACK_RPM_COMPONENT_INSTALL ON) set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${PROJECT_NAME}") set(CPACK_RPM_RUNTIME_PACKAGE_REQUIRES "rocm-core, ${RPP_RPM_PACKAGE_LIST}") set(CPACK_RPM_RUNTIME_PACKAGE_PROVIDES "${PROJECT_NAME}") -set(CPACK_RPM_RUNTIME_PACKAGE_OBSOLETES "${PROJECT_NAME}") set(CPACK_RPM_DEV_PACKAGE_NAME "${PROJECT_NAME}-devel") set(CPACK_RPM_DEV_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}, ${RPP_RPM_DEV_PACKAGE_LIST}") set(CPACK_RPM_DEV_PACKAGE_PROVIDES "${PROJECT_NAME}-devel") -set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "${PROJECT_NAME}-devel") set(CPACK_RPM_TEST_PACKAGE_NAME "${PROJECT_NAME}-test") set(CPACK_RPM_TEST_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}-devel") set(CPACK_RPM_TEST_PACKAGE_PROVIDES "${PROJECT_NAME}-test") -set(CPACK_RPM_TEST_PACKAGE_OBSOLETES "${PROJECT_NAME}-test") set(CPACK_RPM_PACKAGE_LICENSE "MIT" ) # RPM package specific variable for ASAN set(CPACK_RPM_ASAN_PACKAGE_NAME "${PROJECT_NAME}-asan" ) diff --git a/README.md b/README.md index c9349c394..1235faed1 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,14 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened ## Prerequisites * Linux - * **Ubuntu** - `20.04` / `22.04` - * **CentOS** - `7` - * **RedHat** - `8` / `9` - * **SLES** - `15-SP4` + * Ubuntu - `20.04` / `22.04` + * CentOS - `7` + * RedHat - `8` / `9` + * SLES - `15-SP4` * [ROCm supported hardware](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) -* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=graphics,rocm --no-32` +* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=rocm` * Clang Version `5.0.1` and above @@ -74,15 +74,15 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened sudo apt-get install half ``` -> [!NOTE] -> You must use the appropriate package manager for your operating system. - * Compiler with support for C++ Version `17` and above * OpenMP * Threads +> [!NOTE] +> You must use the appropriate package manager for your operating system. + ## Build and install instructions ### Package install diff --git a/docker/rpp-on-ubuntu20.dockerfile b/docker/rpp-on-ubuntu20.dockerfile index 568b427ab..cff7071c3 100644 --- a/docker/rpp-on-ubuntu20.dockerfile +++ b/docker/rpp-on-ubuntu20.dockerfile @@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de wget ${ROCM_INSTALLER_REPO} && \ sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \ sudo apt-get update -y && \ - sudo amdgpu-install -y --usecase=graphics,rocm + sudo amdgpu-install -y --usecase=rocm # install rpp dependencies - half.hpp RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \ unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half diff --git a/docker/rpp-on-ubuntu22.dockerfile b/docker/rpp-on-ubuntu22.dockerfile index b5571d039..dd94379ff 100644 --- a/docker/rpp-on-ubuntu22.dockerfile +++ b/docker/rpp-on-ubuntu22.dockerfile @@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de wget ${ROCM_INSTALLER_REPO} && \ sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \ sudo apt-get update -y && \ - sudo amdgpu-install -y --usecase=graphics,rocm + sudo amdgpu-install -y --usecase=rocm # install rpp dependencies - half.hpp RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \ unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half diff --git a/docs/install/install.rst b/docs/install/install.rst index 7693cc5af..cc0a4a0a1 100644 --- a/docs/install/install.rst +++ b/docs/install/install.rst @@ -22,7 +22,7 @@ Prerequisites * `ROCm supported hardware `_ -* Install ROCm with `amdgpu-install `_ with ``--usecase=graphics,rocm --no-32`` +* Install ROCm with `amdgpu-install `_ with ``--usecase=rocm`` * Clang Version `5.0.1` and above diff --git a/include/rppdefs.h b/include/rppdefs.h index c5861e0e6..28876d7f5 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -134,7 +134,7 @@ typedef enum /*! \brief Out of bound source ROI \ingroup group_rppdefs */ RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17, /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ - RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18, + RPP_ERROR_LAYOUT_MISMATCH = -18, /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ RPP_ERROR_INVALID_CHANNELS = -19 } RppStatus; @@ -369,10 +369,13 @@ typedef enum */ typedef enum { - NCHW, - NHWC, - NCDHW, - NDHWC + NCHW, // BatchSize-Channels-Height-Width + NHWC, // BatchSize-Height-Width-Channels + NCDHW, // BatchSize-Channels-Depth-Height-Width + NDHWC, // BatchSize-Depth-Height-Width-Channels + NHW, // BatchSize-Height-Width + NFT, // BatchSize-Frequency-Time -> Frequency Major used for Spectrogram / MelfilterBank + NTF // BatchSize-Time-Frequency -> Time Major used for Spectrogram / MelfilterBank } RpptLayout; /*! \brief RPPT Tensor 2D ROI type enum @@ -425,6 +428,15 @@ typedef enum REFLECT } RpptAudioBorderType; +/*! 
\brief RPPT Mel Scale Formula + * \ingroup group_rppdefs + */ +typedef enum +{ + SLANEY = 0, // Follows Slaney’s MATLAB Auditory Modelling Work behavior + HTK, // Follows O’Shaughnessy’s book formula, consistent with Hidden Markov Toolkit(HTK), m = 2595 * log10(1 + (f/700)) +} RpptMelScaleFormula; + /*! \brief RPPT Tensor 2D ROI LTRB struct * \ingroup group_rppdefs */ @@ -1024,6 +1036,7 @@ typedef struct Rpp64u* dstBatchIndex; Rpp32u* inc; Rpp32u* dstInc; + hipMemRpp32u scratchBuf; } memGPU; /*! \brief RPP HIP-HOST memory management diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index 1941df0da..31f3e95ef 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -60,7 +60,7 @@ extern "C" { * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32f *detectedIndexTensor, Rpp32f *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); /*! \brief To Decibels augmentation on HOST backend * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values @@ -110,6 +110,47 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, */ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +/*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend + * \details Spectrogram for 1D audio buffer + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [in] centerWindows indicates whether extracted windows should be padded so that the window function is centered at multiples of window_step + * \param [in] reflectPadding indicates the padding policy when sampling outside the bounds of the signal + * \param [in] windowFunction samples of the window function that will be multiplied to each extracted window when calculating the Short Time Fourier Transform (STFT) + * \param [in] nfft size of the FFT + * \param [in] power exponent of the magnitude of the spectrum + * \param [in] windowLength window size in number of samples + * \param [in] windowStep step between the STFT windows in number of samples + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. 
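For orientation, a quick sketch of the output geometry implied by the spectrogram parameters above: the destination is laid out either frequency-major (NFT) or time-major (NTF), and the frequency axis holds nfft/2 + 1 one-sided bins (the mel filter bank kernel later in this diff recovers nfft as (numBins - 1) * 2). The centred frame count below follows the usual STFT convention for centerWindows and is an assumption on my part, not something stated in this header.

```cpp
// Sketch only: output geometry for the spectrogram call above.
// numBins follows from the one-sided FFT; the centred frame count is an
// assumed convention (window centres at multiples of windowStep), not RPP code.
#include <cstdio>

int oneSidedBins(int nfft) { return nfft / 2 + 1; }

int frameCount(int srcLength, int windowLength, int windowStep, bool centerWindows)
{
    return centerWindows ? (srcLength / windowStep) + 1
                         : ((srcLength - windowLength) / windowStep) + 1;
}

int main()
{
    int nfft = 512, windowLength = 512, windowStep = 256, srcLength = 16000;
    int bins = oneSidedBins(nfft);                                      // 257
    int frames = frameCount(srcLength, windowLength, windowStep, true);
    std::printf("NFT dst: %d x %d (freq major), NTF dst: %d x %d (time major)\n",
                bins, frames, frames, bins);
}
```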
+ */ +RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, rppHandle_t rppHandle); + +/*! \brief Mel filter bank augmentation HOST backend + * \details Mel filter bank augmentation for audio data + * \param[in] srcPtr source tensor in HOST memory + * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[out] dstPtr destination tensor in HOST memory + * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT) + * \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) + * \param[in] maxFreq maximum frequency if not provided maxFreq = sampleRate / 2 + * \param[in] minFreq minimum frequency + * \param[in] melFormula formula used to convert frequencies from hertz to mel and from mel to hertz (SLANEY / HTK) + * \param[in] numFilter number of mel filters + * \param[in] sampleRate sampling rate of the audio + * \param[in] normalize boolean variable that determine whether to normalize weights / not + * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDims, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle); + /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data * \param[in] srcPtr source tensor in HOST memory diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index 695c3252d..8e846a41b 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -448,38 +448,42 @@ RppStatus rppt_phase_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc * \details This function performs slice augmentation on a generic 4D tensor. * Slice augmentation involves selecting a region of interest (ROI) from the source tensor * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes. 
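As a side note on the melFormula parameter introduced above, here is a minimal standalone sketch of the two scales it selects between: HTK (m = 2595 * log10(1 + f/700), equivalently 1127 * ln(1 + f/700)) and Slaney (linear below 1 kHz, logarithmic above). The function names are illustrative and not part of the RPP API; the constants match the HtkMelScale and SlaneyMelScale helpers added later in this diff.

```cpp
// Hz <-> mel conversions for the two RpptMelScaleFormula options; sketch only.
#include <cmath>
#include <cstdio>

// HTK / O'Shaughnessy: m = 2595 * log10(1 + f/700) == 1127 * ln(1 + f/700)
double hzToMelHtk(double hz)  { return 1127.0 * std::log(1.0 + hz / 700.0); }
double melToHzHtk(double mel) { return 700.0 * (std::exp(mel / 1127.0) - 1.0); }

// Slaney: linear below 1 kHz (200/3 Hz per mel), logarithmic above with a
// step of ln(6.4)/27 per mel.
double hzToMelSlaney(double hz)
{
    const double fsp = 200.0 / 3.0, minLogHz = 1000.0;
    const double minLogMel = minLogHz / fsp, stepLog = std::log(6.4) / 27.0;
    return (hz < minLogHz) ? hz / fsp : minLogMel + std::log(hz / minLogHz) / stepLog;
}

int main()
{
    std::printf("1 kHz -> %.2f mel (HTK), %.2f mel (Slaney)\n",
                hzToMelHtk(1000.0), hzToMelSlaney(1000.0));     // ~1000.0 and 15.0
    std::printf("round trip (HTK): %.2f Hz\n", melToHzHtk(hzToMelHtk(440.0)));
}
```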
- * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstGenericDescPtr destination tensor descriptor - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions) + * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions) + * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true + * \param [in] enablePadding boolean flag to specify if padding is enabled or not + * \param [in] roiTensor roi data in HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. - * \ingroup group_tensor_geometric */ -RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*! \brief Slice augmentation GPU * \details This function performs slice augmentation on a generic 4D tensor. * Slice augmentation involves selecting a region of interest (ROI) from the source tensor * and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes. 
- * \param[in] srcPtr source tensor in HIP memory - * \param[in] srcGenericDescPtr source tensor descriptor - * \param[out] dstPtr destination tensor in HIP memory - * \param[in] dstGenericDescPtr destination tensor descriptor - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions) + * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions) + * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true + * \param [in] enablePadding boolean flag to specify if padding is enabled or not + * \param [in] roiTensor roi data in pinned/HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. - * \ingroup group_tensor_geometric */ -RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle); #endif // GPU_SUPPORT /*! \brief Crop and Patch augmentation on HOST backend for a NCHW/NHWC layout tensor @@ -539,15 +543,15 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
Support added for f32 -> f32 and u8 -> u8 dataypes. * \image html input150x150x4.gif Sample Input * \image html geometric_augmentations_flip_150x150x4.gif Sample Output - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1) * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in HOST memory, of size batchSize, with depthTensor[i] = 0/1) - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. @@ -562,15 +566,15 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
Support added for f32 -> f32 and u8 -> u8 dataypes. * \image html input150x150x4.gif Sample Input * \image html geometric_augmentations_flip_150x150x4.gif Sample Output - * \param[in] srcPtr source tensor in HIP memory - * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) - * \param[out] dstPtr destination tensor in HIP memory - * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1) * \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with depthTensor[i] = 0/1) - * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) - * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) + * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values) + * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. 
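Recapping the reworked slice interface above: each sample now gets a per-dimension anchor (start index) and shape (length), and fillValue is written wherever enablePadding lets the requested window run past the input. A minimal 1D sketch of that behaviour follows; the no-padding fallback (stop at the input boundary) is my assumption rather than something spelled out in the header.

```cpp
// 1D sketch of anchor/shape slicing with optional padding; illustrative only,
// the RPP kernels operate per batch sample on ND tensors.
#include <cstdio>
#include <vector>

std::vector<float> slice1D(const std::vector<float> &in, int anchor, int shape,
                           bool enablePadding, float fillValue)
{
    std::vector<float> out(shape, fillValue);          // pre-fill with the pad value
    for (int i = 0; i < shape; i++)
    {
        int srcIdx = anchor + i;
        if (srcIdx >= 0 && srcIdx < static_cast<int>(in.size()))
            out[i] = in[srcIdx];
        else if (!enablePadding)
            break;                                     // assumed: no padding, stop at the boundary
    }
    return out;
}

int main()
{
    std::vector<float> samples = {1, 2, 3, 4, 5};
    for (float v : slice1D(samples, 3, 4, true, 0.0f)) // anchor 3, shape 4 -> 4 5 0 0
        std::printf("%g ", v);
    std::printf("\n");
}
```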
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 89418be78..7026b13b4 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -49,7 +49,7 @@ extern "C" { * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HOST memory - * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4) + * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -67,7 +67,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HIP memory - * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4) + * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() @@ -85,7 +85,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HOST memory - * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength = srcDescPtr->n * 4) + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc 
ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -103,7 +103,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HIP memory - * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength = srcDescPtr->n * 4) + * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() @@ -121,7 +121,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HOST memory - * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4) + * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() @@ -139,10 +139,10 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HIP memory - * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4) + * \param [in] 
maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. @@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HOST backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also has toggle variant(NHWC->NCHW) support for 3D. + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HIP backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. 
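Reading the parameter list of rppt_normalize_host above, the per-element operation is (in - mean) / stddev, then scaled and shifted, with bits 0 and 1 of computeMeanStddev asking the library to compute mean and stddev internally. A scalar sketch under that reading (not the RPP kernel, which reduces along axisMask over ND tensors):

```cpp
// Scalar sketch of the normalize semantics described above; illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

void normalizeSketch(std::vector<float> &x, float mean, float stddev,
                     float scale, float shift, unsigned computeMeanStddev)
{
    if (computeMeanStddev & 1u)                        // bit 0: compute mean internally
    {
        float sum = 0.f;
        for (float v : x) sum += v;
        mean = sum / x.size();
    }
    if (computeMeanStddev & 2u)                        // bit 1: compute stddev internally
    {
        float sqSum = 0.f;
        for (float v : x) sqSum += (v - mean) * (v - mean);
        stddev = std::sqrt(sqSum / x.size());
    }
    for (float &v : x)
        v = (v - mean) / stddev * scale + shift;
}

int main()
{
    std::vector<float> data = {1.f, 2.f, 3.f, 4.f};
    normalizeSketch(data, 0.f, 1.f, 1.f, 0.f, 3u);     // 3u: compute both internally
    for (float v : data) std::printf("%f ", v);
    std::printf("\n");
}
```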
+ * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Tensor mean operation on HOST backend for a NCHW/NHWC layout tensor * \details The tensor mean is a reduction operation that finds the channel-wise (R mean / G mean / B mean) and total mean for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
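The channel-wise plus total outputs described above are also why the reduction APIs earlier in this header require a destination of at least n entries for single-channel inputs and n * 4 entries for 3-channel inputs. A tiny sizing helper as a sketch; the per-image slot order is an assumption for illustration only.

```cpp
// Destination sizing for the reduction APIs above: one slot per greyscale image,
// four slots (per-channel results plus the combined value) per 3-channel image.
#include <cstdio>

unsigned reductionOutputLength(unsigned batchSize, unsigned channels)
{
    return (channels == 3) ? batchSize * 4 : batchSize;
}

int main()
{
    std::printf("%u floats for an 8-image RGB batch\n", reductionOutputLength(8, 3)); // 32
    std::printf("%u floats for greyscale\n", reductionOutputLength(8, 1));            // 8
}
```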
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). @@ -229,4 +272,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H +#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H \ No newline at end of file diff --git a/include/third_party/ffts/ffts.h b/include/third_party/ffts/ffts.h new file mode 100644 index 000000000..cc85a885b --- /dev/null +++ b/include/third_party/ffts/ffts.h @@ -0,0 +1,110 @@ +/* + + This file is part of FFTS. + + Copyright (c) 2012, Anthony M. Blake + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef FFTS_H +#define FFTS_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if (defined(_WIN32) || defined(WIN32)) && defined(FFTS_SHARED) +# ifdef FFTS_BUILD +# define FFTS_API __declspec(dllexport) +# else +# define FFTS_API __declspec(dllimport) +# endif +#else +# if (__GNUC__ >= 4) || defined(HAVE_GCC_VISIBILITY) +# define FFTS_API __attribute__ ((visibility("default"))) +# else +# define FFTS_API +# endif +#endif + +/* The direction of the transform + (i.e, the sign of the exponent in the transform.) +*/ +#define FFTS_FORWARD (-1) +#define FFTS_BACKWARD (+1) + +struct _ffts_plan_t; +typedef struct _ffts_plan_t ffts_plan_t; + +/* Complex data is stored in the interleaved format + (i.e, the real and imaginary parts composing each + element of complex data are stored adjacently in memory) + + The multi-dimensional arrays passed are expected to be + stored as a single contiguous block in row-major order +*/ +FFTS_API ffts_plan_t* +ffts_init_1d(size_t N, int sign); + +FFTS_API ffts_plan_t* +ffts_init_2d(size_t N1, size_t N2, int sign); + +FFTS_API ffts_plan_t* +ffts_init_nd(int rank, size_t *Ns, int sign); + +/* For real transforms, sign == FFTS_FORWARD implies a real-to-complex + forwards tranform, and sign == FFTS_BACKWARD implies a complex-to-real + backwards transform. 
+ + The output of a real-to-complex transform is N/2+1 complex numbers, + where the redundant outputs have been omitted. +*/ +FFTS_API ffts_plan_t* +ffts_init_1d_real(size_t N, int sign); + +FFTS_API ffts_plan_t* +ffts_init_2d_real(size_t N1, size_t N2, int sign); + +FFTS_API ffts_plan_t* +ffts_init_nd_real(int rank, size_t *Ns, int sign); + +FFTS_API void +ffts_execute(ffts_plan_t *p, const void *input, void *output); + +FFTS_API void +ffts_free(ffts_plan_t *p); + +#ifdef __cplusplus +} +#endif + +#endif /* FFTS_H */ diff --git a/include/third_party/ffts/ffts_attributes.h b/include/third_party/ffts/ffts_attributes.h new file mode 100644 index 000000000..bdfd6162f --- /dev/null +++ b/include/third_party/ffts/ffts_attributes.h @@ -0,0 +1,111 @@ +/* + + This file is part of FFTS -- The Fastest Fourier Transform in the South + + Copyright (c) 2012, Anthony M. Blake + Copyright (c) 2012, The University of Waikato + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the organization nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
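For anyone unfamiliar with the vendored FFTS headers above, a minimal usage sketch of a 1D real-to-complex forward transform, using only the calls declared in ffts.h and linking against the bundled libffts.a. The 32-byte buffer alignment is a precaution on my part (FFTS uses SIMD loads) and is not documented in this header.

```cpp
// Minimal FFTS usage sketch: 1D real-to-complex forward transform.
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include "ffts.h"

int main()
{
    const size_t n = 16;                       // number of real input samples
    const double kPi = 3.14159265358979323846;

    float *in = nullptr, *out = nullptr;       // out holds n/2 + 1 interleaved re/im pairs
    if (posix_memalign(reinterpret_cast<void **>(&in),  32, n * sizeof(float)) ||
        posix_memalign(reinterpret_cast<void **>(&out), 32, (n / 2 + 1) * 2 * sizeof(float)))
        return 1;

    for (size_t i = 0; i < n; i++)             // one cycle of a cosine: energy lands in bin 1
        in[i] = static_cast<float>(std::cos(2.0 * kPi * i / n));

    ffts_plan_t *plan = ffts_init_1d_real(n, FFTS_FORWARD);
    if (plan)
    {
        ffts_execute(plan, in, out);
        for (size_t k = 0; k <= n / 2; k++)
            std::printf("bin %zu: %+.3f %+.3fi\n", k, out[2 * k], out[2 * k + 1]);
        ffts_free(plan);
    }
    free(in);
    free(out);
}
```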
+ +*/ + +#ifndef FFTS_ATTRIBUTES_H +#define FFTS_ATTRIBUTES_H + +#if defined (_MSC_VER) && (_MSC_VER >= 1020) +#pragma once +#endif + +/* Macro definitions for various function/variable attributes */ +#ifdef __GNUC__ +#define GCC_VERSION_AT_LEAST(x,y) \ + (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y) +#else +#define GCC_VERSION_AT_LEAST(x,y) 0 +#endif + +#ifdef __GNUC__ +#define FFTS_ALIGN(x) __attribute__((aligned(x))) +#elif defined(_MSC_VER) +#define FFTS_ALIGN(x) __declspec(align(x)) +#else +#define FFTS_ALIGN(x) +#endif + +#if GCC_VERSION_AT_LEAST(3,1) +#define FFTS_ALWAYS_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define FFTS_ALWAYS_INLINE __forceinline +#else +#define FFTS_ALWAYS_INLINE inline +#endif + +#if defined(_MSC_VER) +#define FFTS_INLINE __inline +#else +#define FFTS_INLINE inline +#endif + +#if defined(__GNUC__) +#define FFTS_RESTRICT __restrict +#elif defined(_MSC_VER) +#define FFTS_RESTRICT __restrict +#else +#define FFTS_RESTRICT +#endif + +#if GCC_VERSION_AT_LEAST(4,5) +#define FFTS_ASSUME(cond) do { if (!(cond)) __builtin_unreachable(); } while (0) +#elif defined(_MSC_VER) +#define FFTS_ASSUME(cond) __assume(cond) +#else +#define FFTS_ASSUME(cond) +#endif + +#if GCC_VERSION_AT_LEAST(4,7) +#define FFTS_ASSUME_ALIGNED_16(x) __builtin_assume_aligned(x, 16) +#else +#define FFTS_ASSUME_ALIGNED_16(x) x +#endif + +#if GCC_VERSION_AT_LEAST(4,7) +#define FFTS_ASSUME_ALIGNED_32(x) __builtin_assume_aligned(x, 32) +#else +#define FFTS_ASSUME_ALIGNED_32(x) x +#endif + +#if defined(__GNUC__) +#define FFTS_LIKELY(cond) __builtin_expect(!!(cond), 1) +#else +#define FFTS_LIKELY(cond) cond +#endif + +#if defined(__GNUC__) +#define FFTS_UNLIKELY(cond) __builtin_expect(!!(cond), 0) +#else +#define FFTS_UNLIKELY(cond) cond +#endif + +#endif /* FFTS_ATTRIBUTES_H */ diff --git a/libs/third_party/ffts/libffts.a b/libs/third_party/ffts/libffts.a new file mode 100644 index 000000000..b30b11471 Binary files /dev/null and b/libs/third_party/ffts/libffts.a differ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index a6d58d5a2..5f8048a19 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -6123,41 +6123,41 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2 inline void compute_variance_8_host(__m256d *p1, __m256d *pMean, __m256d *pVar) { __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]); - pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); pSub = _mm256_sub_pd(p1[1], pMean[0]); - pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]); + pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]); } -inline void compute_varianceChannel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +inline void compute_variance_channel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) { __m256d pSub = _mm256_sub_pd(p1[0], pMeanR[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[1], pMeanR[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[2], pMeanG[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = 
_mm256_sub_pd(p1[3], pMeanG[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(p1[4], pMeanB[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); pSub = _mm256_sub_pd(p1[5], pMeanB[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); } -inline void compute_varianceImage_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) +inline void compute_variance_image_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB) { __m256d pSub = _mm256_sub_pd(p1[0], pMean[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[1], pMean[0]); - pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]); + pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]); pSub = _mm256_sub_pd(p1[2], pMean[0]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(pMean[0], p1[3]); - pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]); + pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]); pSub = _mm256_sub_pd(p1[4], pMean[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); pSub = _mm256_sub_pd(p1[5], pMean[0]); - pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]); + pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]); } inline void compute_vignette_48_host(__m256 *p, __m256 &pMultiplier, __m256 &pILocComponent, __m256 &pJLocComponent) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index d47193831..fd9373669 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -2611,6 +2611,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x) return _mm_cvtss_f32(sum); } +/* Computes inverse square root */ +inline Rpp32f rpp_rsqrt_ps(Rpp32f x) +{ + __m128 X = _mm_set_ss(x); + __m128 tmp = _mm_rsqrt_ss(X); + Rpp32f y = _mm_cvtss_f32(tmp); + return y * (1.5f - x * 0.5f * y * y); +} + +/* Compute inverse square root */ +/* SSE matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul) +{ + Rpp64s i = 0; + __m128 rdivx4 = _mm_set1_ps(rdiv); + __m128 mulx4 = _mm_set1_ps(mul * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m128 epsx4 = _mm_set1_ps(eps); + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + x = _mm_add_ps(x, epsx4); + __m128 y = _mm_rsqrt_ps(x); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + __m128 mask = _mm_cmpneq_ps(x, xmm_p0); + __m128 y = _mm_rsqrt_ps(x); + y = _mm_and_ps(y, mask); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } 
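+    /* The vector paths above refine the hardware rsqrt estimate with one
+       Newton-Raphson step, y1 = y0 * (3 - x * y0 * y0) / 2; the 1/2 factor is
+       pre-folded into mulx4 together with the requested output multiplier,
+       which is why the SSE/AVX results match the scalar reference to about
+       6 decimal places. The scalar tail below handles the remaining
+       (numElements % 4) values. */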
+ if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? rpp_rsqrt_ps(x) * mul : 0; + } + } +} + +/* Compute inverse square root */ +/* AVX2 matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale) +{ + Rpp32s i = 0; + __m256 rdivx8 = _mm256_set1_ps(rdiv); + __m256 mulx8 = _mm256_set1_ps(scale * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m256 epsx8 = _mm256_set1_ps(eps); + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + x = _mm256_add_ps(x, epsx8); + __m256 y = _mm256_rsqrt_ps(x); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ); + __m256 y = _mm256_rsqrt_ps(x); + y = _mm256_and_ps(y, mask); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? 
rpp_rsqrt_ps(x) * scale : 0; + } + } +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt index 1f47e8ee7..4338483a9 100644 --- a/src/modules/CMakeLists.txt +++ b/src/modules/CMakeLists.txt @@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP") set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add HIP specific preprocessor flags add_definitions(-DHIP_COMPILE) @@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL") list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP}) message("-- ${Green}OpenCL kernels added!${ColourReset}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add OpenCL specific preprocessor flags add_definitions(-DOCL_COMPILE) @@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU") # Add CPU specific includes set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/) set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs endif() message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}") add_compile_options("-Wno-unused-result") diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp index 0c6ccf211..82d43d082 100644 --- a/src/modules/cpu/host_tensor_audio_augmentations.hpp +++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp @@ -29,6 +29,8 @@ SOFTWARE. #include "kernel/to_decibels.hpp" #include "kernel/pre_emphasis_filter.hpp" #include "kernel/down_mixing.hpp" +#include "kernel/spectrogram.hpp" +#include "kernel/mel_filter_bank.hpp" #include "kernel/resample.hpp" #endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index 15a4245c5..0a7e8ff51 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/tensor_max.hpp" #include "kernel/tensor_mean.hpp" #include "kernel/tensor_stddev.hpp" +#include "kernel/normalize.hpp" -#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/mel_filter_bank.hpp b/src/modules/cpu/kernel/mel_filter_bank.hpp new file mode 100644 index 000000000..9cc6d26d2 --- /dev/null +++ b/src/modules/cpu/kernel/mel_filter_bank.hpp @@ -0,0 +1,252 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +struct BaseMelScale +{ + public: + virtual Rpp32f hz_to_mel(Rpp32f hz) = 0; + virtual Rpp32f mel_to_hz(Rpp32f mel) = 0; + virtual ~BaseMelScale() = default; +}; + +struct HtkMelScale : public BaseMelScale +{ + Rpp32f hz_to_mel(Rpp32f hz) { return 1127.0f * std::log(1.0f + (hz / 700.0f)); } + Rpp32f mel_to_hz(Rpp32f mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); } + public: + ~HtkMelScale() {}; +}; + +struct SlaneyMelScale : public BaseMelScale +{ + const Rpp32f freqLow = 0; + const Rpp32f fsp = 200.0 / 3.0; + const Rpp32f minLogHz = 1000.0; + const Rpp32f minLogMel = (minLogHz - freqLow) / fsp; + const Rpp32f stepLog = 0.068751777; // Equivalent to std::log(6.4) / 27.0; + + const Rpp32f invMinLogHz = 1.0f / 1000.0; + const Rpp32f invStepLog = 1.0f / stepLog; + const Rpp32f invFsp = 1.0f / fsp; + + Rpp32f hz_to_mel(Rpp32f hz) + { + Rpp32f mel = 0.0f; + if (hz >= minLogHz) + mel = minLogMel + std::log(hz * invMinLogHz) * invStepLog; + else + mel = (hz - freqLow) * invFsp; + + return mel; + } + + Rpp32f mel_to_hz(Rpp32f mel) + { + Rpp32f hz = 0.0f; + if (mel >= minLogMel) + hz = minLogHz * std::exp(stepLog * (mel - minLogMel)); + else + hz = freqLow + mel * fsp; + return hz; + } + public: + ~SlaneyMelScale() {}; +}; + +RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + Rpp32f maxFreqVal, // check unused + Rpp32f minFreqVal, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rpp::Handle& handle) +{ + BaseMelScale *melScalePtr; + switch(melFormula) + { + case RpptMelScaleFormula::HTK: + melScalePtr = new HtkMelScale; + break; + case RpptMelScaleFormula::SLANEY: + default: + melScalePtr = new SlaneyMelScale(); + break; + } + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u batchSize = srcDescPtr->n; + Rpp32f *scratchMem = handle.GetInitHandle()->mem.mcpu.scratchBufferHost; + + Rpp32f maxFreq = sampleRate / 2; + Rpp32f minFreq = minFreqVal; + + // Convert lower, higher frequencies to mel scale and find melStep + Rpp64f melLow = melScalePtr->hz_to_mel(minFreq); + Rpp64f melHigh = melScalePtr->hz_to_mel(maxFreq); + Rpp64f melStep = (melHigh - melLow) / (numFilter + 1); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32f *srcPtrTemp = 
srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + + // Extract nfft, number of Frames, numBins + Rpp32s nfft = (srcDimsTensor[batchCount * 2] - 1) * 2; + Rpp32s numBins = nfft / 2 + 1; + Rpp32s numFrames = srcDimsTensor[batchCount * 2 + 1]; + + // Find hzStep + Rpp64f hzStep = static_cast(sampleRate) / nfft; + Rpp64f invHzStep = 1.0 / hzStep; + + // Find fftBinStart and fftBinEnd + Rpp32s fftBinStart = std::ceil(minFreq * invHzStep); + Rpp32s fftBinEnd = std::ceil(maxFreq * invHzStep); + fftBinEnd = std::min(fftBinEnd, numBins); + + // Set/Fill normFactors, weightsDown and intervals + Rpp32f *normFactors = scratchMem + (batchCount * numFilter); + std::fill(normFactors, normFactors + numFilter, 1.f); // normFactors contain numFilter values of type float + Rpp32f *weightsDown = scratchMem + (batchSize * numFilter) + (batchCount * numBins); + memset(weightsDown, 0, sizeof(numBins * sizeof(Rpp32f))); // weightsDown contain numBins values of type float + Rpp32s *intervals = reinterpret_cast(weightsDown + (batchSize * numBins)); + std::fill(intervals, intervals + numBins, -1); // intervals contain numBins values of type integer + + Rpp32s fftBin = fftBinStart; + Rpp64f mel0 = melLow, mel1 = melLow + melStep; + Rpp64f fIter = fftBin * hzStep; + for (int interval = 0; interval < numFilter + 1; interval++, mel0 = mel1, mel1 += melStep) + { + Rpp64f f0 = melScalePtr->mel_to_hz(mel0); + Rpp64f f1 = melScalePtr->mel_to_hz(interval == numFilter ? melHigh : mel1); + Rpp64f slope = 1. / (f1 - f0); + + if (normalize && interval < numFilter) + { + Rpp64f f2 = melScalePtr->mel_to_hz(mel1 + melStep); + normFactors[interval] = 2.0 / (f2 - f0); + } + + for (; fftBin < fftBinEnd && fIter < f1; fftBin++, fIter = fftBin * hzStep) + { + weightsDown[fftBin] = (f1 - fIter) * slope; + intervals[fftBin] = interval; + } + } + + Rpp32u maxFrames = std::min(static_cast(numFrames + 8), dstDescPtr->strides.hStride); + Rpp32u maxAlignedLength = maxFrames & ~7; + Rpp32u vectorIncrement = 8; + + // Set ROI values in dst buffer to 0.0 + for(int i = 0; i < numFilter; i++) + { + Rpp32f *dstPtrRow = dstPtrTemp + i * dstDescPtr->strides.hStride; + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < maxAlignedLength; vectorLoopCount += 8) + { + _mm256_storeu_ps(dstPtrRow, avx_p0); + dstPtrRow += 8; + } + for(; vectorLoopCount < maxFrames; vectorLoopCount++) + *dstPtrRow++ = 0.0f; + } + + Rpp32u alignedLength = numFrames & ~7; + __m256 pSrc, pDst; + Rpp32f *srcRowPtr = srcPtrTemp + fftBinStart * srcDescPtr->strides.hStride; + for (int64_t fftBin = fftBinStart; fftBin < fftBinEnd; fftBin++) + { + auto filterUp = intervals[fftBin]; + auto weightUp = 1.0f - weightsDown[fftBin]; + auto filterDown = filterUp - 1; + auto weightDown = weightsDown[fftBin]; + + if (filterDown >= 0) + { + Rpp32f *dstRowPtrTemp = dstPtrTemp + filterDown * dstDescPtr->strides.hStride; + Rpp32f *srcRowPtrTemp = srcRowPtr; + + if (normalize) + weightDown *= normFactors[filterDown]; + __m256 pWeightDown = _mm256_set1_ps(weightDown); + + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + pSrc = _mm256_loadu_ps(srcRowPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeightDown); + pDst = _mm256_loadu_ps(dstRowPtrTemp); + pDst = _mm256_add_ps(pDst, pSrc); + _mm256_storeu_ps(dstRowPtrTemp, pDst); + dstRowPtrTemp += vectorIncrement; + srcRowPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < numFrames; vectorLoopCount++) + 
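+                    // scalar tail: frames left over after the 8-wide AVX loop above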
(*dstRowPtrTemp++) += weightDown * (*srcRowPtrTemp++); + } + + if (filterUp >= 0 && filterUp < numFilter) + { + Rpp32f *dstRowPtrTemp = dstPtrTemp + filterUp * dstDescPtr->strides.hStride; + Rpp32f *srcRowPtrTemp = srcRowPtr; + + if (normalize) + weightUp *= normFactors[filterUp]; + __m256 pWeightUp = _mm256_set1_ps(weightUp); + + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + pSrc = _mm256_loadu_ps(srcRowPtrTemp); + pSrc = _mm256_mul_ps(pSrc, pWeightUp); + pDst = _mm256_loadu_ps(dstRowPtrTemp); + pDst = _mm256_add_ps(pDst, pSrc); + _mm256_storeu_ps(dstRowPtrTemp, pDst); + dstRowPtrTemp += vectorIncrement; + srcRowPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < numFrames; vectorLoopCount++) + (*dstRowPtrTemp++) += weightUp * (*srcRowPtrTemp++); + } + + srcRowPtr += srcDescPtr->strides.hStride; + } + } + delete melScalePtr; + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/non_silent_region_detection.hpp b/src/modules/cpu/kernel/non_silent_region_detection.hpp index 74dffb18e..39d9e6940 100644 --- a/src/modules/cpu/kernel/non_silent_region_detection.hpp +++ b/src/modules/cpu/kernel/non_silent_region_detection.hpp @@ -95,8 +95,8 @@ Rpp32f getSquare(Rpp32f &value) RppStatus non_silent_region_detection_host_tensor(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, - Rpp32f *detectedIndexTensor, - Rpp32f *detectionLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp new file mode 100644 index 000000000..dbe746d1a --- /dev/null +++ b/src/modules/cpu/kernel/normalize.hpp @@ -0,0 +1,882 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +// Computes strides +void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + +// Recursive reduction helper function to compute difference of input with mean and squares them up +template +void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_diff_square_sum(tmp1, input, inputStride, currElements, mean); + + // reduce second half and accumulate + compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + { + Rpp32f curr = (input[i * inputStride] - mean); + auto curSq = curr * curr; + tmp += curSq; + } + + // accumulate in target value + output += tmp; + } +} + +// Recursive reduction helper function to sum up input values +template +void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_sum(tmp1, input, inputStride, currElements); + + // reduce second half and accumulate + compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + tmp += input[i * inputStride]; + + // accumulate in target value + output += tmp; + } +} + +// Computes mean for 2D inputs +void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = 1.0 / dims[1]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]); + srcPtrTemp += stride[1]; + meanPtr[i] *= normFactor; + } +} + +// Computes inverse stddev for 2D inputs +void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale) +{ + + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]); + srcPtrTemp += stride[1]; + } + rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale); +} + +// Computes mean for 3D inputs +void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = 1.0 / dims[2]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + meanPtr[index] = 0; + compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + meanPtr[index] *= normFactor; + } + srcPtrTemp += stride[2]; + } + } + else + { + Rpp32f normFactor = 1.0 / (dims[1] * dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; 
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]);
+                srcPtrRow += stride[1];
+            }
+            meanPtr[i] *= normFactor;
+            srcPtrTemp += stride[2];
+        }
+    }
+}
+
+// Computes inverse stddev for 3D inputs
+void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true)
+{
+    Rpp32f *srcPtrTemp = srcPtr;
+    if(isConsecutive)
+    {
+        Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]);
+        for(Rpp32u i = 0; i < dims[0]; i++)
+        {
+            float *srcPtrRow = srcPtrTemp;
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                Rpp32u index = i * dims[1] + j;
+                stdDevPtr[index] = 0;
+                compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]);
+                srcPtrRow += stride[1];
+            }
+            srcPtrTemp += stride[2];
+        }
+        rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale);
+    }
+    else
+    {
+        Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2]));
+        for(Rpp32u i = 0; i < dims[0]; i++)
+        {
+            stdDevPtr[i] = 0;
+            Rpp32f *srcPtrRow = srcPtrTemp;
+            for(Rpp32u j = 0; j < dims[1]; j++)
+            {
+                compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]);
+                srcPtrRow += stride[1];
+            }
+            srcPtrTemp += stride[2];
+        }
+        rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale);
+    }
+}
+
+// Computes mean for ND inputs
+template <typename T>
+void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeSum when last dimension is to be normalized
+        compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]);
+    else if(level == tensorDim) // Calls computeSum when only 1st axis need to be normalized
+        compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]);
+    else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean
+    {
+        for(Rpp32u i = 0; i < dims[level]; i++)
+            compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+    }
+    else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized
+        compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+    else if(axis[level]) // Called when axis at present level needs to be normalized
+    {
+        for(Rpp32u i = 0; i < dims[level]; i++)
+            compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+    }
+}
+
+// Computes inverse stddev for ND inputs
+template <typename T>
+void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeDiffSumSquare when last dimension is to be normalized
+        compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]);
+    else if(level == tensorDim) // Calls computeDiffSumSquare when only 1st axis need to be normalized
+        compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]);
+    else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store
stddev + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes normalize for 3D non toggle variants +void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32s paramIdx = 0; + Rpp32s idx1 = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr; + Rpp32f *dstPtrRow = dstPtr; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + idx1 = paramIdx; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + if(k < length[2] - 1) + paramIdx += paramStride[2]; + srcPtrRowTemp++; + dstPtrRowTemp++; + } + if(j < length[1] - 1) + paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + if(i < length[0] - 1) + paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0]; + srcPtr += srcGenericDescPtr->strides[1]; + dstPtr += dstGenericDescPtr->strides[1]; + } +} + +// Computes normalize for 3D toggle variants when axis mask is set to 3 +void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp[length[2]]; + dstPtrTemp[0] = dstPtr; + for(Rpp32u i = 1; i < length[2]; i++) + dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1]; + Rpp32s paramIdx = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] = dstPtrTemp[l]; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRowTemp[l] = dstPtrRow[l]; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + paramIdx += paramStride[2]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] += dstGenericDescPtr->strides[3]; + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrTemp[l] += dstGenericDescPtr->strides[2]; + } +} + +// Computes normalize for 3D non toggle variants, optimized with AVX when axis mask set to 3 and 16 channel normalize +void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length) +{ + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (bufferLength / 16) * 16; + Rpp32u outerDim = length[0]; + + // set shift, mean and stddev + __m256 pShift = _mm256_set1_ps(shift); + __m256 pMean1 = _mm256_loadu_ps(meanPtr); + __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8); + __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr); + __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8); + + for(Rpp32u i = 0; i < outerDim; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1]; + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp); + __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8); + __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift); + __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst1); + _mm256_storeu_ps(dstPtrTemp + 8, pDst2); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + } +} + +// Computes normalize for ND non toggle variants for i8 dataype +void normalize_ND_tensor_nontoggle(Rpp32s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtr++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? 
idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for ND non toggle variants +template +void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + T1 *srcPtrTemp = srcPtr; + T2 *dstPtrTemp = dstPtr; + + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtrTemp++; + dstPtrTemp++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for 2D +void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr, + Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride) +{ + if (paramStride[1]) // Optimized with AVX when axis mask set to 2 + { + Rpp32u vectorIncrement = 8; + Rpp32u bufferLength = dims[1]; + Rpp32u alignedLength = (bufferLength / 8) * 8; + + __m256 pShift = _mm256_set1_ps(shift); + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1]; + + // set mean and stddev + Rpp32f mean = meanPtr[i]; + Rpp32f invStdDev = invStdDevPtr[i]; + __m256 pMean, pInvStdDev; + pMean = _mm256_set1_ps(mean); + pInvStdDev = _mm256_set1_ps(invStdDev); + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_loadu_ps(srcPtrTemp); + __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for(; vectorLoopCount < dims[1] ; vectorLoopCount ++) + *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift; + } + } + else + { + Rpp32s paramIdx = 0; + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp = dstPtr; + for(Rpp32u j = 0; j < dims[1]; j++) + { + *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift; + paramIdx += paramStride[0]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtr += srcDescPtr->strides[1]; + dstPtr += dstDescPtr->strides[1]; + } + } +} + +// Performs collapse axis operation wherein continuous axis that require normalization are combined together +void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis) +{ + int skipped = 0, prev = -2, k = 0; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(axis[i]) + { + int temp = i - skipped; + if(temp != prev + 1) + { + newAxis[k] = 1; + newDims[k] = length[i]; + prev = i; + k++; + } + else if(prev >= 0) + { + newDims[prev] *= length[i]; + skipped++; + } + } + else + { + newDims[k] = length[i]; + k++; + } + } + *tensorDim -= skipped; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(newAxis[i]) + *lastNormAxis = i; + } +} + +RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + // Compute maxSize as length of input tensors differ based on axisMask and tensorDims + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 
1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + Rpp32f *srcPtrChannel = srcPtrTemp; + + if(tensorDims == 2) // Called for audio testcase and for any other 2D case + { + Rpp32u paramStride[2]; + Rpp32u srcReductionDims[2], srcStride[2]; + if (axisMask == 3) + { + srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0] * length[1]; + paramStride[0] = paramStride[1] = 0; + } + else if (axisMask == 1) + { + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + paramStride[0] = 1; + paramStride[1] = 0; + } + else if (axisMask == 2) + { + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + paramStride[0] = 0; + paramStride[1] = 1; + } + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale); + + normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride); + } + else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel + { + Rpp32u paramStride[3]; + Rpp32u srcReductionDims[3], srcStride[3]; + Rpp32u reductionDims; + bool isConsecutive = true; + switch(axisMask) + { + case 1: // Normalize axes 0 + { + reductionDims = length[1] * length[2]; + paramStride[0] = 0; + paramStride[1] = paramStride[2] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0]; + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[2]; + break; + } + case 2: // Normalize axes 1 + { + reductionDims = length[0] * length[2]; + paramStride[1] = 0; + paramStride[0] = paramStride[2] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 3: // Normalize axes 0, 1 + { + reductionDims = length[2]; + paramStride[0] = paramStride[1] = 0; + paramStride[2] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0] * length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 4: // Normalize across 2 
+ { + reductionDims = length[0] * length[1]; + paramStride[2] = 0; + paramStride[0] = paramStride[1] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 5: // Normalize across 0, 2 + { + reductionDims = length[1]; + paramStride[0] = paramStride[2] = 0; + paramStride[1] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[2]; + isConsecutive = false; + break; + } + case 6: // Normalize across 1, 2 + { + reductionDims = length[0]; + paramStride[1] = paramStride[2] = 0; + paramStride[0] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[1] * length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 7: // Normalize across 0, 1, 2 + { + reductionDims = 1; + paramStride[0] = paramStride[1] = paramStride[2] = 0; + srcReductionDims[0] = 1; + srcReductionDims[1] = 1; + srcReductionDims[2] = length[0] * length[1] * length[2]; + srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + default: + { + std::cout<<"Invalid Axis mask"<strides[i]; + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive); + + if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16)) + normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length); + else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + } + else // Handle any other ND tensor is passed to kernel + { + // Compute length of input tensors as they differ based on axisMask and tensorDims + int size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i]; + + Rpp32u totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, tensorDims * sizeof(Rpp32u)); + memset(newDims, 0, tensorDims * sizeof(Rpp32u)); + + for(Rpp32u i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(Rpp32u i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + } + + return RPP_SUCCESS; +} +template +RppStatus normalize_generic_host_tensor(T1 *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T2 *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + int size = 1; + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : length[i]; + + T1 *srcPtrTemp; + T2 *dstPtrTemp; + Rpp32f *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + T1 *srcPtrChannel = srcPtrTemp; + + int totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, sizeof(newAxis)); + memset(newDims, 0, sizeof(newDims)); + + for(int i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(int i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/pre_emphasis_filter.hpp b/src/modules/cpu/kernel/pre_emphasis_filter.hpp index 1d25921ad..889cd2dec 100644 --- a/src/modules/cpu/kernel/pre_emphasis_filter.hpp +++ b/src/modules/cpu/kernel/pre_emphasis_filter.hpp @@ -50,7 +50,7 @@ RppStatus pre_emphasis_filter_host_tensor(Rpp32f *srcPtr, dstPtrTemp[0] = srcPtrTemp[0] - coeff * border; Rpp32s vectorIncrement = 8; - Rpp32s alignedLength = (bufferLength / 8) * 8; + Rpp32s alignedLength = (bufferLength / 8) * 8 - 8; __m256 pCoeff = _mm256_set1_ps(coeff); Rpp32s vectorLoopCount = 1; diff --git a/src/modules/cpu/kernel/slice.hpp b/src/modules/cpu/kernel/slice.hpp index c451b67b4..37c3097c9 100644 --- a/src/modules/cpu/kernel/slice.hpp +++ b/src/modules/cpu/kernel/slice.hpp @@ -26,184 +26,213 @@ SOFTWARE. 
#include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -RppStatus slice_f32_f32_host_tensor(Rpp32f *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp32f *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) +template +RppStatus slice_host_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T* fillValue, + bool enablePadding, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) { - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + T *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; - Rpp32f *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + T *srcPtrChannel, *dstPtrChannel; + dstPtrChannel = dstPtrTemp; - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; - Rpp32f *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // get the starting address of length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + if (numDims == 4) { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - for(int c = 0; c < layoutParams.channelParam; c++) + // order of dims + Rpp32s dimsOrder[3]; + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + } + else { - Rpp32f *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; + dimsOrder[0] = 0; // depth + dimsOrder[1] = 1; // height + dimsOrder[2] = 2; // width + } + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + 
Rpp32u copyLengthInBytes = bufferLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + // slice without fused output-layout toggle (NCDHW -> NCDHW) + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * srcGenericDescPtr->strides[3]) + (anchor[3] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) + { + T *srcPtrDepth, *dstPtrDepth; + srcPtrDepth = srcPtrChannel; + dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) + { + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrDepth; + dstPtrRow = dstPtrDepth; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[3]; + dstPtrRow += dstGenericDescPtr->strides[3]; + } + srcPtrDepth += srcGenericDescPtr->strides[2]; + dstPtrDepth += dstGenericDescPtr->strides[2]; + } + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; + } + } - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NDHWC -> NDHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + T *srcPtrDepth = srcPtrChannel; + T *dstPtrDepth = dstPtrChannel; + for(int i = 0; i < maxDepth; i++) { - Rpp32f *srcPtrRow, *dstPtrRow; + T *srcPtrRow, *dstPtrRow; srcPtrRow = srcPtrDepth; dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + for(int j = 0; j < maxHeight; j++) { memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrDepth += srcGenericDescPtr->strides[1]; + dstPtrDepth += dstGenericDescPtr->strides[1]; } - - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + else if (numDims == 3) { - Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f); - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp32f *srcPtrDepth = srcPtrChannel; - Rpp32f *dstPtrDepth = dstPtrChannel; - - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // order of dims + Rpp32s dimsOrder[2]; + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp32f *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) - { - memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); - - srcPtrRow += 
srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; - } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + } + else + { + dimsOrder[0] = 0; // height + dimsOrder[1] = 1; // width } - } - } - - return RPP_SUCCESS; -} - -RppStatus slice_u8_u8_host_tensor(Rpp8u *srcPtr, - RpptGenericDescPtr srcGenericDescPtr, - Rpp8u *dstPtr, - RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, - RppLayoutParams layoutParams, - rpp::Handle& handle) -{ - RpptROI3D roiDefault; - if(srcGenericDescPtr->layout==RpptLayout::NCDHW) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]}; - else if(srcGenericDescPtr->layout==RpptLayout::NDHWC) - roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]}; - Rpp32u numThreads = handle.GetNumThreads(); - - omp_set_dynamic(0); -#pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) - { - RpptROI3D roi; - RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount]; - compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType); - Rpp8u *srcPtrImage, *dstPtrImage; - srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0]; - dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier; + Rpp32u copyLengthInBytes = bufferLength * sizeof(T); - Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier; - Rpp8u *srcPtrChannel, *dstPtrChannel; - dstPtrChannel = dstPtrImage; + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); - // Slice without fused output-layout toggle (NCDHW -> NCDHW) - if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - for(int c = 0; c < layoutParams.channelParam; c++) + // slice without fused output-layout toggle (NCHW -> NCHW) + if (dstGenericDescPtr->layout == RpptLayout::NCHW) { - Rpp8u *srcPtrDepth, *dstPtrDepth; - srcPtrDepth = srcPtrChannel; - dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier); + for(int c = 0; c < layoutParams.channelParam; c++) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + T *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * 
sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[3]; - dstPtrRow += dstGenericDescPtr->strides[3]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; } - srcPtrDepth += srcGenericDescPtr->strides[2]; - dstPtrDepth += dstGenericDescPtr->strides[2]; + srcPtrChannel += srcGenericDescPtr->strides[1]; + dstPtrChannel += srcGenericDescPtr->strides[1]; } - srcPtrChannel += srcGenericDescPtr->strides[1]; - dstPtrChannel += srcGenericDescPtr->strides[1]; } - } - // Slice without fused output-layout toggle (NDHWC -> NDHWC) - else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) - { - srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier); - Rpp8u *srcPtrDepth = srcPtrChannel; - Rpp8u *dstPtrDepth = dstPtrChannel; - for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++) + // slice without fused output-layout toggle (NHWC -> NHWC) + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) { - Rpp8u *srcPtrRow, *dstPtrRow; - srcPtrRow = srcPtrDepth; - dstPtrRow = dstPtrDepth; - - for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++) + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * layoutParams.bufferMultiplier); + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) { - memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u)); - srcPtrRow += srcGenericDescPtr->strides[2]; - dstPtrRow += dstGenericDescPtr->strides[2]; + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; } - srcPtrDepth += srcGenericDescPtr->strides[1]; - dstPtrDepth += dstGenericDescPtr->strides[1]; } } + else if (numDims == 2) + { + srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + anchor[1]; + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + Rpp32u copyLengthInBytes = maxWidth * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1]); + if (needPadding && enablePadding) + std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue); + + T *srcPtrRow = srcPtrChannel; + T *dstPtrRow = dstPtrChannel; + for(int j = 0; j < maxHeight; j++) + { + memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes); + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[1]; + } + } + else if (numDims == 1) + { + srcPtrChannel = srcPtrTemp + anchor[0]; + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + Rpp32u copyLengthInBytes = maxLength * sizeof(T); + + // if padding is required, fill the buffer with fill value specified + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + if (needPadding && enablePadding) + std::fill(dstPtrTemp, dstPtrTemp + dstGenericDescPtr->strides[0] - 1, *fillValue); + memcpy(dstPtrChannel, srcPtrChannel, copyLengthInBytes); + } } return RPP_SUCCESS; diff --git a/src/modules/cpu/kernel/spectrogram.hpp b/src/modules/cpu/kernel/spectrogram.hpp new file mode 100644 index 000000000..2489d2180 --- /dev/null +++ b/src/modules/cpu/kernel/spectrogram.hpp @@ -0,0 +1,245 @@ +/* +MIT 
License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "third_party/ffts/ffts.h" +#include "third_party/ffts/ffts_attributes.h" +#include + +bool is_pow2(Rpp64s n) { return (n & (n-1)) == 0; } +inline bool can_use_real_impl(Rpp64s n) { return is_pow2(n); } +inline Rpp64s size_in_buf(Rpp64s n) { return can_use_real_impl(n) ? n : 2 * n; } +inline Rpp64s size_out_buf(Rpp64s n) { return can_use_real_impl(n) ? n + 2 : 2 * n; } + +// Compute hanning window +inline void hann_window(Rpp32f *output, Rpp32s windowSize) +{ + Rpp64f a = (2.0 * M_PI) / windowSize; + for (Rpp32s t = 0; t < windowSize; t++) + { + Rpp64f phase = a * (t + 0.5); + output[t] = (0.5 * (1.0 - std::cos(phase))); + } +} + +// Compute number of spectrogram windows +inline Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows) +{ + if (!centerWindows) + length -= windowLength; + return ((length / windowStep) + 1); +} + +// Compute reflect start idx to pad +inline Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc) +{ + if (maxLoc - minLoc < 2) + return maxLoc - 1; + for (;;) + { + if (loc < minLoc) + loc = 2 * minLoc - loc; + else if (loc >= maxLoc) + loc = 2 * maxLoc - 2 - loc; + else + break; + } + return loc; +} + +RppStatus spectrogram_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcLengthTensor, + bool centerWindows, + bool reflectPadding, + Rpp32f *windowFunction, + Rpp32s nfft, + Rpp32s power, + Rpp32s windowLength, + Rpp32s windowStep, + rpp::Handle& handle) +{ + Rpp32s windowCenterOffset = 0; + bool vertical = (dstDescPtr->layout == RpptLayout::NFT); + if (centerWindows) windowCenterOffset = windowLength / 2; + if (nfft == 0) nfft = windowLength; + const Rpp32s numBins = nfft / 2 + 1; + const Rpp32f mulFactor = (2.0 * M_PI) / nfft; + const Rpp32u hStride = dstDescPtr->strides.hStride; + const Rpp32s alignedNfftLength = nfft & ~7; + const Rpp32s alignedNbinsLength = numBins & ~7; + const Rpp32s alignedWindowLength = windowLength & ~7; + bool useRealImpl = can_use_real_impl(nfft); + const auto fftInSize = size_in_buf(nfft); + const auto fftOutSize = size_out_buf(nfft); + + Rpp32f *windowFn = static_cast(calloc(windowLength, sizeof(Rpp32f))); + + // Generate hanning window + if (windowFunction == NULL) + hann_window(windowFn, windowLength); + else + memcpy(windowFn, windowFunction, windowLength * sizeof(Rpp32f)); + Rpp32u numThreads = 
handle.GetNumThreads(); + + // Get windows output + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (Rpp32s batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride; + Rpp32s bufferLength = srcLengthTensor[batchCount]; + Rpp32s numWindows = get_num_windows(bufferLength, windowLength, windowStep, centerWindows); + Rpp32f windowOutput[numWindows * nfft]; + std::fill_n(windowOutput, numWindows * nfft, 0); + for (Rpp32s w = 0; w < numWindows; w++) + { + Rpp32s windowStart = w * windowStep - windowCenterOffset; + Rpp32f *windowOutputTemp = windowOutput + (w * nfft); + // Pad when either windowStart less than zero or length greater than input srclength + if (windowStart < 0 || (windowStart + windowLength) > bufferLength) + { + for (Rpp32s t = 0; t < windowLength; t++) + { + Rpp32s inIdx = windowStart + t; + if (reflectPadding) + { + inIdx = get_idx_reflect(inIdx, 0, bufferLength); + *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx]; + } + else + { + if (inIdx >= 0 && inIdx < bufferLength) + *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx]; + else + *windowOutputTemp++ = 0; + } + } + } + else + { + Rpp32f *srcPtrWindowTemp = srcPtrTemp + windowStart; + Rpp32f *windowFnTemp = windowFn; + Rpp32s t = 0; + for (; t < alignedWindowLength; t += 8) + { + __m256 pSrc, pWindowFn; + pSrc = _mm256_loadu_ps(srcPtrWindowTemp); + pWindowFn = _mm256_loadu_ps(windowFnTemp); + pSrc = _mm256_mul_ps(pSrc, pWindowFn); + _mm256_storeu_ps(windowOutputTemp, pSrc); + srcPtrWindowTemp += 8; + windowFnTemp += 8; + windowOutputTemp += 8; + } + for (; t < windowLength; t++) + *windowOutputTemp++ = (*windowFnTemp++) * (*srcPtrWindowTemp++); + } + } + + // Generate FFT output + ffts_plan_t *p; + if(useRealImpl) + p = ffts_init_1d_real(nfft, FFTS_FORWARD); + else + p = ffts_init_1d(nfft, FFTS_FORWARD); + + if (!p) + { + printf("FFT Plan is unsupported. Exiting the code\n"); + exit(0); + } + + // Set temporary buffers to 0 + Rpp32f FFTS_ALIGN(32) *fftInBuf = static_cast(_mm_malloc(fftInSize * sizeof(Rpp32f), 32)); // ffts requires 32-byte aligned memory + Rpp32f FFTS_ALIGN(32) *fftOutBuf = static_cast(_mm_malloc(fftOutSize * sizeof(Rpp32f), 32)); // ffts requires 32-byte aligned memory + + for (Rpp32s w = 0; w < numWindows; w++) + { + Rpp32f *dstPtrBinTemp = dstPtrTemp + (w * hStride); + Rpp32f *windowOutputTemp = windowOutput + (w * nfft); + for(int k = 0; k < fftInSize; k++) + fftInBuf[k] = 0.0f; + + for(int k = 0; k < fftOutSize; k++) + fftOutBuf[k] = 0.0f; + + Rpp32s inWindowStart = windowLength < nfft ? 
(nfft - windowLength) / 2 : 0; + // Copy the window input to fftInBuf + if (useRealImpl) + { + for (int i = 0; i < windowLength; i++) + fftInBuf[inWindowStart + i] = windowOutputTemp[i]; + } + else + { + for (int i = 0; i < windowLength; i++) + { + Rpp32s off = 2 * (inWindowStart + i); + fftInBuf[off] = windowOutputTemp[i]; + fftInBuf[off + 1] = 0.0f; + } + } + + ffts_execute(p, fftInBuf, fftOutBuf); + auto *complexFft = reinterpret_cast *>(fftOutBuf); + Rpp32s outIdx = w; + if (vertical) + { + if (power == 1) + { + for (int i = 0; i < numBins; i++, outIdx += hStride) + dstPtrTemp[outIdx] = std::abs(complexFft[i]); + } + else + { + for (int i = 0; i < numBins; i++, outIdx += hStride) + dstPtrTemp[outIdx] = std::norm(complexFft[i]); + } + } + else + { + if (power == 1) + { + for (int i = 0; i < numBins; i++) + *dstPtrBinTemp++ = std::abs(complexFft[i]); + } + else + { + for (int i = 0; i < numBins; i++) + *dstPtrBinTemp++ = std::norm(complexFft[i]); + } + } + } + ffts_free(p); + _mm_free(fftInBuf); + _mm_free(fftOutBuf); + } + if(windowFn) + free(windowFn); + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/tensor_stddev.hpp b/src/modules/cpu/kernel/tensor_stddev.hpp index 3013d9e54..2f64e93ab 100644 --- a/src/modules/cpu/kernel/tensor_stddev.hpp +++ b/src/modules/cpu/kernel/tensor_stddev.hpp @@ -147,8 +147,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_u8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -239,8 +239,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_u8pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif @@ -408,8 +408,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -500,8 +500,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - 
compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif @@ -682,8 +682,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr, __m256d p[6]; rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -778,8 +778,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr, __m256d p[6]; rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } @@ -949,8 +949,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_i8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTempR += vectorIncrementPerChannel; srcPtrTempG += vectorIncrementPerChannel; srcPtrTempB += vectorIncrementPerChannel; @@ -1041,8 +1041,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr, { __m256d p[6]; rpp_simd_load(rpp_load24_i8pkd3_to_f64pln3_avx, srcPtrTemp, p); - compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); - compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); + compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB); + compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB); srcPtrTemp += vectorIncrement; } #endif diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index c0a27806e..a0f50ee7e 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -29,5 +29,6 @@ SOFTWARE. 
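Every normalize kernel variant introduced in normalize.hpp below applies the same per-element transform: subtract the (supplied or internally computed) mean, multiply by scale divided by stddev, and add shift. As a reading aid, here is a minimal host-side sketch of that transform; normalize_element is a hypothetical name, not a helper from this patch:

// Mirrors the kernels' guard invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f,
// which falls back to a unit multiplier when the standard deviation is zero.
static inline float normalize_element(float in, float mean, float stdDev, float scale, float shift)
{
    float invStdDev = (stdDev != 0.0f) ? (scale / stdDev) : 1.0f;
    return (in - mean) * invStdDev + shift;   // same math as fmaf(in - mean, invStdDev, shift)
}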
#include "kernel/tensor_max.hpp" #include "kernel/tensor_mean.hpp" #include "kernel/tensor_stddev.hpp" +#include "kernel/normalize.hpp" #endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp new file mode 100644 index 000000000..384454ca7 --- /dev/null +++ b/src/modules/hip/kernel/normalize.hpp @@ -0,0 +1,1921 @@ +#include +#include "rpp_hip_common.hpp" + +#define MAX_SHARED_MEMORY_SIZE 1024 + +// -------------------- Set 0 - normalization kernels device helpers -------------------- + +__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst) +{ + outVal = fmax(fminf(outVal, 255), 0); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst) +{ + outVal = fmax(fminf(outVal, 127), -128); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst) +{ + *dst = outVal; +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst) +{ + *dst = static_cast(outVal); +} + +// -------------------- Set 1 - normalization kernel host helpers -------------------- + +// setup function needed for 2D/3D optimized kernel variants when mean and stddev needs to be internally computed +void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, + Rpp32u axisMask, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the max param volume + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + for(uint j = 0; j < tensorDims; j++) + paramVolume *= (axisSet[j]) ? 1 : roi[j]; + maxParamVolume = std::max(maxParamVolume, paramVolume); + } +} + +// setup function needed for ND generic kernel variants +void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask, + Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the param shape and param volume based on the axis mask + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + Rpp32u *paramShape = ¶mShapeTensor[i * tensorDims]; + for(uint j = 0; j < tensorDims; j++) + { + paramShape[j] = (axisSet[j]) ? 
1 : roi[j]; + paramVolume *= paramShape[j]; + } + maxParamVolume = std::max(maxParamVolume, paramVolume); + + // calculate the param strides from the param shape + Rpp32u *paramStrides = ¶mStridesTensor[i * tensorDims]; + Rpp32u val = 1; + for(uint j = tensorDims - 1; j > 0; j--) + { + paramStrides[j] = val; + val *= paramShape[j]; + } + paramStrides[0] = val; + } +} + +// -------------------- Set 2 - normalization kernels -------------------- + +template +__global__ void normalize_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint2 maxParamVolumeAndAxisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint maxParamVolume = maxParamVolumeAndAxisMask.x; + uint axisMask = maxParamVolumeAndAxisMask.y; + uint paramIndex = id_z * maxParamVolume; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_x; + else if (axisMask == 2) + paramIndex += id_y; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + T *dstPtr, + uint2 dstStridesDH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint axisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint paramIndex = 0; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_y * lengthX + id_x; + else if (axisMask == 2) + paramIndex += id_z * lengthX + id_x; + else if (axisMask == 4) + paramIndex += id_z * lengthY + id_y; + else if (axisMask == 3) + paramIndex += id_x; + else if (axisMask == 5) + paramIndex += id_y; + else if (axisMask == 6) + paramIndex += id_z; + + uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + T *dstPtr, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint2 maxParamVolumeAndBufferLength, + uint tensorDims, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint maxBufferLength = maxParamVolumeAndBufferLength.y; + + if (id_x >= maxBufferLength) + return; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint maxParamVolume = maxParamVolumeAndBufferLength.x; + uint srcIdx = id_z * maxBufferLength; + + uint paramIndex = id_z * maxParamVolume; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + uint dstIdx = id_z * maxBufferLength + id_x; + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +// -------------------- Set 3 - mean and stddev compute kernels device helpers -------------------- + +__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem) +{ + for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } +} + +// -------------------- Set 4 - mean compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_mean_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise mean + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_x; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNH.y; + } + meanTensor[dstIdx] = accum / static_cast(height); + } + } + // compute partial sums needed for row wise mean + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial sums need for computing mean over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using 
all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_mean_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute mean along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.y; + } + meanTensor[dstIdx] = accum / static_cast(lengthZ); + } + // compute mean along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.z; + } + meanTensor[dstIdx] = accum / static_cast(lengthY); + } + // compute mean along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i 
+= 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x; + meanTensor[dstIdx] = accum_f8.f1[0] / static_cast(lengthX); + } + // compute partial sums required for computing mean along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + partialSumRowPtr_smem[hipThreadIdx_x] = static_cast(srcPtr[srcIdx]); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial sums required for computing mean along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + 
hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute mean along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + accum += static_cast(srcPtr[srcIdx + tid_x]); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + meanTensor[dstIdx] = partialSum_smem[0] / static_cast(lengthX * lengthZ); + } + // compute partial sums required for computing mean along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_mean_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * 
tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + atomicAdd(&meanTensor[paramBase + paramIndex], static_cast(srcPtr[srcIdx])); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + atomicAdd(&sh_mem[paramIndex], static_cast(srcPtr[srcIdx])); + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 5 - stddev compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_stddev_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *stdDevTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise stdDev + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_x; + float mean = meanTensor[paramIndex]; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNH.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(height)); + } + } + // compute partial mean subtracted squared sums needed for row wise stdDev + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + 
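// Reference sketch (hypothetical helper, not code from this patch): the mean and stddev kernels in this
// file all accumulate a per-thread partial value, here (x - mean)^2, and then fold the partial sums with
// reduction_sum_x_hip, which halves the number of active lanes each step. A scalar equivalent of that fold:
static float tree_reduce_sum(float *buf, unsigned n)                 // n must be a power of two, e.g. the block size
{
    for (unsigned active = n / 2; active >= 1; active /= 2)          // same loop shape as reduction_sum_x_hip
        for (unsigned i = 0; i < active; i++)
            buf[i] += buf[i + active];                               // buf plays the role of partialSum_smem
    return buf[0];                                                   // the block-level sum ends up in element 0
}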
rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial mean subtracted squared sums need for computing stdDev over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + float mean = meanTensor[id_z]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_stddev_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + 
hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute stddev along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthZ)); + } + // compute stddev along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.z; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthY)); + } + // compute stddev along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + + stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast(lengthX)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint paramIndex = id_z * maxParamVolume + xIndex; + float mean = meanTensor[paramIndex]; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + float val = static_cast(srcPtr[srcIdx]) - mean; + partialSumRowPtr_smem[hipThreadIdx_x] = (val * val); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; 
threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial mean subtracted squared sums required for computing stdDev along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute stddev along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + float val = (static_cast(srcPtr[srcIdx + tid_x]) - mean); + accum += (val * val); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast(lengthX * lengthZ)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ 
float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + uint paramIndex = id_z * maxParamVolume; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_stddev_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? 
((coord % paramShape[i]) * paramStrides[i]) : 0; + } + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val)); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + { + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&sh_mem[paramIndex], (val * val)); + } + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) -------------------- + +__global__ void reduce_final_result_hip(float *partialSumTensor, + uint numPartialSums, + float *meanTensor, + float *stdDevTensor, + bool isMean, + uint *roiTensor, + uint axisMask, + uint tensorDims) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint meanFactor; + if (tensorDims == 3) + { + uint lengthZ = roi[0]; + uint lengthY = roi[1]; + uint lengthX = roi[2]; + + if (axisMask == 3) + meanFactor = lengthZ * lengthY; + else if (axisMask == 6) + meanFactor = lengthY * lengthX; + else if (axisMask == 7) + meanFactor = lengthZ * lengthY * lengthX; + } + else if (tensorDims == 2) + { + uint lengthY = roi[0]; + uint lengthX = roi[1]; + + if (axisMask == 2) + meanFactor = lengthX; + else if (axisMask == 3) + meanFactor = lengthY * lengthX; + } + + __shared__ float partialSum_smem[16]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + float accum = 0.0f; + while(id_x < numPartialSums) + { + uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x; + accum += partialSumTensor[srcIdx]; + id_x += hipBlockDim_x; + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + if (isMean) + meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor; + else + stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor); + } +} + +__global__ void final_reduction_nd_hip_tensor(float *meanTensor, + float *stdDevTensor, + uint *paramShapeTensor, + uint *roiTensor, + uint tensorDims, + uint maxParamVolume, + bool isMean) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint divisionFactor = 1; + uint paramVolume = 1; + for(int i = 0; i < tensorDims; i++) + { + paramVolume *= paramShape[i]; + if (paramShape[i] == 1) + divisionFactor *= roi[i]; + } + + if 
(id_x >= paramVolume) + return; + + uint paramIndex = id_z * maxParamVolume + id_x; + if (isMean) + meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor; + else + stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor); +} + +// -------------------- Set 7 - mean and stddev compute kernels launch helpers -------------------- + +void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Y direction + case 1: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along X direction + case 2: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XY direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Z direction + case 1: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along Y direction + case 2: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along YZ direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * 
globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along X direction + case 4: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XZ direction + case 5: + { + localThreads_x = 32; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = srcGenericDescPtr->dims[2]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XY direction + case 6: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XYZ direction + case 7: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3; + globalThreads_x = static_cast (ceil((float)numValues / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +// -------------------- Set 8 - mean and stddev compute kernels executor -------------------- + +template +RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + bool isMean, + Rpp32u *roiTensor, + Rpp32u axisMask, + Rpp32u tensorDims, + Rpp32u maxParamVolume, + Rpp32u *paramShape, + Rpp32u *paramStrides, + rpp::Handle& handle) +{ + Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32u partialSumArrLength, partialSumBlocksPerSample; + + int globalThreads_x, globalThreads_y, globalThreads_z; + int localThreads_x, localThreads_y, localThreads_z; + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, 
localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + + if (axisMask == 2) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + else if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else if (tensorDims == 3) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + stdDevTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + + // perform final reduction on block wise sums for below cases + // reduce on YZ partial sums + if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XY partial sums + if (axisMask == 6) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XYZ block partial sums + else if (axisMask == 7) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else + { + // interpret the input as 1D 
tensor + globalThreads_x = srcGenericDescPtr->strides[0]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + Rpp32u batchSize = globalThreads_z; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + + Rpp32u shared_memory_size = 0; + Rpp32u block_size = 1024; + if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE) + { + if (maxParamVolume <= 32) + shared_memory_size = 32; + else if (maxParamVolume <= 64) + shared_memory_size = 64; + else if (maxParamVolume <= 128) + shared_memory_size = 128; + else if (maxParamVolume <= 256) + shared_memory_size = 256; + else if (maxParamVolume <= 512) + shared_memory_size = 512; + else + shared_memory_size = MAX_SHARED_MEMORY_SIZE; + block_size = shared_memory_size; + } + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + else + { + hipLaunchKernelGGL(compute_stddev_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + stdDevTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + hipLaunchKernelGGL(final_reduction_nd_hip_tensor, + dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + meanTensor, + stdDevTensor, + paramShape, + roiTensor, + tensorDims, + maxParamVolume, + isMean); + } + hipStreamSynchronize(handle.GetStream()); + return RPP_SUCCESS; +} + +// -------------------- Set 9 - normalization kernel executor -------------------- + +template +RppStatus hip_exec_normalize_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u batchSize = srcGenericDescPtr->dims[0]; + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + // create buffer for paramShape and paramStride needed for generic kernel + Rpp32u *paramShape, *paramStrides; + paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem; + paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims); + + // do initial preprocessing, compute maxParamVolue and fill the values for paramShape and paramStrides + Rpp32u maxParamVolume; + if (tensorDims == 2 || tensorDims == 3) + normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims, + axisMask, maxParamVolume); + else + normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask, + paramShape, paramStrides, maxParamVolume); + + bool computeMean = computeMeanStddev & 1; // if 0th bit in computeMeanStddev is set, computeMean is set to true. Otherwise it is set to false + bool computeStdDev = computeMeanStddev & 2; // if 1st bit in computeMeanStddev is set, computeStdDev is set to true. 
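// Hedged illustration from the editor, not patch content: per the decode just above,
// computeMeanStddev packs two booleans into one Rpp8u. A caller-side helper might look
// like this (the helper name is illustrative only):
static inline Rpp8u make_compute_mean_stddev_flag(bool computeMeanInside, bool computeStdDevInside)
{
    return static_cast<Rpp8u>((computeMeanInside ? 1 : 0) |    // bit 0 -> RPP computes the mean
                              (computeStdDevInside ? 2 : 0));  // bit 1 -> RPP computes the stddev
}
// e.g. a value of 3 asks RPP to derive both statistics from the input, while 0 uses the
// caller-supplied meanTensor/stdDevTensor unchanged.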
Otherwise it is set to false + if ((!computeMean) && (!computeStdDev)) + maxParamVolume = 0; + + // if computeMean is set, compute mean values by processing over input based on axisMask values + if (computeMean) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values + if (computeStdDev) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(normalize_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + make_uint2(maxParamVolume, axisMask), + computeStdDev); + } + else if (tensorDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(normalize_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &meanTensor[batchCount * maxParamVolume], + &stdDevTensor[batchCount * maxParamVolume], + make_float2(scale, shift), + &roiTensor[batchCount * 6], + axisMask, + computeStdDev); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = dstGenericDescPtr->strides[0]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + hipLaunchKernelGGL(normalize_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + dstPtr, + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + paramShape, + paramStrides, + make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]), + tensorDims, + computeStdDev); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/slice.hpp b/src/modules/hip/kernel/slice.hpp index 8deb52bbb..c1b7a6c41 100644 --- a/src/modules/hip/kernel/slice.hpp +++ 
b/src/modules/hip/kernel/slice.hpp @@ -2,24 +2,79 @@ #include #include "rpp_hip_common.hpp" +template +__global__ void fill_value_ncdhw_hip_tensor(T *dstPtr, + uint3 dstStridesCDH, + int channels, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; + d_float8 val_f8; + val_f8.f4[0] = (float4)(*fillValue); + val_f8.f4[1] = val_f8.f4[0]; + for(int c = 0; c < channels; c++) + { + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8); + dstIdx += dstStridesCDH.x; + } +} + + +template +__global__ void fill_value_ndhwc_hip_tensor(T *dstPtr, + uint2 dstStridesDH, + uint3 dstDimsDHW, + T *fillValue) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim + + if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z)) + { + return; + } + + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + d_float24 val_f24; + val_f24.f4[0] = (float4)(*fillValue); + val_f24.f4[1] = val_f24.f4[0]; + val_f24.f4[2] = val_f24.f4[0]; + val_f24.f4[3] = val_f24.f4[0]; + val_f24.f4[4] = val_f24.f4[0]; + val_f24.f4[5] = val_f24.f4[0]; + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); +} + + template __global__ void slice_ncdhw_hip_tensor(T *srcPtr, uint3 srcStridesCDH, T *dstPtr, uint3 dstStridesCDH, int channels, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x); + uint srcIdx = (id_z * srcStridesCDH.y) + (id_y * srcStridesCDH.z) + id_x; uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x; d_float8 val_f8; @@ -32,77 +87,439 @@ __global__ void slice_ncdhw_hip_tensor(T *srcPtr, } } + template __global__ void slice_ndhwc_hip_tensor(T *srcPtr, uint2 srcStridesDH, T *dstPtr, uint2 dstStridesDH, - RpptROI3DPtr roiGenericSrc) + uint3 validShapeDHW) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim - if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth)) + if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x 
>= validShapeDHW.z)) { return; } - uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x) * 3; - uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3; + uint srcIdx = (id_z * srcStridesDH.x) + (id_y * srcStridesDH.y) + (id_x * 3); + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + (id_x * 3); d_float24 val_f24; rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24); rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24); } +template +RppStatus hip_exec_fill_value_tensor(T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + Rpp32u *roiTensor, + rpp::Handle& handle, + Rpp32u numDims) +{ + if (numDims == 4) + { + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + + // checking if padding is required + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) || + ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]])); + + // if needPadding is set, launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + 
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + + // check if padding is needed + bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) || + ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCHW) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, shape[1], shape[2]), + fillValue); + } + else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NHWC) + { + hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth), + fillValue); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = 
roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + + // check if padding is needed + bool needPadding = (((anchor[0] + shape[0]) > length[0]) || + ((anchor[1] + shape[1]) > length[1])); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, shape[0], shape[1]), + fillValue); + } + } + } + else if (numDims == 1) + { + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + + // check if padding is needed + bool needPadding = ((anchor[0] + shape[0]) > length[0]); + + // launch kernel for filling the padded region with fill value specified + if (needPadding) + { + hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, 1, 1), + 0, + handle.GetStream(), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, shape[0]), + fillValue); + } + } + } + + return RPP_SUCCESS; +} + template RppStatus hip_exec_slice_tensor(T *srcPtr, RpptGenericDescPtr srcGenericDescPtr, T *dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + T *fillValue, + bool enablePadding, + Rpp32u *roiTensor, rpp::Handle& handle) { - if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + /* if enabledPadding is set to true, launch kernel to fill the output buffers with fill value specified. 
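// Hedged worked example from the editor, not patch content, of the per-sample arithmetic used
// throughout the slice/fill executors: each sample's roiTensor entry holds numDims begin values
// followed by numDims lengths, the copied extent per dimension is clamped to what the ROI can
// supply, and padding is only required when anchor + shape overruns the ROI length:
//   Rpp32u validExtent = std::min(shape[d], length[d] - anchor[d]);
//   bool   needPadding = (anchor[d] + shape[d]) > length[d];
// For a 2D HW sample with length {200, 300}, anchor {150, 0}, shape {100, 300}:
//   validH = min(100, 200 - 150) = 50  -> rows 150..199 of the source are copied
//   validW = min(300, 300 - 0) = 300   -> the full width is copied
//   needPadding holds for H            -> the last 50 output rows receive the fill value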
+ This will be only done if shapeTensor[d] > roiTensor[d] where d is the dimension*/ + if (enablePadding) + { + hip_exec_fill_value_tensor(dstPtr, + dstGenericDescPtr, + anchorTensor, + shapeTensor, + fillValue, + roiTensor, + handle, + numDims); + hipStreamSynchronize(handle.GetStream()); + } + + if(numDims == 4) { - int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + // set the dimsOrder and globalthreads values required for NDHWC layout + Rpp32s dimsOrder[3] = {0, 1, 2}; + int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) + + // change the dimsOrder and globalthreads values if layout is NCDHW + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + dimsOrder[0] = 1; // depth + dimsOrder[1] = 2; // height + dimsOrder[2] = 3; // width + globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction) + globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction) + } for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]); + if (dstGenericDescPtr->layout == RpptLayout::NCDHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2] * srcGenericDescPtr->strides[3] + anchor[3]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), + dstGenericDescPtr->dims[1], + make_uint3(maxDepth, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + 
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + make_uint3(maxDepth, maxHeight, maxWidth)); + } + } + } + else if (numDims == 3) + { + // set the dimsOrder and globalthreads values required for NHWC layout + Rpp32s dimsOrder[2] = {0, 1}; + int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + + // change the dimsOrder and globalthreads values if layout is NCHW + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + dimsOrder[0] = 1; // height + dimsOrder[1] = 2; // width + globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) + globalThreads_z = 1; + } + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]); + Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]); + if (dstGenericDescPtr->layout == RpptLayout::NCHW) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint3(srcGenericDescPtr->strides[1], 0, srcGenericDescPtr->strides[2]), + dstPtrTemp, + make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]), + dstGenericDescPtr->dims[1], + make_uint3(1, maxHeight, maxWidth)); + } + else if (dstGenericDescPtr->layout == RpptLayout::NHWC) + { + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), + 0, + handle.GetStream(), + srcPtrTemp, + make_uint2(1, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint2(1, dstGenericDescPtr->strides[1]), + make_uint3(1, maxHeight, maxWidth)); + } + } + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction) + int globalThreads_z = 1; + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = 
&shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]); + Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[2] + anchor[1]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]), - dstGenericDescPtr->dims[1], - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, srcGenericDescPtr->strides[1]), + dstPtrTemp, + make_uint3(0, 0, dstGenericDescPtr->strides[1]), + 1, + make_uint3(1, maxHeight, maxWidth)); } } - else if (dstGenericDescPtr->layout == RpptLayout::NDHWC) + else if (numDims == 1) { - int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel - int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction) - int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction) - + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel + int globalThreads_y = 1; + int globalThreads_z = 1; for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) { - hipLaunchKernelGGL(slice_ndhwc_hip_tensor, + Rpp32s *anchor = &anchorTensor[batchCount * numDims]; + Rpp32s *shape = &shapeTensor[batchCount * numDims]; + Rpp32u *roi = roiTensor + batchCount * numDims * 2; + Rpp32s *length = reinterpret_cast(&roi[numDims]); + Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]); + T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0]; + T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]); + + hipLaunchKernelGGL(slice_ncdhw_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + dim3(LOCAL_THREADS_X, 1, 1), 0, handle.GetStream(), - srcPtr + (batchCount * srcGenericDescPtr->strides[0]), - make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), - dstPtr + (batchCount * dstGenericDescPtr->strides[0]), - make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), - &roiGenericPtrSrc[batchCount]); + srcPtrTemp, + make_uint3(0, 0, 1), + dstPtrTemp, + make_uint3(0, 0, 1), + 1, + make_uint3(1, 1, maxLength)); } } diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp index e35e5c514..3285ee756 100644 --- a/src/modules/rppi_validate.hpp +++ b/src/modules/rppi_validate.hpp @@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels) } else if(layout == RpptLayout::NHWC || 
layout == RpptLayout::NDHWC) { - if (channels == 3) // PKD3 - { - layoutParams.channelParam = 1; - layoutParams.bufferMultiplier = 3; - } + //PKD + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = channels; } return layoutParams; } diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index e2c9b18f7..0267985e5 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -32,8 +32,8 @@ SOFTWARE. RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, - Rpp32f *detectedIndexTensor, - Rpp32f *detectionLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, @@ -155,6 +155,88 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, } } +/******************** spectrogram ********************/ + +RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcLengthTensor, + bool centerWindows, + bool reflectPadding, + Rpp32f *windowFunction, + Rpp32s nfft, + Rpp32s power, + Rpp32s windowLength, + Rpp32s windowStep, + rppHandle_t rppHandle) +{ + if ((dstDescPtr->layout != RpptLayout::NFT) && (dstDescPtr->layout != RpptLayout::NTF)) return RPP_ERROR_INVALID_DST_LAYOUT; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + spectrogram_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcLengthTensor, + centerWindows, + reflectPadding, + windowFunction, + nfft, + power, + windowLength, + windowStep, + rpp::deref(rppHandle)); + + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} + +/******************** mel_filter_bank ********************/ + +RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s* srcDimsTensor, + Rpp32f maxFreq, + Rpp32f minFreq, + RpptMelScaleFormula melFormula, + Rpp32s numFilter, + Rpp32f sampleRate, + bool normalize, + rppHandle_t rppHandle) +{ + if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT; + if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + mel_filter_bank_host_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + maxFreq, + minFreq, + melFormula, + numFilter, + sampleRate, + normalize, + rpp::deref(rppHandle)); + return RPP_SUCCESS; + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } +} + /******************** resample ********************/ RppStatus rppt_resample_host(RppPtr_t srcPtr, diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index 45a0d5221..d758aa676 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1050,43 +1050,54 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, + Rpp32s *anchorTensor, + Rpp32s *shapeTensor, + RppPtr_t fillValue, + bool enablePadding, + Rpp32u *roiTensor, rppHandle_t rppHandle) { + if ((srcGenericDescPtr->dataType != 
RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; + if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH; + RppLayoutParams layoutParams; if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); - - if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; - if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; + else if ((srcGenericDescPtr->layout == RpptLayout::NCHW) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { - slice_f32_f32_host_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), - srcGenericDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), - dstGenericDescPtr, - roiGenericPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + slice_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); } else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) { - slice_u8_u8_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, - srcGenericDescPtr, - static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, - dstGenericDescPtr, - roiGenericPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + slice_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); } return RPP_SUCCESS; @@ -1848,24 +1859,29 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, - RpptROI3DPtr roiGenericPtrSrc, - RpptRoi3DType roiType, + Rpp32s *anchorTensor, + 
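// Hedged usage sketch from the editor, not patch content, for the reworked slice entry points;
// descriptor and handle setup are assumed to happen elsewhere and all buffer names are
// illustrative. For a single 2D F32 sample of size 128x256:
//   Rpp32s anchor[2] = {10, 20};            // per-dim slice start
//   Rpp32s shape[2]  = {100, 150};          // per-dim slice extent
//   Rpp32u roi[4]    = {0, 0, 128, 256};    // numDims begins followed by numDims lengths
//   Rpp32f fill      = 0.0f;
//   rppt_slice_host(srcPtr, srcGenericDescPtr, dstPtr, dstGenericDescPtr,
//                   anchor, shape, &fill, true /*enablePadding*/, roi, handle);
// rppt_slice_gpu appears to take the same parameter list, with HIP-accessible src/dst buffers.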
Rpp32s *shapeTensor, + RppPtr_t fillValue, + bool enablePadding, + Rpp32u *roiTensor, rppHandle_t rppHandle) { #ifdef HIP_COMPILE - if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT; - if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT; - if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS; if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE; if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH; if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) { - hip_exec_slice_tensor((Rpp32f*) (static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + hip_exec_slice_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), srcGenericDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), dstGenericDescPtr, - roiGenericPtrSrc, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, rpp::deref(rppHandle)); } else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) @@ -1874,7 +1890,11 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, srcGenericDescPtr, static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, dstGenericDescPtr, - roiGenericPtrSrc, + anchorTensor, + shapeTensor, + static_cast(fillValue), + enablePadding, + roiTensor, rpp::deref(rppHandle)); } diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index f14e73a1a..dd151ec7d 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -241,6 +241,100 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** normalize_ND ********************/ + +RppStatus rppt_normalize_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + Rpp32u tensorDim = srcGenericDescPtr->numDims - 1; + if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); + else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]); + + if ((srcGenericDescPtr->dataType == RpptDataType::U8) 
&& (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + normalize_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + normalize_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} /******************** tensor_mean ********************/ @@ -402,6 +496,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, /********************************************************************************************************************/ #ifdef GPU_SUPPORT + /******************** tensor_sum ********************/ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, @@ -480,7 +575,6 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, } /******************** tensor_min ********************/ - RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, @@ -609,6 +703,87 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, #endif // backend } +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + 
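// Hedged usage sketch from the editor, not patch content, for the new normalize entry points;
// descriptors, device buffers and the handle are assumed to exist, and meanTensor/stdDevTensor
// only need caller-filled values for whichever computeMeanStddev bits are left unset:
//   Rpp32u axisMask = 3;           // reduce over axes 0 and 1 of each sample
//   Rpp8u  computeMeanStddev = 3;  // bit 0: compute mean, bit 1: compute stddev
//   rppt_normalize_gpu(d_input, srcGenericDescPtr, d_output, dstGenericDescPtr,
//                      axisMask, d_mean, d_stdDev, computeMeanStddev,
//                      1.0f /*scale*/, 0.0f /*shift*/, roiTensor, handle);
// rppt_normalize_host mirrors the same parameter list for host buffers.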
dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** tensor_mean ********************/ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, @@ -761,5 +936,5 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; #endif // backend } -#endif // GPU_SUPPORT +#endif // GPU_SUPPORT \ No newline at end of file diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 82ed65309..77052cabe 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -83,7 +83,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) @@ -105,7 +105,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_hip_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index 26017065e..a0bd42fa0 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_hip Tensor_hip.cpp) + add_executable(Tensor_misc_hip Tensor_misc_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17") target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) + target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) else() message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 6008f33d3..fca310947 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ 
b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -334,11 +334,20 @@ int main(int argc, char **argv) bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91 else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8)) bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u); + CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize)); if(testCase == 91) CHECK_RETURN_STATUS(hipHostMalloc(&mean, reductionFuncResultArrLength * bitDepthByteSize)); } + // create generic descriptor and params in case of slice + RpptGenericDesc descriptor3D; + RpptGenericDescPtr descriptorPtr3D = &descriptor3D; + Rpp32s *anchorTensor = NULL, *shapeTensor = NULL; + Rpp32u *roiTensor = NULL; + if(testCase == 92) + set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize); + // Allocate hip memory for src/dst CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize)); CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize)); @@ -349,6 +358,7 @@ int main(int argc, char **argv) if(testCase == 82) CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI))); + // create cropRoi and patchRoi in case of crop_and_patch RpptROI *cropRoi, *patchRoi; if(testCase == 33) { @@ -1168,6 +1178,28 @@ int main(int argc, char **argv) break; } + case 92: + { + testCaseName = "slice"; + Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims + if(anchorTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * numDims * sizeof(Rpp32s))); + if(shapeTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * numDims * sizeof(Rpp32s))); + if(roiTensor == NULL) + CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * numDims * 2 * sizeof(Rpp32u))); + bool enablePadding = false; + auto fillValue = 0; + init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor); + + startWallTime = omp_get_wtime(); + if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout) + rppt_slice_gpu(d_input, descriptorPtr3D, d_output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle); + else + missingFuncFlag = 1; + + break; + } default: { missingFuncFlag = 1; @@ -1262,6 +1294,42 @@ int main(int argc, char **argv) refFile.close(); } + // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values + // for output display and comparision purposes + if (testCase == 92) + { + if (dstDescPtr->layout == RpptLayout::NCHW) + { + if (dstDescPtr->c == 3) + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 3; + dstImgSizes[i].height = shapeTensor[idx1 + 1]; + dstImgSizes[i].width = shapeTensor[idx1 + 2]; + } + } + else + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 2; + dstImgSizes[i].height = shapeTensor[idx1]; + dstImgSizes[i].width = shapeTensor[idx1 + 1]; + } + } + } + else if (dstDescPtr->layout == RpptLayout::NHWC) + { + for(int i = 0; i < batchSize; i++) + { + int idx1 = i * 3; + dstImgSizes[i].height = shapeTensor[idx1]; + dstImgSizes[i].width = shapeTensor[idx1 + 1]; + } + } + } + /*Compare the output of the function with golden outputs only if 1.QA Flag is set 2.input bit depth 0 (Input U8 && Output U8) @@ -1328,6 +1396,12 @@ int main(int argc, char **argv) if(testCase == 91) CHECK_RETURN_STATUS(hipHostFree(mean)); } + if(anchorTensor != NULL) + 
CHECK_RETURN_STATUS(hipHostFree(anchorTensor)); + if(shapeTensor != NULL) + CHECK_RETURN_STATUS(hipHostFree(shapeTensor)); + if(roiTensor != NULL) + CHECK_RETURN_STATUS(hipHostFree(roiTensor)); free(input); free(input_second); free(output); diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp new file mode 100644 index 000000000..96197f432 --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -0,0 +1,231 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "../rpp_test_suite_misc.h" + +int main(int argc, char **argv) +{ + // Handle inputs + const int MIN_ARG_COUNT = 9; + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_misc_hip