diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c439d35d..16c4251f4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,30 @@
Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/).
-### RPP 1.5.0 (unreleased)
+### RPP 1.8.0 (unreleased)
+
+### Changes
+
+* Prerequisites - ROCm install now requires only `--usecase=rocm`
+* Use pre-allocated common scratchBufferHip everywhere in Tensor code for scratch HIP memory
+* Use CHECK_RETURN_STATUS everywhere to adhere to C++17 for HIP
+* RPP Tensor Audio support on HOST for Spectrogram
+* RPP Tensor Audio support on HOST/HIP for Slice, by modifying voxel slice kernels to now accept anchor and shape params for a more generic version
+* RPP Tensor Audio support on HOST for Mel Filter Bank
+* RPP Tensor Normalize ND support on HOST and HIP
+
+### Tested configurations
+
+* Linux distribution
+ * Ubuntu - `20.04` / `22.04`
+ * CentOS - `7`
+ * RHEL - `8`/`9`
+* ROCm: rocm-core - `6.1.0.60100`
+* Clang - Version `5.0.1`
+* CMake - Version `3.22.3`
+* IEEE 754-based half-precision floating-point library - Version `1.12.0`
+
+### RPP 1.5.0
### Changes
@@ -376,4 +399,4 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
### Known issues
-* `CPU` backend is not enabled
\ No newline at end of file
+* `CPU` backend is not enabled
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5659a8e6..a53e2a0a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -308,6 +308,7 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col
target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST})
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a)
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR})
target_include_directories(${PROJECT_NAME}
@@ -429,15 +430,12 @@ set(CPACK_RPM_COMPONENT_INSTALL ON)
set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${PROJECT_NAME}")
set(CPACK_RPM_RUNTIME_PACKAGE_REQUIRES "rocm-core, ${RPP_RPM_PACKAGE_LIST}")
set(CPACK_RPM_RUNTIME_PACKAGE_PROVIDES "${PROJECT_NAME}")
-set(CPACK_RPM_RUNTIME_PACKAGE_OBSOLETES "${PROJECT_NAME}")
set(CPACK_RPM_DEV_PACKAGE_NAME "${PROJECT_NAME}-devel")
set(CPACK_RPM_DEV_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}, ${RPP_RPM_DEV_PACKAGE_LIST}")
set(CPACK_RPM_DEV_PACKAGE_PROVIDES "${PROJECT_NAME}-devel")
-set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "${PROJECT_NAME}-devel")
set(CPACK_RPM_TEST_PACKAGE_NAME "${PROJECT_NAME}-test")
set(CPACK_RPM_TEST_PACKAGE_REQUIRES "rocm-core, ${PROJECT_NAME}-devel")
set(CPACK_RPM_TEST_PACKAGE_PROVIDES "${PROJECT_NAME}-test")
-set(CPACK_RPM_TEST_PACKAGE_OBSOLETES "${PROJECT_NAME}-test")
set(CPACK_RPM_PACKAGE_LICENSE "MIT" )
# RPM package specific variable for ASAN
set(CPACK_RPM_ASAN_PACKAGE_NAME "${PROJECT_NAME}-asan" )
diff --git a/README.md b/README.md
index c9349c394..1235faed1 100644
--- a/README.md
+++ b/README.md
@@ -26,14 +26,14 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened
## Prerequisites
* Linux
- * **Ubuntu** - `20.04` / `22.04`
- * **CentOS** - `7`
- * **RedHat** - `8` / `9`
- * **SLES** - `15-SP4`
+ * Ubuntu - `20.04` / `22.04`
+ * CentOS - `7`
+ * RedHat - `8` / `9`
+ * SLES - `15-SP4`
* [ROCm supported hardware](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
-* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=graphics,rocm --no-32`
+* Install ROCm with [amdgpu-install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/amdgpu-install.html) with `--usecase=rocm`
* Clang Version `5.0.1` and above
@@ -74,15 +74,15 @@ Input
(nifti1 .nii medical image) | fused_multiply_add_scalar
(brightened
sudo apt-get install half
```
-> [!NOTE]
-> You must use the appropriate package manager for your operating system.
-
* Compiler with support for C++ Version `17` and above
* OpenMP
* Threads
+> [!NOTE]
+> You must use the appropriate package manager for your operating system.
+
## Build and install instructions
### Package install
diff --git a/docker/rpp-on-ubuntu20.dockerfile b/docker/rpp-on-ubuntu20.dockerfile
index 568b427ab..cff7071c3 100644
--- a/docker/rpp-on-ubuntu20.dockerfile
+++ b/docker/rpp-on-ubuntu20.dockerfile
@@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de
wget ${ROCM_INSTALLER_REPO} && \
sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \
sudo apt-get update -y && \
- sudo amdgpu-install -y --usecase=graphics,rocm
+ sudo amdgpu-install -y --usecase=rocm
# install rpp dependencies - half.hpp
RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \
unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half
diff --git a/docker/rpp-on-ubuntu22.dockerfile b/docker/rpp-on-ubuntu22.dockerfile
index b5571d039..dd94379ff 100644
--- a/docker/rpp-on-ubuntu22.dockerfile
+++ b/docker/rpp-on-ubuntu22.dockerfile
@@ -15,7 +15,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get -y install initramfs-tools libnuma-de
wget ${ROCM_INSTALLER_REPO} && \
sudo apt-get install -y ./${ROCM_INSTALLER_PACKAGE} && \
sudo apt-get update -y && \
- sudo amdgpu-install -y --usecase=graphics,rocm
+ sudo amdgpu-install -y --usecase=rocm
# install rpp dependencies - half.hpp
RUN wget https://sourceforge.net/projects/half/files/half/1.12.0/half-1.12.0.zip && \
unzip half-1.12.0.zip -d half-files && mkdir -p /usr/local/include/half && cp half-files/include/half.hpp /usr/local/include/half
diff --git a/docs/install/install.rst b/docs/install/install.rst
index 7693cc5af..cc0a4a0a1 100644
--- a/docs/install/install.rst
+++ b/docs/install/install.rst
@@ -22,7 +22,7 @@ Prerequisites
* `ROCm supported hardware `_
-* Install ROCm with `amdgpu-install `_ with ``--usecase=graphics,rocm --no-32``
+* Install ROCm with `amdgpu-install `_ with ``--usecase=rocm``
* Clang Version `5.0.1` and above
diff --git a/include/rppdefs.h b/include/rppdefs.h
index c5861e0e6..28876d7f5 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -134,7 +134,7 @@ typedef enum
/*! \brief Out of bound source ROI \ingroup group_rppdefs */
RPP_ERROR_OUT_OF_BOUND_SRC_ROI = -17,
/*! \brief src and dst layout mismatch \ingroup group_rppdefs */
- RPP_ERROR_SRC_DST_LAYOUT_MISMATCH = -18,
+ RPP_ERROR_LAYOUT_MISMATCH = -18,
/*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
RPP_ERROR_INVALID_CHANNELS = -19
} RppStatus;
@@ -369,10 +369,13 @@ typedef enum
*/
typedef enum
{
- NCHW,
- NHWC,
- NCDHW,
- NDHWC
+ NCHW, // BatchSize-Channels-Height-Width
+ NHWC, // BatchSize-Height-Width-Channels
+ NCDHW, // BatchSize-Channels-Depth-Height-Width
+ NDHWC, // BatchSize-Depth-Height-Width-Channels
+ NHW, // BatchSize-Height-Width
+    NFT,   // BatchSize-Frequency-Time -> frequency-major layout, used for Spectrogram / Mel Filter Bank
+    NTF    // BatchSize-Time-Frequency -> time-major layout, used for Spectrogram / Mel Filter Bank
} RpptLayout;
/*! \brief RPPT Tensor 2D ROI type enum
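The two new audio layouts (NFT / NTF) describe 3D batch-frequency-time tensors and are consumed by the Spectrogram and Mel Filter Bank additions below. A minimal sketch of a frequency-major destination descriptor; the field names come from `RpptDesc` in this header, while the concrete sizes are illustrative assumptions.

```cpp
// Hedged sketch: filling an RpptDesc for an NFT (frequency-major) spectrogram output.
// The sizes (512-point FFT, 100 frames, batch of 1) are illustrative assumptions;
// remaining fields are left to the caller.
RpptDesc spectDesc;
spectDesc.numDims = 3;
spectDesc.offsetInBytes = 0;
spectDesc.dataType = RpptDataType::F32;
spectDesc.layout = RpptLayout::NFT;        // BatchSize-Frequency-Time
spectDesc.n = 1;                           // batch size
spectDesc.h = 257;                         // frequency bins = nfft / 2 + 1 for nfft = 512
spectDesc.w = 100;                         // time frames
spectDesc.strides.wStride = 1;
spectDesc.strides.hStride = spectDesc.w;
spectDesc.strides.nStride = spectDesc.h * spectDesc.w;
```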
@@ -425,6 +428,15 @@ typedef enum
REFLECT
} RpptAudioBorderType;
+/*! \brief RPPT Mel Scale Formula
+ * \ingroup group_rppdefs
+ */
+typedef enum
+{
+    SLANEY = 0,  // Follows Slaney's MATLAB Auditory Modelling Work behavior
+    HTK,         // Follows O'Shaughnessy's book formula, consistent with the Hidden Markov Toolkit (HTK), m = 2595 * log10(1 + (f/700))
+} RpptMelScaleFormula;
+
/*! \brief RPPT Tensor 2D ROI LTRB struct
* \ingroup group_rppdefs
*/
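For reference, the two conventions behind `RpptMelScaleFormula` differ only in how Hz maps to mel. A small self-contained sketch of both mappings, matching the HTK formula quoted in the enum comment and the Slaney break-point constants used in the new Mel Filter Bank kernel:

```cpp
#include <cmath>
#include <cstdio>

// Hz -> mel using the HTK formula quoted above: m = 2595 * log10(1 + f / 700).
// 1127 * ln(1 + f/700) is the same curve, since 2595 / ln(10) ~= 1127.
static float hzToMelHtk(float hz) { return 2595.0f * std::log10(1.0f + hz / 700.0f); }

// Hz -> mel using the Slaney convention: linear below 1 kHz, logarithmic above.
static float hzToMelSlaney(float hz)
{
    const float fsp = 200.0f / 3.0f;               // mels per Hz in the linear region
    const float minLogHz = 1000.0f;                // break point between the two regions
    const float minLogMel = minLogHz / fsp;        // 15 mel at 1 kHz
    const float stepLog = std::log(6.4f) / 27.0f;  // log-region step
    return (hz < minLogHz) ? hz / fsp
                           : minLogMel + std::log(hz / minLogHz) / stepLog;
}

int main()
{
    printf("1 kHz -> %.1f mel (HTK), %.1f mel (Slaney)\n",
           hzToMelHtk(1000.0f), hzToMelSlaney(1000.0f));   // ~1000 mel vs 15 mel
    return 0;
}
```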
@@ -1024,6 +1036,7 @@ typedef struct
Rpp64u* dstBatchIndex;
Rpp32u* inc;
Rpp32u* dstInc;
+ hipMemRpp32u scratchBuf;
} memGPU;
/*! \brief RPP HIP-HOST memory management
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 1941df0da..31f3e95ef 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -60,7 +60,7 @@ extern "C" {
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32f *detectedIndexTensor, Rpp32f *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
/*! \brief To Decibels augmentation on HOST backend
* \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values
@@ -110,6 +110,47 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
*/
RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle);
+/*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend
+ * \details Spectrogram for 1D audio buffer
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT / NTF)
+ * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
+ * \param [in] centerWindows indicates whether extracted windows should be padded so that the window function is centered at multiples of windowStep
+ * \param [in] reflectPadding indicates the padding policy when sampling outside the bounds of the signal
+ * \param [in] windowFunction samples of the window function that will be multiplied to each extracted window when calculating the Short Time Fourier Transform (STFT)
+ * \param [in] nfft size of the FFT
+ * \param [in] power exponent of the magnitude of the spectrum
+ * \param [in] windowLength window size in number of samples
+ * \param [in] windowStep step between the STFT windows in number of samples
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_spectrogram_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcLengthTensor, bool centerWindows, bool reflectPadding, Rpp32f *windowFunction, Rpp32s nfft, Rpp32s power, Rpp32s windowLength, Rpp32s windowStep, rppHandle_t rppHandle);
+
+/*! \brief Mel filter bank augmentation HOST backend
+ * \details Mel filter bank augmentation for audio data
+ * \param[in] srcPtr source tensor in HOST memory
+ * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT)
+ * \param[out] dstPtr destination tensor in HOST memory
+ * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32, layout - NFT)
+ * \param[in] srcDimsTensor source tensor dimensions per batch element - number of frequency bins and number of frames (1D tensor in HOST memory, of size batchSize * 2)
+ * \param[in] maxFreq maximum frequency; if not provided, maxFreq = sampleRate / 2
+ * \param[in] minFreq minimum frequency
+ * \param[in] melFormula formula used to convert frequencies from hertz to mel and from mel to hertz (SLANEY / HTK)
+ * \param[in] numFilter number of mel filters
+ * \param[in] sampleRate sampling rate of the audio
+ * \param[in] normalize boolean flag that determines whether to normalize the filter weights
+ * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, Rpp32f maxFreq, Rpp32f minFreq, RpptMelScaleFormula melFormula, Rpp32s numFilter, Rpp32f sampleRate, bool normalize, rppHandle_t rppHandle);
+
/*! \brief Resample augmentation on HOST backend
* \details Resample augmentation for audio data
* \param[in] srcPtr source tensor in HOST memory
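A hedged sketch of how the two new HOST audio ops chain together is shown below; all buffers, descriptors, and the handle are assumed to be prepared by the caller, and the numeric settings (16 kHz clip, 512-point FFT, 80 mel filters) are illustrative only.

```cpp
#include "rpp.h"   // umbrella RPP header (assumed include path)

// Hedged sketch: 1D audio -> spectrogram -> mel filter bank on HOST.
void spectrogramThenMelFilterBank(RppPtr_t audio, RpptDescPtr audioDesc,
                                  RppPtr_t spect, RpptDescPtr spectDesc,
                                  RppPtr_t mel,   RpptDescPtr melDesc,
                                  Rpp32f *hannWindow,      // windowLength samples, filled by caller
                                  rppHandle_t handle)
{
    Rpp32s srcLengthTensor[1] = {16000};                   // one 1-second clip at 16 kHz

    // 1D audio -> power spectrogram in frequency-major (NFT) layout
    rppt_spectrogram_host(audio, audioDesc, spect, spectDesc, srcLengthTensor,
                          true /*centerWindows*/, true /*reflectPadding*/,
                          hannWindow, 512 /*nfft*/, 2 /*power*/,
                          320 /*windowLength*/, 160 /*windowStep*/, handle);

    // Per batch element: {number of frequency bins = nfft/2 + 1, number of frames}
    Rpp32s srcDimsTensor[2] = {257, 101};
    rppt_mel_filter_bank_host(spect, spectDesc, mel, melDesc, srcDimsTensor,
                              8000.0f /*maxFreq*/, 0.0f /*minFreq*/, SLANEY,
                              80 /*numFilter*/, 16000.0f /*sampleRate*/,
                              true /*normalize*/, handle);
}
```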
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h
index 695c3252d..8e846a41b 100644
--- a/include/rppt_tensor_geometric_augmentations.h
+++ b/include/rppt_tensor_geometric_augmentations.h
@@ -448,38 +448,42 @@ RppStatus rppt_phase_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc
* \details This function performs slice augmentation on a generic 4D tensor.
* Slice augmentation involves selecting a region of interest (ROI) from the source tensor
* and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes.
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor roi data in HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
*/
-RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Slice augmentation GPU
* \details This function performs slice augmentation on a generic 4D tensor.
* Slice augmentation involves selecting a region of interest (ROI) from the source tensor
* and copying it to the destination tensor. Support added for f32 -> f32 and u8 -> u8 dataypes.
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] srcPtr source tensor memory in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] anchorTensor starting index of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] shapeTensor length of the slice for each dimension in input (1D tensor in pinned/HOST memory of size = batchSize * numberOfDimensions)
+ * \param [in] fillValue fill value that is used to fill output if enablePadding is set to true
+ * \param [in] enablePadding boolean flag to specify if padding is enabled or not
+ * \param [in] roiTensor roi data in pinned/HOST memory (1D tensor of size = batchSize * numberOfDimensions * 2)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
- * \ingroup group_tensor_geometric
*/
-RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32s *anchorTensor, Rpp32s *shapeTensor, RppPtr_t fillValue, bool enablePadding, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
/*! \brief Crop and Patch augmentation on HOST backend for a NCHW/NHWC layout tensor
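Since the slice prototypes above replace the ROI-struct interface with flat per-dimension tensors, a short sketch of the expected layout may help; the ordering inside `roiTensor` (begin values for every dimension followed by lengths) is an assumption inferred from the `batchSize * numberOfDimensions * 2` sizing stated in the parameter description.

```cpp
#include "rpp.h"   // umbrella RPP header (assumed include path)

// Hedged sketch: slicing a batch of one 3D input of shape {4, 150, 150}.
void sliceOneVolume(RppPtr_t srcPtr, RpptGenericDescPtr srcDesc,
                    RppPtr_t dstPtr, RpptGenericDescPtr dstDesc,
                    rppHandle_t handle)
{
    Rpp32u roiTensor[6]    = {0, 0, 0,   4, 150, 150};   // full extent of the input (assumed layout)
    Rpp32s anchorTensor[3] = {1, 10, 10};                 // slice start per dimension
    Rpp32s shapeTensor[3]  = {2, 100, 100};               // slice length per dimension
    Rpp32f fillValue = 0.0f;                              // used only when enablePadding is true

    rppt_slice_host(srcPtr, srcDesc, dstPtr, dstDesc,
                    anchorTensor, shapeTensor, &fillValue,
                    true /*enablePadding*/, roiTensor, handle);
}
```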
@@ -539,15 +543,15 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
Support added for f32 -> f32 and u8 -> u8 dataypes.
* \image html input150x150x4.gif Sample Input
* \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
* \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
@@ -562,15 +566,15 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
Support added for f32 -> f32 and u8 -> u8 dataypes.
* \image html input150x150x4.gif Sample Input
* \image html geometric_augmentations_flip_150x150x4.gif Sample Output
- * \param[in] srcPtr source tensor in HIP memory
- * \param[in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
- * \param[out] dstPtr destination tensor in HIP memory
- * \param[in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
* \param [in] depthTensor depth flag values to set depth flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with depthTensor[i] = 0/1)
- * \param[in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
- * \param[in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
+ * \param [in] roiGenericPtrSrc ROI data for each image in source tensor (tensor of batchSize RpptRoiGeneric values)
+ * \param [in] roiType ROI type used (RpptRoi3DType::XYZWHD or RpptRoi3DType::LTFRBB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 89418be78..7026b13b4 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -49,7 +49,7 @@ extern "C" {
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HOST memory
- * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4)
+ * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
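In practice the length restriction above reduces to a single expression; a minimal sketch, assuming the source descriptor is already filled:

```cpp
#include "rppdefs.h"

// Minimum destination-array length for the per-image reductions: 1-channel inputs
// produce one value per image; 3-channel inputs produce R, G, B and an overall
// value per image, hence the factor of 4.
inline Rpp32u reductionArrLength(RpptDescPtr srcDescPtr)
{
    return (srcDescPtr->c == 3) ? srcDescPtr->n * 4 : srcDescPtr->n;
}
```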
@@ -67,7 +67,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HIP memory
- * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength = srcDescPtr->n * 4)
+ * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
@@ -85,7 +85,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HOST memory
- * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength = srcDescPtr->n * 4)
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
@@ -103,7 +103,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HIP memory
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
@@ -121,7 +121,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HOST memory
- * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4)
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
@@ -139,10 +139,10 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HIP memory
- * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength = srcDescPtr->n * 4)
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
* \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
- * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
@@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Normalize Generic augmentation on HOST backend
+ * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also supports a toggle variant (NHWC->NCHW) for 3D inputs.
+ * \param [in] srcPtr source tensor memory in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] axisMask bitmask of the axes along which normalization is performed
+ * \param [in] meanTensor values to be subtracted from input
+ * \param [in] stdDevTensor standard deviation values to scale the input
+ * \param [in] computeMeanStddev flag selecting internal computation of mean and stddev (bit 0 = compute mean internally, bit 1 = compute stddev internally; 0 = both provided externally)
+ * \param [in] scale value to be multiplied with data after subtracting from mean
+ * \param [in] shift value to be added finally
+ * \param [in] roiTensor values to represent dimensions of input tensor
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Normalize Generic augmentation on HIP backend
+ * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor.
+ * \param [in] srcPtr source tensor memory in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] axisMask bitmask of the axes along which normalization is performed
+ * \param [in] meanTensor values to be subtracted from input
+ * \param [in] stdDevTensor standard deviation values to scale the input
+ * \param [in] computeMeanStddev flag selecting internal computation of mean and stddev (bit 0 = compute mean internally, bit 1 = compute stddev internally; 0 = both provided externally)
+ * \param [in] scale value to be multiplied with data after subtracting from mean
+ * \param [in] shift value to be added finally
+ * \param [in] roiTensor values to represent dimensions of input tensor
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief Tensor mean operation on HOST backend for a NCHW/NHWC layout tensor
* \details The tensor mean is a reduction operation that finds the channel-wise (R mean / G mean / B mean) and total mean for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
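A hedged sketch of the flag encodings accepted by the new normalize API follows; the interpretation of `axisMask` as a bitmask of reduced axes is an assumption based on its name and the generic ND descriptor, and all buffers are assumed to be prepared by the caller.

```cpp
#include "rpp.h"   // umbrella RPP header (assumed include path)

// Hedged sketch: normalize an ND tensor on HOST with internally computed statistics.
void normalizePerChannel(RppPtr_t srcPtr, RpptGenericDescPtr srcDesc,
                         RppPtr_t dstPtr, RpptGenericDescPtr dstDesc,
                         Rpp32f *meanTensor, Rpp32f *stdDevTensor,
                         Rpp32u *roiTensor, rppHandle_t handle)
{
    Rpp8u computeMeanStddev = 3;              // 0b11: compute both mean and stddev internally
    Rpp32u axisMask = (1u << 0) | (1u << 1);  // assumed: reduce over the first two axes of a 3D HWC tensor

    rppt_normalize_host(srcPtr, srcDesc, dstPtr, dstDesc, axisMask,
                        meanTensor, stdDevTensor, computeMeanStddev,
                        1.0f /*scale*/, 0.0f /*shift*/, roiTensor, handle);
}
```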
@@ -229,4 +272,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
diff --git a/include/third_party/ffts/ffts.h b/include/third_party/ffts/ffts.h
new file mode 100644
index 000000000..cc85a885b
--- /dev/null
+++ b/include/third_party/ffts/ffts.h
@@ -0,0 +1,110 @@
+/*
+
+ This file is part of FFTS.
+
+ Copyright (c) 2012, Anthony M. Blake
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_H
+#define FFTS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if (defined(_WIN32) || defined(WIN32)) && defined(FFTS_SHARED)
+# ifdef FFTS_BUILD
+# define FFTS_API __declspec(dllexport)
+# else
+# define FFTS_API __declspec(dllimport)
+# endif
+#else
+# if (__GNUC__ >= 4) || defined(HAVE_GCC_VISIBILITY)
+# define FFTS_API __attribute__ ((visibility("default")))
+# else
+# define FFTS_API
+# endif
+#endif
+
+/* The direction of the transform
+ (i.e, the sign of the exponent in the transform.)
+*/
+#define FFTS_FORWARD (-1)
+#define FFTS_BACKWARD (+1)
+
+struct _ffts_plan_t;
+typedef struct _ffts_plan_t ffts_plan_t;
+
+/* Complex data is stored in the interleaved format
+ (i.e, the real and imaginary parts composing each
+ element of complex data are stored adjacently in memory)
+
+ The multi-dimensional arrays passed are expected to be
+ stored as a single contiguous block in row-major order
+*/
+FFTS_API ffts_plan_t*
+ffts_init_1d(size_t N, int sign);
+
+FFTS_API ffts_plan_t*
+ffts_init_2d(size_t N1, size_t N2, int sign);
+
+FFTS_API ffts_plan_t*
+ffts_init_nd(int rank, size_t *Ns, int sign);
+
+/* For real transforms, sign == FFTS_FORWARD implies a real-to-complex
+ forwards tranform, and sign == FFTS_BACKWARD implies a complex-to-real
+ backwards transform.
+
+ The output of a real-to-complex transform is N/2+1 complex numbers,
+ where the redundant outputs have been omitted.
+*/
+FFTS_API ffts_plan_t*
+ffts_init_1d_real(size_t N, int sign);
+
+FFTS_API ffts_plan_t*
+ffts_init_2d_real(size_t N1, size_t N2, int sign);
+
+FFTS_API ffts_plan_t*
+ffts_init_nd_real(int rank, size_t *Ns, int sign);
+
+FFTS_API void
+ffts_execute(ffts_plan_t *p, const void *input, void *output);
+
+FFTS_API void
+ffts_free(ffts_plan_t *p);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FFTS_H */
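A minimal usage sketch of the real-to-complex API declared above, as a spectrogram-style transform would use it; the include path and the 32-byte alignment requirement are assumptions, and N is taken to be a power of two.

```cpp
#include <cstdlib>
#include "third_party/ffts/ffts.h"   // path relative to this repo's include dir (assumption)

// Hedged sketch: N real samples in, N/2 + 1 interleaved complex bins out.
// The input buffer is assumed SIMD-aligned, which FFTS generally expects.
float* forwardRealFft(const float *alignedInput, size_t n)
{
    size_t bytes = sizeof(float) * (n / 2 + 1) * 2;         // interleaved re/im pairs
    bytes = (bytes + 31) & ~size_t(31);                     // round up for aligned_alloc
    float *out = static_cast<float *>(std::aligned_alloc(32, bytes));
    if (!out)
        return nullptr;

    ffts_plan_t *plan = ffts_init_1d_real(n, FFTS_FORWARD);
    if (!plan)
    {
        std::free(out);
        return nullptr;
    }
    ffts_execute(plan, alignedInput, out);
    ffts_free(plan);
    return out;                                             // caller releases with std::free()
}
```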
diff --git a/include/third_party/ffts/ffts_attributes.h b/include/third_party/ffts/ffts_attributes.h
new file mode 100644
index 000000000..bdfd6162f
--- /dev/null
+++ b/include/third_party/ffts/ffts_attributes.h
@@ -0,0 +1,111 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_ATTRIBUTES_H
+#define FFTS_ATTRIBUTES_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+/* Macro definitions for various function/variable attributes */
+#ifdef __GNUC__
+#define GCC_VERSION_AT_LEAST(x,y) \
+ (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y)
+#else
+#define GCC_VERSION_AT_LEAST(x,y) 0
+#endif
+
+#ifdef __GNUC__
+#define FFTS_ALIGN(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#define FFTS_ALIGN(x) __declspec(align(x))
+#else
+#define FFTS_ALIGN(x)
+#endif
+
+#if GCC_VERSION_AT_LEAST(3,1)
+#define FFTS_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define FFTS_ALWAYS_INLINE __forceinline
+#else
+#define FFTS_ALWAYS_INLINE inline
+#endif
+
+#if defined(_MSC_VER)
+#define FFTS_INLINE __inline
+#else
+#define FFTS_INLINE inline
+#endif
+
+#if defined(__GNUC__)
+#define FFTS_RESTRICT __restrict
+#elif defined(_MSC_VER)
+#define FFTS_RESTRICT __restrict
+#else
+#define FFTS_RESTRICT
+#endif
+
+#if GCC_VERSION_AT_LEAST(4,5)
+#define FFTS_ASSUME(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
+#elif defined(_MSC_VER)
+#define FFTS_ASSUME(cond) __assume(cond)
+#else
+#define FFTS_ASSUME(cond)
+#endif
+
+#if GCC_VERSION_AT_LEAST(4,7)
+#define FFTS_ASSUME_ALIGNED_16(x) __builtin_assume_aligned(x, 16)
+#else
+#define FFTS_ASSUME_ALIGNED_16(x) x
+#endif
+
+#if GCC_VERSION_AT_LEAST(4,7)
+#define FFTS_ASSUME_ALIGNED_32(x) __builtin_assume_aligned(x, 32)
+#else
+#define FFTS_ASSUME_ALIGNED_32(x) x
+#endif
+
+#if defined(__GNUC__)
+#define FFTS_LIKELY(cond) __builtin_expect(!!(cond), 1)
+#else
+#define FFTS_LIKELY(cond) cond
+#endif
+
+#if defined(__GNUC__)
+#define FFTS_UNLIKELY(cond) __builtin_expect(!!(cond), 0)
+#else
+#define FFTS_UNLIKELY(cond) cond
+#endif
+
+#endif /* FFTS_ATTRIBUTES_H */
diff --git a/libs/third_party/ffts/libffts.a b/libs/third_party/ffts/libffts.a
new file mode 100644
index 000000000..b30b11471
Binary files /dev/null and b/libs/third_party/ffts/libffts.a differ
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index a6d58d5a2..5f8048a19 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6123,41 +6123,41 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2
inline void compute_variance_8_host(__m256d *p1, __m256d *pMean, __m256d *pVar)
{
__m256d pSub = _mm256_sub_pd(p1[0], pMean[0]);
- pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]);
+ pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]);
pSub = _mm256_sub_pd(p1[1], pMean[0]);
- pVar[0] = _mm256_fmadd_pd(pSub, pSub, pVar[0]);
+ pVar[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVar[0]);
}
-inline void compute_varianceChannel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
+inline void compute_variance_channel_pln3_24_host(__m256d *p1, __m256d *pMeanR, __m256d *pMeanG, __m256d *pMeanB, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
{
__m256d pSub = _mm256_sub_pd(p1[0], pMeanR[0]);
- pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
pSub = _mm256_sub_pd(p1[1], pMeanR[0]);
- pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
pSub = _mm256_sub_pd(p1[2], pMeanG[0]);
- pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
pSub = _mm256_sub_pd(p1[3], pMeanG[0]);
- pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
pSub = _mm256_sub_pd(p1[4], pMeanB[0]);
- pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
pSub = _mm256_sub_pd(p1[5], pMeanB[0]);
- pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
}
-inline void compute_varianceImage_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
+inline void compute_variance_image_pln3_24_host(__m256d *p1, __m256d *pMean, __m256d *pVarR, __m256d *pVarG, __m256d *pVarB)
{
__m256d pSub = _mm256_sub_pd(p1[0], pMean[0]);
- pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
pSub = _mm256_sub_pd(p1[1], pMean[0]);
- pVarR[0] = _mm256_fmadd_pd(pSub, pSub, pVarR[0]);
+ pVarR[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarR[0]);
pSub = _mm256_sub_pd(p1[2], pMean[0]);
- pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
pSub = _mm256_sub_pd(pMean[0], p1[3]);
- pVarG[0] = _mm256_fmadd_pd(pSub, pSub, pVarG[0]);
+ pVarG[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarG[0]);
pSub = _mm256_sub_pd(p1[4], pMean[0]);
- pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
pSub = _mm256_sub_pd(p1[5], pMean[0]);
- pVarB[0] = _mm256_fmadd_pd(pSub, pSub, pVarB[0]);
+ pVarB[0] = _mm256_add_pd(_mm256_mul_pd(pSub, pSub), pVarB[0]);
}
inline void compute_vignette_48_host(__m256 *p, __m256 &pMultiplier, __m256 &pILocComponent, __m256 &pJLocComponent)
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index d47193831..fd9373669 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -2611,6 +2611,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x)
return _mm_cvtss_f32(sum);
}
+/* Computes inverse square root */
+inline Rpp32f rpp_rsqrt_ps(Rpp32f x)
+{
+ __m128 X = _mm_set_ss(x);
+ __m128 tmp = _mm_rsqrt_ss(X);
+ Rpp32f y = _mm_cvtss_f32(tmp);
+ return y * (1.5f - x * 0.5f * y * y);
+}
+
+/* Compute inverse square root */
+/* SSE version matches the plain C version to 6 decimal places, due to the Newton-Raphson approximation */
+inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul)
+{
+ Rpp64s i = 0;
+ __m128 rdivx4 = _mm_set1_ps(rdiv);
+ __m128 mulx4 = _mm_set1_ps(mul * 0.5f);
+ if (eps) // epsilon is present - no need for masking, but we need to add it
+ {
+ __m128 epsx4 = _mm_set1_ps(eps);
+ for (; i + 4 <= numElements; i += 4)
+ {
+ __m128 x = _mm_loadu_ps(&input[i]);
+ x = _mm_mul_ps(x, rdivx4);
+ x = _mm_add_ps(x, epsx4);
+ __m128 y = _mm_rsqrt_ps(x);
+ __m128 y2 = _mm_mul_ps(y, y);
+ __m128 xy2 = _mm_mul_ps(x, y2);
+ __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2);
+ y = _mm_mul_ps(y, three_minus_xy2);
+ y = _mm_mul_ps(y, mulx4);
+ _mm_storeu_ps(&input[i], y);
+ }
+ }
+ else
+ {
+ for (; i + 4 <= numElements; i += 4)
+ {
+ __m128 x = _mm_loadu_ps(&input[i]);
+ x = _mm_mul_ps(x, rdivx4);
+ __m128 mask = _mm_cmpneq_ps(x, xmm_p0);
+ __m128 y = _mm_rsqrt_ps(x);
+ y = _mm_and_ps(y, mask);
+ __m128 y2 = _mm_mul_ps(y, y);
+ __m128 xy2 = _mm_mul_ps(x, y2);
+ __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2);
+ y = _mm_mul_ps(y, three_minus_xy2);
+ y = _mm_mul_ps(y, mulx4);
+ _mm_storeu_ps(&input[i], y);
+ }
+ }
+ if (eps)
+ {
+ for (; i < numElements; i++)
+ input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul;
+ }
+ else
+ {
+ for (; i < numElements; i++)
+ {
+ Rpp32f x = input[i] * rdiv;
+ input[i] = x ? rpp_rsqrt_ps(x) * mul : 0;
+ }
+ }
+}
+
+/* Compute inverse square root */
+/* AVX2 version matches the plain C version to 6 decimal places, due to the Newton-Raphson approximation */
+inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale)
+{
+ Rpp32s i = 0;
+ __m256 rdivx8 = _mm256_set1_ps(rdiv);
+ __m256 mulx8 = _mm256_set1_ps(scale * 0.5f);
+ if (eps) // epsilon is present - no need for masking, but we need to add it
+ {
+ __m256 epsx8 = _mm256_set1_ps(eps);
+ for (; i + 8 <= numElements; i += 8)
+ {
+ __m256 x = _mm256_loadu_ps(&input[i]);
+ x = _mm256_mul_ps(x, rdivx8);
+ x = _mm256_add_ps(x, epsx8);
+ __m256 y = _mm256_rsqrt_ps(x);
+ __m256 y2 = _mm256_mul_ps(y, y);
+ __m256 xy2 = _mm256_mul_ps(x, y2);
+ __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2);
+ y = _mm256_mul_ps(y, three_minus_xy2);
+ y = _mm256_mul_ps(y, mulx8);
+ _mm256_storeu_ps(&input[i], y);
+ }
+ }
+ else
+ {
+ for (; i + 8 <= numElements; i += 8)
+ {
+ __m256 x = _mm256_loadu_ps(&input[i]);
+ x = _mm256_mul_ps(x, rdivx8);
+ __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ);
+ __m256 y = _mm256_rsqrt_ps(x);
+ y = _mm256_and_ps(y, mask);
+ __m256 y2 = _mm256_mul_ps(y, y);
+ __m256 xy2 = _mm256_mul_ps(x, y2);
+ __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2);
+ y = _mm256_mul_ps(y, three_minus_xy2);
+ y = _mm256_mul_ps(y, mulx8);
+ _mm256_storeu_ps(&input[i], y);
+ }
+ }
+ if (eps)
+ {
+ for (; i < numElements; i++)
+ input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale;
+ }
+ else
+ {
+ for (; i < numElements; i++)
+ {
+ Rpp32f x = input[i] * rdiv;
+ input[i] = x ? rpp_rsqrt_ps(x) * scale : 0;
+ }
+ }
+}
+
static inline void fast_matmul4x4_sse(float *A, float *B, float *C)
{
__m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B
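Both the scalar `rpp_rsqrt_ps` and the SSE/AVX loops above refine the hardware reciprocal-square-root estimate with one Newton-Raphson step, y1 = y0 * (1.5 - 0.5 * x * y0 * y0), with the 0.5 factor pre-folded into `mulx4` / `mulx8`. A tiny standalone check of that step:

```cpp
#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

// One Newton-Raphson step for y ~= 1/sqrt(x): y1 = y0 * (1.5 - 0.5 * x * y0 * y0).
int main()
{
    float x = 2.0f;
    float y0 = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));   // ~12-bit hardware estimate
    float y1 = y0 * (1.5f - 0.5f * x * y0 * y0);             // ~24-bit after one refinement
    printf("estimate %.8f refined %.8f exact %.8f\n", y0, y1, 1.0f / std::sqrt(x));
    return 0;
}
```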
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
index 1f47e8ee7..4338483a9 100644
--- a/src/modules/CMakeLists.txt
+++ b/src/modules/CMakeLists.txt
@@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP")
set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}")
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
# Add HIP specific preprocessor flags
add_definitions(-DHIP_COMPILE)
@@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP})
message("-- ${Green}OpenCL kernels added!${ColourReset}")
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
# Add OpenCL specific preprocessor flags
add_definitions(-DOCL_COMPILE)
@@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU")
# Add CPU specific includes
set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/)
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
endif()
message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}")
add_compile_options("-Wno-unused-result")
diff --git a/src/modules/cpu/host_tensor_audio_augmentations.hpp b/src/modules/cpu/host_tensor_audio_augmentations.hpp
index 0c6ccf211..82d43d082 100644
--- a/src/modules/cpu/host_tensor_audio_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_audio_augmentations.hpp
@@ -29,6 +29,8 @@ SOFTWARE.
#include "kernel/to_decibels.hpp"
#include "kernel/pre_emphasis_filter.hpp"
#include "kernel/down_mixing.hpp"
+#include "kernel/spectrogram.hpp"
+#include "kernel/mel_filter_bank.hpp"
#include "kernel/resample.hpp"
#endif // HOST_TENSOR_AUDIO_AUGMENTATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp
index 15a4245c5..0a7e8ff51 100644
--- a/src/modules/cpu/host_tensor_statistical_operations.hpp
+++ b/src/modules/cpu/host_tensor_statistical_operations.hpp
@@ -30,5 +30,6 @@ SOFTWARE.
#include "kernel/tensor_max.hpp"
#include "kernel/tensor_mean.hpp"
#include "kernel/tensor_stddev.hpp"
+#include "kernel/normalize.hpp"
-#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
+#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/cpu/kernel/mel_filter_bank.hpp b/src/modules/cpu/kernel/mel_filter_bank.hpp
new file mode 100644
index 000000000..9cc6d26d2
--- /dev/null
+++ b/src/modules/cpu/kernel/mel_filter_bank.hpp
@@ -0,0 +1,252 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+struct BaseMelScale
+{
+ public:
+ virtual Rpp32f hz_to_mel(Rpp32f hz) = 0;
+ virtual Rpp32f mel_to_hz(Rpp32f mel) = 0;
+ virtual ~BaseMelScale() = default;
+};
+
+struct HtkMelScale : public BaseMelScale
+{
+ Rpp32f hz_to_mel(Rpp32f hz) { return 1127.0f * std::log(1.0f + (hz / 700.0f)); }
+ Rpp32f mel_to_hz(Rpp32f mel) { return 700.0f * (std::exp(mel / 1127.0f) - 1.0f); }
+ public:
+ ~HtkMelScale() {};
+};
+
+struct SlaneyMelScale : public BaseMelScale
+{
+ const Rpp32f freqLow = 0;
+ const Rpp32f fsp = 200.0 / 3.0;
+ const Rpp32f minLogHz = 1000.0;
+ const Rpp32f minLogMel = (minLogHz - freqLow) / fsp;
+ const Rpp32f stepLog = 0.068751777; // Equivalent to std::log(6.4) / 27.0;
+
+ const Rpp32f invMinLogHz = 1.0f / 1000.0;
+ const Rpp32f invStepLog = 1.0f / stepLog;
+ const Rpp32f invFsp = 1.0f / fsp;
+
+ Rpp32f hz_to_mel(Rpp32f hz)
+ {
+ Rpp32f mel = 0.0f;
+ if (hz >= minLogHz)
+ mel = minLogMel + std::log(hz * invMinLogHz) * invStepLog;
+ else
+ mel = (hz - freqLow) * invFsp;
+
+ return mel;
+ }
+
+ Rpp32f mel_to_hz(Rpp32f mel)
+ {
+ Rpp32f hz = 0.0f;
+ if (mel >= minLogMel)
+ hz = minLogHz * std::exp(stepLog * (mel - minLogMel));
+ else
+ hz = freqLow + mel * fsp;
+ return hz;
+ }
+ public:
+ ~SlaneyMelScale() {};
+};
+
+RppStatus mel_filter_bank_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcDimsTensor,
+                                      Rpp32f maxFreqVal,    // currently unused - maxFreq is always derived as sampleRate / 2 below
+ Rpp32f minFreqVal,
+ RpptMelScaleFormula melFormula,
+ Rpp32s numFilter,
+ Rpp32f sampleRate,
+ bool normalize,
+ rpp::Handle& handle)
+{
+ BaseMelScale *melScalePtr;
+ switch(melFormula)
+ {
+ case RpptMelScaleFormula::HTK:
+ melScalePtr = new HtkMelScale;
+ break;
+ case RpptMelScaleFormula::SLANEY:
+ default:
+ melScalePtr = new SlaneyMelScale();
+ break;
+ }
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u batchSize = srcDescPtr->n;
+ Rpp32f *scratchMem = handle.GetInitHandle()->mem.mcpu.scratchBufferHost;
+
+ Rpp32f maxFreq = sampleRate / 2;
+ Rpp32f minFreq = minFreqVal;
+
+ // Convert lower, higher frequencies to mel scale and find melStep
+ Rpp64f melLow = melScalePtr->hz_to_mel(minFreq);
+ Rpp64f melHigh = melScalePtr->hz_to_mel(maxFreq);
+ Rpp64f melStep = (melHigh - melLow) / (numFilter + 1);
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ // Extract nfft, number of Frames, numBins
+ Rpp32s nfft = (srcDimsTensor[batchCount * 2] - 1) * 2;
+ Rpp32s numBins = nfft / 2 + 1;
+ Rpp32s numFrames = srcDimsTensor[batchCount * 2 + 1];
+
+ // Find hzStep
+        Rpp64f hzStep = static_cast<Rpp64f>(sampleRate) / nfft;
+ Rpp64f invHzStep = 1.0 / hzStep;
+
+ // Find fftBinStart and fftBinEnd
+ Rpp32s fftBinStart = std::ceil(minFreq * invHzStep);
+ Rpp32s fftBinEnd = std::ceil(maxFreq * invHzStep);
+ fftBinEnd = std::min(fftBinEnd, numBins);
+
+ // Set/Fill normFactors, weightsDown and intervals
+ Rpp32f *normFactors = scratchMem + (batchCount * numFilter);
+ std::fill(normFactors, normFactors + numFilter, 1.f); // normFactors contain numFilter values of type float
+ Rpp32f *weightsDown = scratchMem + (batchSize * numFilter) + (batchCount * numBins);
+        memset(weightsDown, 0, numBins * sizeof(Rpp32f));    // weightsDown contains numBins float values
+        Rpp32s *intervals = reinterpret_cast<Rpp32s *>(weightsDown + (batchSize * numBins));
+ std::fill(intervals, intervals + numBins, -1); // intervals contain numBins values of type integer
+
+ Rpp32s fftBin = fftBinStart;
+ Rpp64f mel0 = melLow, mel1 = melLow + melStep;
+ Rpp64f fIter = fftBin * hzStep;
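+        // Build the triangular filters: for each mel interval [mel0, mel1), weightsDown stores the falling-edge weight (f1 - fIter) / (f1 - f0) for every FFT bin whose frequency falls in the interval, and intervals stores that bin's interval index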
+ for (int interval = 0; interval < numFilter + 1; interval++, mel0 = mel1, mel1 += melStep)
+ {
+ Rpp64f f0 = melScalePtr->mel_to_hz(mel0);
+ Rpp64f f1 = melScalePtr->mel_to_hz(interval == numFilter ? melHigh : mel1);
+ Rpp64f slope = 1. / (f1 - f0);
+
+ if (normalize && interval < numFilter)
+ {
+ Rpp64f f2 = melScalePtr->mel_to_hz(mel1 + melStep);
+ normFactors[interval] = 2.0 / (f2 - f0);
+ }
+
+ for (; fftBin < fftBinEnd && fIter < f1; fftBin++, fIter = fftBin * hzStep)
+ {
+ weightsDown[fftBin] = (f1 - fIter) * slope;
+ intervals[fftBin] = interval;
+ }
+ }
+
+        Rpp32u maxFrames = std::min(static_cast<Rpp32u>(numFrames + 8), dstDescPtr->strides.hStride);
+ Rpp32u maxAlignedLength = maxFrames & ~7;
+ Rpp32u vectorIncrement = 8;
+
+ // Set ROI values in dst buffer to 0.0
+ for(int i = 0; i < numFilter; i++)
+ {
+ Rpp32f *dstPtrRow = dstPtrTemp + i * dstDescPtr->strides.hStride;
+ Rpp32u vectorLoopCount = 0;
+ for(; vectorLoopCount < maxAlignedLength; vectorLoopCount += 8)
+ {
+ _mm256_storeu_ps(dstPtrRow, avx_p0);
+ dstPtrRow += 8;
+ }
+ for(; vectorLoopCount < maxFrames; vectorLoopCount++)
+ *dstPtrRow++ = 0.0f;
+ }
+
+ Rpp32u alignedLength = numFrames & ~7;
+ __m256 pSrc, pDst;
+ Rpp32f *srcRowPtr = srcPtrTemp + fftBinStart * srcDescPtr->strides.hStride;
+ for (int64_t fftBin = fftBinStart; fftBin < fftBinEnd; fftBin++)
+ {
+ auto filterUp = intervals[fftBin];
+ auto weightUp = 1.0f - weightsDown[fftBin];
+ auto filterDown = filterUp - 1;
+ auto weightDown = weightsDown[fftBin];
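+            // Each FFT bin contributes to two adjacent triangular filters: weightDown to the filter it is descending from (filterDown) and weightUp = 1 - weightDown to the filter it is rising into (filterUp)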
+
+ if (filterDown >= 0)
+ {
+ Rpp32f *dstRowPtrTemp = dstPtrTemp + filterDown * dstDescPtr->strides.hStride;
+ Rpp32f *srcRowPtrTemp = srcRowPtr;
+
+ if (normalize)
+ weightDown *= normFactors[filterDown];
+ __m256 pWeightDown = _mm256_set1_ps(weightDown);
+
+ int vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ pSrc = _mm256_loadu_ps(srcRowPtrTemp);
+ pSrc = _mm256_mul_ps(pSrc, pWeightDown);
+ pDst = _mm256_loadu_ps(dstRowPtrTemp);
+ pDst = _mm256_add_ps(pDst, pSrc);
+ _mm256_storeu_ps(dstRowPtrTemp, pDst);
+ dstRowPtrTemp += vectorIncrement;
+ srcRowPtrTemp += vectorIncrement;
+ }
+
+ for (; vectorLoopCount < numFrames; vectorLoopCount++)
+ (*dstRowPtrTemp++) += weightDown * (*srcRowPtrTemp++);
+ }
+
+ if (filterUp >= 0 && filterUp < numFilter)
+ {
+ Rpp32f *dstRowPtrTemp = dstPtrTemp + filterUp * dstDescPtr->strides.hStride;
+ Rpp32f *srcRowPtrTemp = srcRowPtr;
+
+ if (normalize)
+ weightUp *= normFactors[filterUp];
+ __m256 pWeightUp = _mm256_set1_ps(weightUp);
+
+ int vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ pSrc = _mm256_loadu_ps(srcRowPtrTemp);
+ pSrc = _mm256_mul_ps(pSrc, pWeightUp);
+ pDst = _mm256_loadu_ps(dstRowPtrTemp);
+ pDst = _mm256_add_ps(pDst, pSrc);
+ _mm256_storeu_ps(dstRowPtrTemp, pDst);
+ dstRowPtrTemp += vectorIncrement;
+ srcRowPtrTemp += vectorIncrement;
+ }
+
+ for (; vectorLoopCount < numFrames; vectorLoopCount++)
+ (*dstRowPtrTemp++) += weightUp * (*srcRowPtrTemp++);
+ }
+
+ srcRowPtr += srcDescPtr->strides.hStride;
+ }
+ }
+ delete melScalePtr;
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/non_silent_region_detection.hpp b/src/modules/cpu/kernel/non_silent_region_detection.hpp
index 74dffb18e..39d9e6940 100644
--- a/src/modules/cpu/kernel/non_silent_region_detection.hpp
+++ b/src/modules/cpu/kernel/non_silent_region_detection.hpp
@@ -95,8 +95,8 @@ Rpp32f getSquare(Rpp32f &value)
RppStatus non_silent_region_detection_host_tensor(Rpp32f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp32s *srcLengthTensor,
- Rpp32f *detectedIndexTensor,
- Rpp32f *detectionLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
Rpp32f cutOffDB,
Rpp32s windowLength,
Rpp32f referencePower,
diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp
new file mode 100644
index 000000000..dbe746d1a
--- /dev/null
+++ b/src/modules/cpu/kernel/normalize.hpp
@@ -0,0 +1,882 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+// Computes strides
+void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim)
+{
+ if (tensorDim > 0)
+ {
+ Rpp32u v = 1;
+ for (Rpp32u i = tensorDim - 1; i > 0; i--)
+ {
+ strides[i] = v;
+ v *= shape[i];
+ }
+ strides[0] = v;
+ }
+}
+
+// Recursive reduction helper function to compute difference of input with mean and squares them up
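+// Uses blocked pairwise accumulation (splitting into halves above 32 elements) to reduce floating-point rounding error compared to a single running sum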
+template <typename T>
+void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean)
+{
+ if (numElements > 32)
+ {
+ Rpp32s currElements = numElements >> 1;
+ Rpp32f tmp1 = 0, tmp2 = 0;
+
+ // reduce first half and accumulate
+ compute_diff_square_sum(tmp1, input, inputStride, currElements, mean);
+
+ // reduce second half and accumulate
+ compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean);
+
+ tmp1 += tmp2;
+ output += tmp1;
+ }
+ else
+ {
+ // reduce to a temporary
+ Rpp32f tmp = 0;
+ for (Rpp32s i = 0; i < numElements; i++)
+ {
+ Rpp32f curr = (input[i * inputStride] - mean);
+ auto curSq = curr * curr;
+ tmp += curSq;
+ }
+
+ // accumulate in target value
+ output += tmp;
+ }
+}
+
+// Recursive reduction helper function to sum up input values
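+// Uses the same blocked pairwise accumulation as compute_diff_square_sum for better floating-point accuracy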
+template <typename T>
+void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements)
+{
+ if (numElements > 32)
+ {
+ Rpp32s currElements = numElements >> 1;
+ Rpp32f tmp1 = 0, tmp2 = 0;
+
+ // reduce first half and accumulate
+ compute_sum(tmp1, input, inputStride, currElements);
+
+ // reduce second half and accumulate
+ compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements);
+
+ tmp1 += tmp2;
+ output += tmp1;
+ }
+ else
+ {
+ // reduce to a temporary
+ Rpp32f tmp = 0;
+ for (Rpp32s i = 0; i < numElements; i++)
+ tmp += input[i * inputStride];
+
+ // accumulate in target value
+ output += tmp;
+ }
+}
+
+// Computes mean for 2D inputs
+void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f normFactor = 1.0 / dims[1];
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ meanPtr[i] = 0;
+ compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]);
+ srcPtrTemp += stride[1];
+ meanPtr[i] *= normFactor;
+ }
+}
+
+// Computes inverse stddev for 2D inputs
+void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale)
+{
+
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ stdDevPtr[i] = 0;
+ compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]);
+ srcPtrTemp += stride[1];
+ }
+ rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale);
+}
+
+// Computes mean for 3D inputs
+void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ if(isConsecutive)
+ {
+ Rpp32f normFactor = 1.0 / dims[2];
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ float *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ Rpp32u index = i * dims[1] + j;
+ meanPtr[index] = 0;
+ compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]);
+ srcPtrRow += stride[1];
+ meanPtr[index] *= normFactor;
+ }
+ srcPtrTemp += stride[2];
+ }
+ }
+ else
+ {
+ Rpp32f normFactor = 1.0 / (dims[1] * dims[2]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ meanPtr[i] = 0;
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]);
+ srcPtrRow += stride[1];
+ }
+ meanPtr[i] *= normFactor;
+ srcPtrTemp += stride[2];
+ }
+ }
+}
+
+// Computes inverse stddev for 3D inputs
+void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ if(isConsecutive)
+ {
+ Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ float *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ Rpp32u index = i * dims[1] + j;
+ stdDevPtr[index] = 0;
+ compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]);
+ srcPtrRow += stride[1];
+ }
+ srcPtrTemp += stride[2];
+ }
+ rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale);
+ }
+ else
+ {
+ Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2]));
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ stdDevPtr[i] = 0;
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]);
+ srcPtrRow += stride[1];
+ }
+ srcPtrTemp += stride[2];
+ }
+ rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale);
+ }
+}
+
+// Computes mean for ND inputs
+template <typename T>
+void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls compute_sum when the last dimension is to be normalized
+ compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]);
+    else if(level == tensorDim) // Calls compute_sum when only the 1st axis needs to be normalized
+ compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]);
+ else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+ }
+    else if(axis[level] && (level == lastNormAxis)) // Increment level alone if this is the last axis to be normalized
+ compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ else if(axis[level]) // Called when axis at present level needs to be normalized
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ }
+}
+
+// Computes inverse stddev for ND inputs
+template <typename T>
+void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+    if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls compute_diff_square_sum when the last dimension is to be normalized
+ compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]);
+    else if(level == tensorDim) // Calls compute_diff_square_sum when only the 1st axis needs to be normalized
+ compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]);
+ else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store stddev
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+ }
+    else if(axis[level] && (level == lastNormAxis)) // Increment level alone if this is the last axis to be normalized
+ compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ else if(axis[level]) // Called when axis at present level needs to be normalized
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ }
+}
+
+// Computes normalize for 3D non toggle variants
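+// paramStride[d] = 1 advances the mean/multiplier index while traversing dimension d; 0 keeps it fixed, since that dimension was reduced and shares one value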
+void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length)
+{
+ Rpp32s paramIdx = 0;
+ Rpp32s idx1 = 0;
+
+ for(Rpp32u i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtr;
+ Rpp32f *dstPtrRow = dstPtr;
+ for(Rpp32u j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrRow;
+ Rpp32f *dstPtrRowTemp = dstPtrRow;
+ idx1 = paramIdx;
+ for(Rpp32u k = 0; k < length[2]; k++)
+ {
+ *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift;
+ if(k < length[2] - 1)
+ paramIdx += paramStride[2];
+ srcPtrRowTemp++;
+ dstPtrRowTemp++;
+ }
+ if(j < length[1] - 1)
+ paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ if(i < length[0] - 1)
+ paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0];
+ srcPtr += srcGenericDescPtr->strides[1];
+ dstPtr += dstGenericDescPtr->strides[1];
+ }
+}
+
+// Computes normalize for 3D toggle variants when axis mask is set to 3
+void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f *dstPtrTemp[length[2]];
+ dstPtrTemp[0] = dstPtr;
+ for(Rpp32u i = 1; i < length[2]; i++)
+ dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1];
+ Rpp32s paramIdx = 0;
+
+ for(Rpp32u i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ Rpp32f *dstPtrRow[length[2]];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRow[l] = dstPtrTemp[l];
+ for(Rpp32u j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrRow;
+ Rpp32f *dstPtrRowTemp[length[2]];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRowTemp[l] = dstPtrRow[l];
+ for(Rpp32u k = 0; k < length[2]; k++)
+ {
+ *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift;
+ paramIdx += paramStride[2];
+ }
+ paramIdx = (!paramStride[1]) ? 0 : paramIdx + paramStride[1];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRow[l] += dstGenericDescPtr->strides[3];
+ }
+ srcPtrTemp += srcGenericDescPtr->strides[1];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrTemp[l] += dstGenericDescPtr->strides[2];
+ }
+}
+
+// Computes normalize for 3D non toggle variants, optimized with AVX when axis mask set to 3 and 16 channel normalize
+void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length)
+{
+ Rpp32u vectorIncrement = 16;
+ Rpp32u alignedLength = (bufferLength / 16) * 16;
+ Rpp32u outerDim = length[0];
+
+ // set shift, mean and stddev
+ __m256 pShift = _mm256_set1_ps(shift);
+ __m256 pMean1 = _mm256_loadu_ps(meanPtr);
+ __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8);
+ __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr);
+ __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8);
+
+ for(Rpp32u i = 0; i < outerDim; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1];
+ Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1];
+
+ Rpp32u vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp);
+ __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8);
+ __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift);
+ __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift);
+ _mm256_storeu_ps(dstPtrTemp, pDst1);
+ _mm256_storeu_ps(dstPtrTemp + 8, pDst2);
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ }
+}
+
+// Computes normalization for ND non-toggle variants for the i8 datatype
+void normalize_ND_tensor_nontoggle(Rpp8s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr,
+ Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx)
+{
+ Rpp32u idx1 = 0;
+ if(tensorDim == 1)
+ {
+ for(Rpp32u k = 0; k < length[level]; k++)
+ {
+ *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift;
+ if(k < length[level] - 1)
+ idx += paramStride[level];
+ srcPtr++;
+ }
+ }
+ else
+ {
+ idx1 = idx;
+ for (Rpp32u i = 0; i < length[level]; i++)
+ {
+ normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx);
+ if(i < length[level] - 1)
+ idx = (!paramStride[level]) ? idx1 : idx + paramStride[level];
+ dstPtr += srcStride[level];
+ srcPtr += srcStride[level];
+ }
+ }
+}
+
+// Computes normalize for ND non toggle variants
+template <typename T1, typename T2>
+void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr,
+ Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx)
+{
+ Rpp32u idx1 = 0;
+ if(tensorDim == 1)
+ {
+ T1 *srcPtrTemp = srcPtr;
+ T2 *dstPtrTemp = dstPtr;
+
+ for(Rpp32u k = 0; k < length[level]; k++)
+ {
+ *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift;
+ if(k < length[level] - 1)
+ idx += paramStride[level];
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ }
+ else
+ {
+ idx1 = idx;
+ for (Rpp32u i = 0; i < length[level]; i++)
+ {
+ normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx);
+ if(i < length[level] - 1)
+ idx = (!paramStride[level]) ? idx1 : idx + paramStride[level];
+ dstPtr += srcStride[level];
+ srcPtr += srcStride[level];
+ }
+ }
+}
+
+// Computes normalize for 2D
+void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr,
+ Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride)
+{
+ if (paramStride[1]) // Optimized with AVX when axis mask set to 2
+ {
+ Rpp32u vectorIncrement = 8;
+ Rpp32u bufferLength = dims[1];
+ Rpp32u alignedLength = (bufferLength / 8) * 8;
+
+ __m256 pShift = _mm256_set1_ps(shift);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1];
+ Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1];
+
+ // set mean and stddev
+ Rpp32f mean = meanPtr[i];
+ Rpp32f invStdDev = invStdDevPtr[i];
+ __m256 pMean, pInvStdDev;
+ pMean = _mm256_set1_ps(mean);
+ pInvStdDev = _mm256_set1_ps(invStdDev);
+
+ Rpp32u vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc = _mm256_loadu_ps(srcPtrTemp);
+ __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift);
+ _mm256_storeu_ps(dstPtrTemp, pDst);
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for(; vectorLoopCount < dims[1] ; vectorLoopCount ++)
+ *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift;
+ }
+ }
+ else
+ {
+ Rpp32s paramIdx = 0;
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f *dstPtrTemp = dstPtr;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift;
+ paramIdx += paramStride[0];
+ }
+ paramIdx = (!paramStride[1]) ? 0 : paramIdx + paramStride[1];
+ srcPtr += srcDescPtr->strides[1];
+ dstPtr += dstDescPtr->strides[1];
+ }
+ }
+}
+
+// Performs the collapse-axis operation, wherein consecutive axes that require normalization are merged together
+void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis)
+{
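+    // e.g. for length = {2, 3, 4, 5} and axis = {0, 1, 1, 0}, the two consecutive normalized axes are merged: newDims = {2, 12, 5}, newAxis = {0, 1, 0}, lastNormAxis = 1 and tensorDim becomes 3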
+ int skipped = 0, prev = -2, k = 0;
+ for(Rpp32u i = 0; i < *tensorDim; i++)
+ {
+ if(axis[i])
+ {
+ int temp = i - skipped;
+ if(temp != prev + 1)
+ {
+ newAxis[k] = 1;
+ newDims[k] = length[i];
+ prev = i;
+ k++;
+ }
+ else if(prev >= 0)
+ {
+ newDims[prev] *= length[i];
+ skipped++;
+ }
+ }
+ else
+ {
+ newDims[k] = length[i];
+ k++;
+ }
+ }
+ *tensorDim -= skipped;
+ for(Rpp32u i = 0; i < *tensorDim; i++)
+ {
+ if(newAxis[i])
+ *lastNormAxis = i;
+ }
+}
+
+RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensorPtr,
+ Rpp32f *stdDevTensorPtr,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1;
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ Rpp32u maxSize = 1;
+    // Compute maxSize, since the length of the mean/stddev tensors differs based on axisMask and tensorDims
+ for(int batch = 0; batch < batchSize; batch++)
+ {
+ Rpp32u size = 1;
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i];
+ maxSize = std::max(maxSize, size);
+ }
+
+ if(!computeMeanStddev)
+ {
+ for(Rpp32u i = 0; i < maxSize; i++)
+ stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i];
+ maxSize = 0;
+ }
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+
+ Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ meanTensor = meanTensorPtr + batchCount * maxSize;
+ stdDevTensor = stdDevTensorPtr + batchCount * maxSize;
+
+ // Set all values in dst buffer to 0.0
+ for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++)
+ dstPtrTemp[cnt] = 0.0f;
+
+ Rpp32f *srcPtrChannel = srcPtrTemp;
+
+ if(tensorDims == 2) // Called for audio testcase and for any other 2D case
+ {
+ Rpp32u paramStride[2];
+ Rpp32u srcReductionDims[2], srcStride[2];
+ if (axisMask == 3)
+ {
+ srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2];
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[0] * length[1];
+ paramStride[0] = paramStride[1] = 0;
+ }
+ else if (axisMask == 1)
+ {
+ srcStride[0] = srcGenericDescPtr->strides[1];
+ srcStride[1] = srcGenericDescPtr->strides[2];
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[0];
+ paramStride[0] = 1;
+ paramStride[1] = 0;
+ }
+ else if (axisMask == 2)
+ {
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[1];
+ paramStride[0] = 0;
+ paramStride[1] = 1;
+ }
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride);
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale);
+
+ normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride);
+ }
+ else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel
+ {
+ Rpp32u paramStride[3];
+ Rpp32u srcReductionDims[3], srcStride[3];
+ Rpp32u reductionDims;
+ bool isConsecutive = true;
+ switch(axisMask)
+ {
+ case 1: // Normalize axes 0
+ {
+ reductionDims = length[1] * length[2];
+ paramStride[0] = 0;
+ paramStride[1] = paramStride[2] = 1;
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[0];
+ srcStride[0] = srcGenericDescPtr->strides[1];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[2];
+ break;
+ }
+ case 2: // Normalize axes 1
+ {
+ reductionDims = length[0] * length[2];
+ paramStride[1] = 0;
+ paramStride[0] = paramStride[2] = 1;
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[1];
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[1];
+ break;
+ }
+ case 3: // Normalize axes 0, 1
+ {
+ reductionDims = length[2];
+ paramStride[0] = paramStride[1] = 0;
+ paramStride[2] = 1;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[0] * length[1];
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ case 4: // Normalize across 2
+ {
+ reductionDims = length[0] * length[1];
+ paramStride[2] = 0;
+ paramStride[0] = paramStride[1] = 1;
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[1];
+ srcReductionDims[2] = length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[2];
+ srcStride[2] = srcGenericDescPtr->strides[1];
+ break;
+ }
+ case 5: // Normalize across 0, 2
+ {
+ reductionDims = length[1];
+ paramStride[0] = paramStride[2] = 0;
+ paramStride[1] = 1;
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[0];
+ srcReductionDims[2] = length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcStride[2] = srcGenericDescPtr->strides[2];
+ isConsecutive = false;
+ break;
+ }
+ case 6: // Normalize across 1, 2
+ {
+ reductionDims = length[0];
+ paramStride[1] = paramStride[2] = 0;
+ paramStride[0] = 1;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[0];
+ srcReductionDims[2] = length[1] * length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ case 7: // Normalize across 0, 1, 2
+ {
+ reductionDims = 1;
+ paramStride[0] = paramStride[1] = paramStride[2] = 0;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = 1;
+ srcReductionDims[2] = length[0] * length[1] * length[2];
+ srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ default:
+ {
+                    std::cout<<"Invalid Axis mask"<<std::endl;
+                    break;
+                }
+            }
+            for(Rpp32u i = 1; i <= tensorDims; i++)
+                srcPtrChannel += begin[i - 1] * srcGenericDescPtr->strides[i];
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive);
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive);
+
+ if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16))
+ normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length);
+ else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC))
+ normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length);
+ else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW))
+ normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length);
+ }
+        else // Handles any other ND tensor passed to the kernel
+ {
+ // Compute length of input tensors as they differ based on axisMask and tensorDims
+ int size = 1;
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i];
+
+ Rpp32u totalElements = 1;
+ Rpp32u lastNormAxis = 0;
+ Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims];
+ // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis
+ memset(newAxis, 0, tensorDims * sizeof(Rpp32u));
+ memset(newDims, 0, tensorDims * sizeof(Rpp32u));
+
+ for(Rpp32u i = 0; i < tensorDims; i++)
+ {
+ axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+ totalElements *= axis[i] ? length[i] : 1;
+ srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1];
+ }
+
+ Rpp32u paramStride[tensorDims], srcStride[tensorDims];
+ collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis);
+ compute_strides(srcStride, newDims, tensorDims);
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ {
+ compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = 1.0 / totalElements;
+ for(int i = 0; i < size; i++)
+ meanTensor[i] *= normFactor;
+ }
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ {
+ compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = (Rpp32f)(1.0 / totalElements);
+ rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale);
+ }
+
+ for(Rpp32u i = 0; i < tensorDims; i++)
+ paramStride[i] = !newAxis[i];
+
+ Rpp32u idx = 0;
+ normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+template <typename T1, typename T2>
+RppStatus normalize_generic_host_tensor(T1 *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T2 *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensorPtr,
+ Rpp32f *stdDevTensorPtr,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ Rpp32u maxSize = 1;
+ for(int batch = 0; batch < batchSize; batch++)
+ {
+ Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i];
+ maxSize = std::max(maxSize, size);
+ }
+ if(!computeMeanStddev)
+ {
+ for(Rpp32u i = 0; i < maxSize; i++)
+ stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i];
+ maxSize = 0;
+ }
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ int size = 1;
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i];
+
+ T1 *srcPtrTemp;
+ T2 *dstPtrTemp;
+ Rpp32f *meanTensor, *stdDevTensor;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ meanTensor = meanTensorPtr + batchCount * maxSize;
+ stdDevTensor = stdDevTensorPtr + batchCount * maxSize;
+
+ // Set all values in dst buffer to 0.0
+ for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++)
+ dstPtrTemp[cnt] = 0.0f;
+
+ T1 *srcPtrChannel = srcPtrTemp;
+
+ int totalElements = 1;
+ Rpp32u lastNormAxis = 0;
+ Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims];
+ // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis
+ memset(newAxis, 0, sizeof(newAxis));
+ memset(newDims, 0, sizeof(newDims));
+
+ for(int i = 0; i < tensorDims; i++)
+ {
+ axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+ totalElements *= axis[i] ? length[i] : 1;
+ srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1];
+ }
+
+ Rpp32u paramStride[tensorDims], srcStride[tensorDims];
+ collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis);
+ compute_strides(srcStride, newDims, tensorDims);
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ {
+ compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = 1.0 / totalElements;
+ for(int i = 0; i < size; i++)
+ meanTensor[i] *= normFactor;
+ }
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ {
+ compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = (Rpp32f)(1.0 / totalElements);
+ rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale);
+ }
+
+ for(int i = 0; i < tensorDims; i++)
+ paramStride[i] = !newAxis[i];
+
+ Rpp32u idx = 0;
+ normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/pre_emphasis_filter.hpp b/src/modules/cpu/kernel/pre_emphasis_filter.hpp
index 1d25921ad..889cd2dec 100644
--- a/src/modules/cpu/kernel/pre_emphasis_filter.hpp
+++ b/src/modules/cpu/kernel/pre_emphasis_filter.hpp
@@ -50,7 +50,7 @@ RppStatus pre_emphasis_filter_host_tensor(Rpp32f *srcPtr,
dstPtrTemp[0] = srcPtrTemp[0] - coeff * border;
Rpp32s vectorIncrement = 8;
- Rpp32s alignedLength = (bufferLength / 8) * 8;
+ Rpp32s alignedLength = (bufferLength / 8) * 8 - 8;
__m256 pCoeff = _mm256_set1_ps(coeff);
Rpp32s vectorLoopCount = 1;
diff --git a/src/modules/cpu/kernel/slice.hpp b/src/modules/cpu/kernel/slice.hpp
index c451b67b4..37c3097c9 100644
--- a/src/modules/cpu/kernel/slice.hpp
+++ b/src/modules/cpu/kernel/slice.hpp
@@ -26,184 +26,213 @@ SOFTWARE.
#include "rpp_cpu_simd.hpp"
#include "rpp_cpu_common.hpp"
-RppStatus slice_f32_f32_host_tensor(Rpp32f *srcPtr,
- RpptGenericDescPtr srcGenericDescPtr,
- Rpp32f *dstPtr,
- RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
- RppLayoutParams layoutParams,
- rpp::Handle& handle)
+template <typename T>
+RppStatus slice_host_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T* fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
{
- RpptROI3D roiDefault;
- if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
- else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
omp_set_dynamic(0);
#pragma omp parallel for num_threads(numThreads)
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
- RpptROI3D roi;
- RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
- compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+ T *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
- Rpp32f *srcPtrImage, *dstPtrImage;
- srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
- dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ T *srcPtrChannel, *dstPtrChannel;
+ dstPtrChannel = dstPtrTemp;
- Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
- Rpp32f *srcPtrChannel, *dstPtrChannel;
- dstPtrChannel = dstPtrImage;
+ // get the starting address of length values from roiTensor
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+        Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
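+        // anchor gives the per-dimension slice start and shape the requested extent; both are clamped against the actual length values from roiTensor before copying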
- // Slice without fused output-layout toggle (NCDHW -> NCDHW)
- if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ if (numDims == 4)
{
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
-
- Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f);
- for(int c = 0; c < layoutParams.channelParam; c++)
+ // order of dims
+ Rpp32s dimsOrder[3];
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ }
+ else
{
- Rpp32f *srcPtrDepth, *dstPtrDepth;
- srcPtrDepth = srcPtrChannel;
- dstPtrDepth = dstPtrChannel;
+ dimsOrder[0] = 0; // depth
+ dimsOrder[1] = 1; // height
+ dimsOrder[2] = 2; // width
+ }
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
+ Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier;
+ Rpp32u copyLengthInBytes = bufferLength * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) ||
+ ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]]));
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
+
+ // slice without fused output-layout toggle (NCDHW -> NCDHW)
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * srcGenericDescPtr->strides[3]) + (anchor[3] * layoutParams.bufferMultiplier);
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ T *srcPtrDepth, *dstPtrDepth;
+ srcPtrDepth = srcPtrChannel;
+ dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < maxDepth; i++)
+ {
+ T *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrDepth;
+ dstPtrRow = dstPtrDepth;
+ for(int j = 0; j < maxHeight; j++)
+ {
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[3];
+ dstPtrRow += dstGenericDescPtr->strides[3];
+ }
+ srcPtrDepth += srcGenericDescPtr->strides[2];
+ dstPtrDepth += dstGenericDescPtr->strides[2];
+ }
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
+ }
+ }
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // slice without fused output-layout toggle (NDHWC -> NDHWC)
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier);
+ T *srcPtrDepth = srcPtrChannel;
+ T *dstPtrDepth = dstPtrChannel;
+ for(int i = 0; i < maxDepth; i++)
{
- Rpp32f *srcPtrRow, *dstPtrRow;
+ T *srcPtrRow, *dstPtrRow;
srcPtrRow = srcPtrDepth;
dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ for(int j = 0; j < maxHeight; j++)
{
memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
-
- srcPtrRow += srcGenericDescPtr->strides[3];
- dstPtrRow += dstGenericDescPtr->strides[3];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
}
- srcPtrDepth += srcGenericDescPtr->strides[2];
- dstPtrDepth += dstGenericDescPtr->strides[2];
+ srcPtrDepth += srcGenericDescPtr->strides[1];
+ dstPtrDepth += dstGenericDescPtr->strides[1];
}
-
- srcPtrChannel += srcGenericDescPtr->strides[1];
- dstPtrChannel += srcGenericDescPtr->strides[1];
}
}
- // Slice without fused output-layout toggle (NDHWC -> NDHWC)
- else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ else if (numDims == 3)
{
- Rpp32u copyLengthInBytes = bufferLength * sizeof(Rpp32f);
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- Rpp32f *srcPtrDepth = srcPtrChannel;
- Rpp32f *dstPtrDepth = dstPtrChannel;
-
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // order of dims
+ Rpp32s dimsOrder[2];
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
{
- Rpp32f *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
- {
- memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
-
- srcPtrRow += srcGenericDescPtr->strides[2];
- dstPtrRow += dstGenericDescPtr->strides[2];
- }
- srcPtrDepth += srcGenericDescPtr->strides[1];
- dstPtrDepth += dstGenericDescPtr->strides[1];
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ }
+ else
+ {
+ dimsOrder[0] = 0; // height
+ dimsOrder[1] = 1; // width
}
- }
- }
-
- return RPP_SUCCESS;
-}
-
-RppStatus slice_u8_u8_host_tensor(Rpp8u *srcPtr,
- RpptGenericDescPtr srcGenericDescPtr,
- Rpp8u *dstPtr,
- RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
- RppLayoutParams layoutParams,
- rpp::Handle& handle)
-{
- RpptROI3D roiDefault;
- if(srcGenericDescPtr->layout==RpptLayout::NCDHW)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[4], (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2]};
- else if(srcGenericDescPtr->layout==RpptLayout::NDHWC)
- roiDefault = {0, 0, 0, (Rpp32s)srcGenericDescPtr->dims[3], (Rpp32s)srcGenericDescPtr->dims[2], (Rpp32s)srcGenericDescPtr->dims[1]};
- Rpp32u numThreads = handle.GetNumThreads();
-
- omp_set_dynamic(0);
-#pragma omp parallel for num_threads(numThreads)
- for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
- {
- RpptROI3D roi;
- RpptROI3DPtr roiPtrInput = &roiGenericPtrSrc[batchCount];
- compute_roi3D_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
- Rpp8u *srcPtrImage, *dstPtrImage;
- srcPtrImage = srcPtr + batchCount * srcGenericDescPtr->strides[0];
- dstPtrImage = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u bufferLength = maxWidth * layoutParams.bufferMultiplier;
+ Rpp32u copyLengthInBytes = bufferLength * sizeof(T);
- Rpp32u bufferLength = roi.xyzwhdROI.roiWidth * layoutParams.bufferMultiplier;
- Rpp8u *srcPtrChannel, *dstPtrChannel;
- dstPtrChannel = dstPtrImage;
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
- // Slice without fused output-layout toggle (NCDHW -> NCDHW)
- if((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
- {
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[3]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- for(int c = 0; c < layoutParams.channelParam; c++)
+ // slice without fused output-layout toggle (NCHW -> NCHW)
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
{
- Rpp8u *srcPtrDepth, *dstPtrDepth;
- srcPtrDepth = srcPtrChannel;
- dstPtrDepth = dstPtrChannel;
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ srcPtrChannel = srcPtrTemp + (anchor[1] * srcGenericDescPtr->strides[2]) + (anchor[2] * layoutParams.bufferMultiplier);
+ for(int c = 0; c < layoutParams.channelParam; c++)
{
- Rpp8u *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ T *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
{
- memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u));
- srcPtrRow += srcGenericDescPtr->strides[3];
- dstPtrRow += dstGenericDescPtr->strides[3];
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
}
- srcPtrDepth += srcGenericDescPtr->strides[2];
- dstPtrDepth += dstGenericDescPtr->strides[2];
+ srcPtrChannel += srcGenericDescPtr->strides[1];
+ dstPtrChannel += srcGenericDescPtr->strides[1];
}
- srcPtrChannel += srcGenericDescPtr->strides[1];
- dstPtrChannel += srcGenericDescPtr->strides[1];
}
- }
- // Slice without fused output-layout toggle (NDHWC -> NDHWC)
- else if((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
- {
- srcPtrChannel = srcPtrImage + (roi.xyzwhdROI.xyz.z * srcGenericDescPtr->strides[1]) + (roi.xyzwhdROI.xyz.y * srcGenericDescPtr->strides[2]) + (roi.xyzwhdROI.xyz.x * layoutParams.bufferMultiplier);
- Rpp8u *srcPtrDepth = srcPtrChannel;
- Rpp8u *dstPtrDepth = dstPtrChannel;
- for(int i = 0; i < roi.xyzwhdROI.roiDepth; i++)
+ // slice without fused output-layout toggle (NHWC -> NHWC)
+ else if (dstGenericDescPtr->layout == RpptLayout::NHWC)
{
- Rpp8u *srcPtrRow, *dstPtrRow;
- srcPtrRow = srcPtrDepth;
- dstPtrRow = dstPtrDepth;
-
- for(int j = 0; j < roi.xyzwhdROI.roiHeight; j++)
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + (anchor[1] * layoutParams.bufferMultiplier);
+ T *srcPtrRow = srcPtrChannel;
+ T *dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
{
- memcpy(dstPtrRow, srcPtrRow, bufferLength * sizeof(Rpp8u));
- srcPtrRow += srcGenericDescPtr->strides[2];
- dstPtrRow += dstGenericDescPtr->strides[2];
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[1];
+ dstPtrRow += dstGenericDescPtr->strides[1];
}
- srcPtrDepth += srcGenericDescPtr->strides[1];
- dstPtrDepth += dstGenericDescPtr->strides[1];
}
}
+ else if (numDims == 2)
+ {
+ srcPtrChannel = srcPtrTemp + (anchor[0] * srcGenericDescPtr->strides[1]) + anchor[1];
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+ Rpp32u copyLengthInBytes = maxWidth * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]) ||
+ ((anchor[1] + shape[1]) > length[1]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrChannel, dstPtrChannel + dstGenericDescPtr->strides[0] - 1, *fillValue);
+
+ T *srcPtrRow = srcPtrChannel;
+ T *dstPtrRow = dstPtrChannel;
+ for(int j = 0; j < maxHeight; j++)
+ {
+ memcpy(dstPtrRow, srcPtrRow, copyLengthInBytes);
+ srcPtrRow += srcGenericDescPtr->strides[1];
+ dstPtrRow += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if (numDims == 1)
+ {
+ srcPtrChannel = srcPtrTemp + anchor[0];
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u copyLengthInBytes = maxLength * sizeof(T);
+
+ // if padding is required, fill the buffer with fill value specified
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]);
+ if (needPadding && enablePadding)
+ std::fill(dstPtrTemp, dstPtrTemp + dstGenericDescPtr->strides[0] - 1, *fillValue);
+ memcpy(dstPtrChannel, srcPtrChannel, copyLengthInBytes);
+ }
}
return RPP_SUCCESS;
diff --git a/src/modules/cpu/kernel/spectrogram.hpp b/src/modules/cpu/kernel/spectrogram.hpp
new file mode 100644
index 000000000..2489d2180
--- /dev/null
+++ b/src/modules/cpu/kernel/spectrogram.hpp
@@ -0,0 +1,245 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "third_party/ffts/ffts.h"
+#include "third_party/ffts/ffts_attributes.h"
+#include <complex>
+
+inline bool is_pow2(Rpp64s n) { return (n & (n - 1)) == 0; }
+inline bool can_use_real_impl(Rpp64s n) { return is_pow2(n); }
+inline Rpp64s size_in_buf(Rpp64s n) { return can_use_real_impl(n) ? n : 2 * n; }
+inline Rpp64s size_out_buf(Rpp64s n) { return can_use_real_impl(n) ? n + 2 : 2 * n; }
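+// For power-of-two nfft a real-to-complex FFT is used: the input is nfft real samples and the output is (nfft / 2 + 1) complex bins packed as nfft + 2 floats; otherwise a full complex FFT with interleaved re/im pairs (2 * nfft floats each way) is used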
+
+// Compute hanning window
+inline void hann_window(Rpp32f *output, Rpp32s windowSize)
+{
+ Rpp64f a = (2.0 * M_PI) / windowSize;
+ for (Rpp32s t = 0; t < windowSize; t++)
+ {
+ Rpp64f phase = a * (t + 0.5);
+ output[t] = (0.5 * (1.0 - std::cos(phase)));
+ }
+}
+
+// Compute number of spectrogram windows
+inline Rpp32s get_num_windows(Rpp32s length, Rpp32s windowLength, Rpp32s windowStep, bool centerWindows)
+{
+ if (!centerWindows)
+ length -= windowLength;
+ return ((length / windowStep) + 1);
+}
+
+// Compute reflect start idx to pad
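+// e.g. with minLoc = 0 and maxLoc = 10, loc = -3 reflects to 3 and loc = 12 reflects to 6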
+inline Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s maxLoc)
+{
+ if (maxLoc - minLoc < 2)
+ return maxLoc - 1;
+ for (;;)
+ {
+ if (loc < minLoc)
+ loc = 2 * minLoc - loc;
+ else if (loc >= maxLoc)
+ loc = 2 * maxLoc - 2 - loc;
+ else
+ break;
+ }
+ return loc;
+}
+
+RppStatus spectrogram_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcLengthTensor,
+ bool centerWindows,
+ bool reflectPadding,
+ Rpp32f *windowFunction,
+ Rpp32s nfft,
+ Rpp32s power,
+ Rpp32s windowLength,
+ Rpp32s windowStep,
+ rpp::Handle& handle)
+{
+ Rpp32s windowCenterOffset = 0;
+ bool vertical = (dstDescPtr->layout == RpptLayout::NFT);
+ if (centerWindows) windowCenterOffset = windowLength / 2;
+ if (nfft == 0) nfft = windowLength;
+ const Rpp32s numBins = nfft / 2 + 1;
+ const Rpp32f mulFactor = (2.0 * M_PI) / nfft;
+ const Rpp32u hStride = dstDescPtr->strides.hStride;
+ const Rpp32s alignedNfftLength = nfft & ~7;
+ const Rpp32s alignedNbinsLength = numBins & ~7;
+ const Rpp32s alignedWindowLength = windowLength & ~7;
+ bool useRealImpl = can_use_real_impl(nfft);
+ const auto fftInSize = size_in_buf(nfft);
+ const auto fftOutSize = size_out_buf(nfft);
+
+    Rpp32f *windowFn = static_cast<Rpp32f *>(calloc(windowLength, sizeof(Rpp32f)));
+
+ // Generate hanning window
+ if (windowFunction == NULL)
+ hann_window(windowFn, windowLength);
+ else
+ memcpy(windowFn, windowFunction, windowLength * sizeof(Rpp32f));
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ // Get windows output
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (Rpp32s batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *dstPtrTemp = dstPtr + batchCount * dstDescPtr->strides.nStride;
+ Rpp32s bufferLength = srcLengthTensor[batchCount];
+ Rpp32s numWindows = get_num_windows(bufferLength, windowLength, windowStep, centerWindows);
+ Rpp32f windowOutput[numWindows * nfft];
+ std::fill_n(windowOutput, numWindows * nfft, 0);
+ for (Rpp32s w = 0; w < numWindows; w++)
+ {
+ Rpp32s windowStart = w * windowStep - windowCenterOffset;
+ Rpp32f *windowOutputTemp = windowOutput + (w * nfft);
+            // Pad when windowStart is negative or the window extends beyond the input length
+ if (windowStart < 0 || (windowStart + windowLength) > bufferLength)
+ {
+ for (Rpp32s t = 0; t < windowLength; t++)
+ {
+ Rpp32s inIdx = windowStart + t;
+ if (reflectPadding)
+ {
+ inIdx = get_idx_reflect(inIdx, 0, bufferLength);
+ *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx];
+ }
+ else
+ {
+ if (inIdx >= 0 && inIdx < bufferLength)
+ *windowOutputTemp++ = windowFn[t] * srcPtrTemp[inIdx];
+ else
+ *windowOutputTemp++ = 0;
+ }
+ }
+ }
+ else
+ {
+ Rpp32f *srcPtrWindowTemp = srcPtrTemp + windowStart;
+ Rpp32f *windowFnTemp = windowFn;
+ Rpp32s t = 0;
+ for (; t < alignedWindowLength; t += 8)
+ {
+ __m256 pSrc, pWindowFn;
+ pSrc = _mm256_loadu_ps(srcPtrWindowTemp);
+ pWindowFn = _mm256_loadu_ps(windowFnTemp);
+ pSrc = _mm256_mul_ps(pSrc, pWindowFn);
+ _mm256_storeu_ps(windowOutputTemp, pSrc);
+ srcPtrWindowTemp += 8;
+ windowFnTemp += 8;
+ windowOutputTemp += 8;
+ }
+ for (; t < windowLength; t++)
+ *windowOutputTemp++ = (*windowFnTemp++) * (*srcPtrWindowTemp++);
+ }
+ }
+
+ // Generate FFT output
+ ffts_plan_t *p;
+ if(useRealImpl)
+ p = ffts_init_1d_real(nfft, FFTS_FORWARD);
+ else
+ p = ffts_init_1d(nfft, FFTS_FORWARD);
+
+ if (!p)
+ {
+ printf("FFT Plan is unsupported. Exiting the code\n");
+ exit(0);
+ }
+
+        // Allocate temporary FFT input/output buffers (zeroed for each window below)
+        Rpp32f FFTS_ALIGN(32) *fftInBuf = static_cast<Rpp32f *>(_mm_malloc(fftInSize * sizeof(Rpp32f), 32));   // ffts requires 32-byte aligned memory
+        Rpp32f FFTS_ALIGN(32) *fftOutBuf = static_cast<Rpp32f *>(_mm_malloc(fftOutSize * sizeof(Rpp32f), 32)); // ffts requires 32-byte aligned memory
+
+ for (Rpp32s w = 0; w < numWindows; w++)
+ {
+ Rpp32f *dstPtrBinTemp = dstPtrTemp + (w * hStride);
+ Rpp32f *windowOutputTemp = windowOutput + (w * nfft);
+ for(int k = 0; k < fftInSize; k++)
+ fftInBuf[k] = 0.0f;
+
+ for(int k = 0; k < fftOutSize; k++)
+ fftOutBuf[k] = 0.0f;
+
+ Rpp32s inWindowStart = windowLength < nfft ? (nfft - windowLength) / 2 : 0;
+ // Copy the window input to fftInBuf
+ if (useRealImpl)
+ {
+ for (int i = 0; i < windowLength; i++)
+ fftInBuf[inWindowStart + i] = windowOutputTemp[i];
+ }
+ else
+ {
+ for (int i = 0; i < windowLength; i++)
+ {
+ Rpp32s off = 2 * (inWindowStart + i);
+ fftInBuf[off] = windowOutputTemp[i];
+ fftInBuf[off + 1] = 0.0f;
+ }
+ }
+
+ ffts_execute(p, fftInBuf, fftOutBuf);
+            auto *complexFft = reinterpret_cast<std::complex<Rpp32f> *>(fftOutBuf);
+ Rpp32s outIdx = w;
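+            // power == 1 writes the magnitude spectrum |X[k]| (std::abs); any other power writes the power spectrum |X[k]|^2 (std::norm)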
+ if (vertical)
+ {
+ if (power == 1)
+ {
+ for (int i = 0; i < numBins; i++, outIdx += hStride)
+ dstPtrTemp[outIdx] = std::abs(complexFft[i]);
+ }
+ else
+ {
+ for (int i = 0; i < numBins; i++, outIdx += hStride)
+ dstPtrTemp[outIdx] = std::norm(complexFft[i]);
+ }
+ }
+ else
+ {
+ if (power == 1)
+ {
+ for (int i = 0; i < numBins; i++)
+ *dstPtrBinTemp++ = std::abs(complexFft[i]);
+ }
+ else
+ {
+ for (int i = 0; i < numBins; i++)
+ *dstPtrBinTemp++ = std::norm(complexFft[i]);
+ }
+ }
+ }
+ ffts_free(p);
+ _mm_free(fftInBuf);
+ _mm_free(fftOutBuf);
+ }
+ if(windowFn)
+ free(windowFn);
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/tensor_stddev.hpp b/src/modules/cpu/kernel/tensor_stddev.hpp
index 3013d9e54..2f64e93ab 100644
--- a/src/modules/cpu/kernel/tensor_stddev.hpp
+++ b/src/modules/cpu/kernel/tensor_stddev.hpp
@@ -147,8 +147,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_u8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTempR += vectorIncrementPerChannel;
srcPtrTempG += vectorIncrementPerChannel;
srcPtrTempB += vectorIncrementPerChannel;
@@ -239,8 +239,8 @@ RppStatus tensor_stddev_u8_f32_host(Rpp8u *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_u8pkd3_to_f64pln3_avx, srcPtrTemp, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTemp += vectorIncrement;
}
#endif
@@ -408,8 +408,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTempR += vectorIncrementPerChannel;
srcPtrTempG += vectorIncrementPerChannel;
srcPtrTempB += vectorIncrementPerChannel;
@@ -500,8 +500,8 @@ RppStatus tensor_stddev_f32_f32_host(Rpp32f *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTemp += vectorIncrement;
}
#endif
@@ -682,8 +682,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr,
__m256d p[6];
rpp_simd_load(rpp_load24_f32pln3_to_f64pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTempR += vectorIncrementPerChannel;
srcPtrTempG += vectorIncrementPerChannel;
srcPtrTempB += vectorIncrementPerChannel;
@@ -778,8 +778,8 @@ RppStatus tensor_stddev_f16_f32_host(Rpp16f *srcPtr,
__m256d p[6];
rpp_simd_load(rpp_load24_f32pkd3_to_f64pln3_avx, srcPtrTemp_ps, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTemp += vectorIncrement;
}
@@ -949,8 +949,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_i8pln3_to_f64pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTempR += vectorIncrementPerChannel;
srcPtrTempG += vectorIncrementPerChannel;
srcPtrTempB += vectorIncrementPerChannel;
@@ -1041,8 +1041,8 @@ RppStatus tensor_stddev_i8_f32_host(Rpp8s *srcPtr,
{
__m256d p[6];
rpp_simd_load(rpp_load24_i8pkd3_to_f64pln3_avx, srcPtrTemp, p);
- compute_varianceChannel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
- compute_varianceImage_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
+ compute_variance_channel_pln3_24_host(p, &pMeanR, &pMeanG, &pMeanB, &pVarR, &pVarG, &pVarB);
+ compute_variance_image_pln3_24_host(p, &pMeanImage, &pVarImageR, &pVarImageG, &pVarImageB);
srcPtrTemp += vectorIncrement;
}
#endif
diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp
index c0a27806e..a0f50ee7e 100644
--- a/src/modules/hip/hip_tensor_statistical_operations.hpp
+++ b/src/modules/hip/hip_tensor_statistical_operations.hpp
@@ -29,5 +29,6 @@ SOFTWARE.
#include "kernel/tensor_max.hpp"
#include "kernel/tensor_mean.hpp"
#include "kernel/tensor_stddev.hpp"
+#include "kernel/normalize.hpp"
#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp
new file mode 100644
index 000000000..384454ca7
--- /dev/null
+++ b/src/modules/hip/kernel/normalize.hpp
@@ -0,0 +1,1921 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+#define MAX_SHARED_MEMORY_SIZE 1024
+
+// -------------------- Set 0 - normalization kernels device helpers --------------------
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst)
+{
+ outVal = fmax(fminf(outVal, 255), 0);
+ *dst = static_cast<uchar>(outVal);
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst)
+{
+ outVal = fmax(fminf(outVal, 127), -128);
+ *dst = static_cast<schar>(outVal);
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst)
+{
+ *dst = outVal;
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst)
+{
+ *dst = static_cast<half>(outVal);
+}
+
+// -------------------- Set 1 - normalization kernel host helpers --------------------
+
+// setup function needed for 2D/3D optimized kernel variants when mean and stddev need to be computed internally
+void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims,
+ Rpp32u axisMask, Rpp32u &maxParamVolume)
+{
+ maxParamVolume = 1;
+ uint axisSet[RPPT_MAX_DIMS];
+ for(int i = 0; i < tensorDims; i++)
+ axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+
+ for(uint i = 0; i < batchSize; i++)
+ {
+ // calculate the max param volume
+ Rpp32u paramVolume = 1;
+ Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims];
+ for(uint j = 0; j < tensorDims; j++)
+ paramVolume *= (axisSet[j]) ? 1 : roi[j];
+ maxParamVolume = std::max(maxParamVolume, paramVolume);
+ }
+}
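+
+// Illustration: for a 2D sample with ROI lengths {H, W}, axisMask = 1 (reduce along dim 0)
+// gives paramVolume = W (one mean/stddev per column), while axisMask = 3 (reduce along both
+// dims) gives paramVolume = 1 (a single mean/stddev for the whole sample).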
+
+// setup function needed for ND generic kernel variants
+void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask,
+ Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume)
+{
+ maxParamVolume = 1;
+ uint axisSet[RPPT_MAX_DIMS];
+ for(int i = 0; i < tensorDims; i++)
+ axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+
+ for(uint i = 0; i < batchSize; i++)
+ {
+ // calculate the param shape and param volume based on the axis mask
+ Rpp32u paramVolume = 1;
+ Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims];
+ Rpp32u *paramShape = ¶mShapeTensor[i * tensorDims];
+ for(uint j = 0; j < tensorDims; j++)
+ {
+ paramShape[j] = (axisSet[j]) ? 1 : roi[j];
+ paramVolume *= paramShape[j];
+ }
+ maxParamVolume = std::max(maxParamVolume, paramVolume);
+
+ // calculate the param strides from the param shape
+ Rpp32u *paramStrides = ¶mStridesTensor[i * tensorDims];
+ Rpp32u val = 1;
+ for(uint j = tensorDims - 1; j > 0; j--)
+ {
+ paramStrides[j] = val;
+ val *= paramShape[j];
+ }
+ paramStrides[0] = val;
+ }
+}
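+
+// Illustration: for a 3D ROI of lengths {D, H, W} with axisMask = 2 (reduce along dim 1),
+// paramShape = {D, 1, W}, paramVolume = D * W, and paramStrides = {W, W, 1} (row-major
+// strides computed over paramShape).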
+
+// -------------------- Set 2 - normalization kernels --------------------
+
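+// The normalize kernels below all compute out = (in - mean) * scale / stddev + shift.
+// When computeStdDev is set (stddev was computed internally), a zero stddev maps the output
+// to shift; otherwise a user-supplied stddev of zero falls back to a multiplier of 1.
+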
+template <typename T>
+__global__ void normalize_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint2 maxParamVolumeAndAxisMask,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ if (id_x >= width || id_y >= height)
+ return;
+
+ uint maxParamVolume = maxParamVolumeAndAxisMask.x;
+ uint axisMask = maxParamVolumeAndAxisMask.y;
+ uint paramIndex = id_z * maxParamVolume;
+ // update paramIndex based on axisMask value
+ if (axisMask == 1)
+ paramIndex += id_x;
+ else if (axisMask == 2)
+ paramIndex += id_y;
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+template <typename T>
+__global__ void normalize_3d_hip_tensor(T *srcPtr,
+ uint2 srcStridesDH,
+ T *dstPtr,
+ uint2 dstStridesDH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint axisMask,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ
+
+ uint *roi = roiTensor;
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ)
+ return;
+
+ uint paramIndex = 0;
+ // update paramIndex based on axisMask value
+ if (axisMask == 1)
+ paramIndex += id_y * lengthX + id_x;
+ else if (axisMask == 2)
+ paramIndex += id_z * lengthX + id_x;
+ else if (axisMask == 4)
+ paramIndex += id_z * lengthY + id_y;
+ else if (axisMask == 3)
+ paramIndex += id_x;
+ else if (axisMask == 5)
+ paramIndex += id_y;
+ else if (axisMask == 6)
+ paramIndex += id_z;
+
+ uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x;
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+template <typename T>
+__global__ void normalize_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ T *dstPtr,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint2 maxParamVolumeAndBufferLength,
+ uint tensorDims,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint maxBufferLength = maxParamVolumeAndBufferLength.y;
+
+ if (id_x >= maxBufferLength)
+ return;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = ¶mShapeTensor[id_z * tensorDims];
+ uint *paramStrides = ¶mStridesTensor[id_z * tensorDims];
+ uint maxParamVolume = maxParamVolumeAndBufferLength.x;
+ uint srcIdx = id_z * maxBufferLength;
+
+ uint paramIndex = id_z * maxParamVolume;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ uint dstIdx = id_z * maxBufferLength + id_x;
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+// -------------------- Set 3 - mean and stddev compute kernels device helpers --------------------
+
+__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem)
+{
+ for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+}
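+
+// Note: this tree reduction assumes hipBlockDim_x is a power of two (16, 32 or 256 in the
+// launch configurations below) and that partialSum_smem holds one float per thread.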
+
+// -------------------- Set 4 - mean compute kernels (reduction stage 1) --------------------
+
+template <typename T>
+__global__ void compute_mean_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ float *meanTensor,
+ float *partialSumTensor,
+ uint *roiTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ // compute column wise mean
+ if (axisMask == 1)
+ {
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_x;
+ if (id_x < width)
+ {
+ float accum = 0.0f;
+ for(int i = 0; i < height; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNH.y;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(height);
+ }
+ }
+ // compute partial sums needed for row wise mean
+ else if (axisMask == 2)
+ {
+ id_x *= 8;
+ __shared__ float partialRowSum_smem[256];
+ partialRowSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialRowSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x;
+ partialSumTensor[paramIndex] = partialRowSum_smem[0];
+ }
+ }
+ // compute partial sums needed for computing mean over entire rows and columns
+ else if (axisMask == 3)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0];
+ }
+ }
+}
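+
+// For axisMask 1 the kernel above writes the final column-wise means directly; for
+// axisMask 2 and 3 it only emits per-block partial sums into partialSumTensor, which
+// reduce_final_result_hip (Set 6) turns into means in a second pass.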
+
+template <typename T>
+__global__ void compute_mean_3d_hip_tensor(T *srcPtr,
+ uint3 srcStridesNZY,
+ float *meanTensor,
+ uint *roiTensor,
+ float *partialSumTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 6];
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ // compute mean along z direction
+ if (axisMask == 1)
+ {
+ if (id_x >= lengthX || id_y >= lengthY)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthZ; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNZY.y;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(lengthZ);
+ }
+ // compute mean along y direction
+ else if (axisMask == 2)
+ {
+ if (id_x >= lengthX || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthY; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNZY.z;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(lengthY);
+ }
+ // compute mean along x direction
+ else if (axisMask == 4)
+ {
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i; // number of valid elements in this partial load
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f; // zero the out-of-bounds lanes of the vectorized load
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x;
+ meanTensor[dstIdx] = accum_f8.f1[0] / static_cast<float>(lengthX);
+ }
+ // compute partial sums required for computing mean along z-y direction
+ else if (axisMask == 3)
+ {
+ for(uint xIndex = 0; xIndex < lengthX; xIndex++)
+ {
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_x >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex);
+ partialSumRowPtr_smem[hipThreadIdx_x] = static_cast<float>(srcPtr[srcIdx]);
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in z dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ __syncthreads();
+ }
+ }
+ // compute partial sums required for computing mean along y-x direction
+ else if (axisMask == 6)
+ {
+ __shared__ float partialSum_smem[256];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i;
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f;
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0];
+ __syncthreads();
+
+ // Block-level reduction of the 256 per-thread partial sums in x dimension
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
+ partialSumTensor[dstIdx] = partialSum_smem[0];
+ }
+ }
+ // compute mean along z-x direction
+ else if (axisMask == 5)
+ {
+ __shared__ float partialSum_smem[32];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if (hipBlockIdx_x >= lengthY)
+ return;
+
+ uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x;
+ float accum = 0.0f;
+ for (uint i = 0; i < lengthZ; i++)
+ {
+ uint tid_x = hipThreadIdx_x;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin;
+ while (tid_x < lengthX)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx + tid_x]);
+ tid_x += hipBlockDim_x;
+ }
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // perform reduction on shared memory sums
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ meanTensor[dstIdx] = partialSum_smem[0] / static_cast<float>(lengthX * lengthZ);
+ }
+ // compute partial sums required for computing mean along z-y-x direction
+ else if (axisMask == 7)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ uint xIndex = id_x % srcStridesNZY.z;
+ uint yIndex = id_x / srcStridesNZY.z;
+ if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads
+ int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (xIndex + 8 > lengthX)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1];
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ }
+}
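+
+// For axisMask 1, 2, 4 and 5 the kernel above stores final means directly; for axisMask
+// 3, 6 and 7 it emits per-block partial sums that reduce_final_result_hip finishes.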
+
+template <typename T>
+__global__ void compute_mean_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ float *meanTensor,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint maxParamVolume,
+ uint tensorDims,
+ uint maxBufferLength)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = ¶mShapeTensor[id_z * tensorDims];
+ uint *paramStrides = ¶mStridesTensor[id_z * tensorDims];
+ uint srcIdx = id_z * maxBufferLength;
+ uint paramBase = id_z * maxParamVolume;
+ uint paramIndex = 0;
+
+ if (maxParamVolume > MAX_SHARED_MEMORY_SIZE)
+ {
+ if (id_x >= maxBufferLength)
+ return;
+
+ // validate if id_x is within the roi of input and compute paramIndex if valid
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+ atomicAdd(&meanTensor[paramBase + paramIndex], static_cast<float>(srcPtr[srcIdx]));
+ }
+ else
+ {
+
+ if (id_x >= (hipBlockDim_x * hipGridDim_x))
+ return;
+
+ // if the number of means to compute fits within the max shared memory size,
+ // use shared memory for atomic addition to reduce global memory traffic
+ bool isValid = true;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ {
+ isValid = false;
+ break;
+ }
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ extern __shared__ float sh_mem[];
+ sh_mem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (isValid && id_x < maxBufferLength)
+ atomicAdd(&sh_mem[paramIndex], static_cast<float>(srcPtr[srcIdx]));
+ __syncthreads();
+
+ if (hipThreadIdx_x < maxParamVolume)
+ atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]);
+ }
+}
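+
+// The ND kernel accumulates raw sums with atomics: global atomics when maxParamVolume
+// exceeds MAX_SHARED_MEMORY_SIZE, otherwise block-local shared-memory atomics followed by
+// one global add per parameter; final_reduction_nd_hip_tensor later divides by the
+// reduced element count to produce the mean.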
+
+// -------------------- Set 5 - stddev compute kernels (reduction stage 1) --------------------
+
+template <typename T>
+__global__ void compute_stddev_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float *partialSumTensor,
+ uint *roiTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ // compute column wise stdDev
+ if (axisMask == 1)
+ {
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_x;
+ float mean = meanTensor[paramIndex];
+ if (id_x < width)
+ {
+ float accum = 0.0f;
+ for(int i = 0; i < height; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNH.y;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(height));
+ }
+ }
+ // compute partial mean subtracted squared sums needed for row wise stdDev
+ else if (axisMask == 2)
+ {
+ id_x *= 8;
+ __shared__ float partialRowSum_smem[256];
+ partialRowSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ uint paramIndex = id_z * maxParamVolume + id_y;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialRowSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x;
+ partialSumTensor[paramIndex] = partialRowSum_smem[0];
+ }
+ }
+ // compute partial mean subtracted squared sums needed for computing stdDev over entire rows and columns
+ else if (axisMask == 3)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ float mean = meanTensor[id_z];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_stddev_3d_hip_tensor(T *srcPtr,
+ uint3 srcStridesNZY,
+ float *meanTensor,
+ float *stdDevTensor,
+ uint *roiTensor,
+ float *partialSumTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 6];
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ // compute stddev along z direction
+ if (axisMask == 1)
+ {
+ if (id_x >= lengthX || id_y >= lengthY)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthZ; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNZY.y;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(lengthZ));
+ }
+ // compute stddev along y direction
+ else if (axisMask == 2)
+ {
+ if (id_x >= lengthX || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthY; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNZY.z;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(lengthY));
+ }
+ // compute stddev along x direction
+ else if (axisMask == 4)
+ {
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i; // number of valid elements in this partial load
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f; // zero the out-of-bounds lanes of the vectorized load
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+
+ stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast<float>(lengthX));
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along z-y direction
+ else if (axisMask == 3)
+ {
+ for(uint xIndex = 0; xIndex < lengthX; xIndex++)
+ {
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_x >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ uint paramIndex = id_z * maxParamVolume + xIndex;
+ float mean = meanTensor[paramIndex];
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex);
+ float val = static_cast<float>(srcPtr[srcIdx]) - mean;
+ partialSumRowPtr_smem[hipThreadIdx_x] = (val * val);
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in z dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ __syncthreads();
+ }
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along y-x direction
+ else if (axisMask == 6)
+ {
+ __shared__ float partialSum_smem[256];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+
+ uint paramIndex = id_z * maxParamVolume + id_y;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i;
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f;
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0];
+ __syncthreads();
+
+ // Block-level reduction of the 256 per-thread partial sums in x dimension
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
+ partialSumTensor[dstIdx] = partialSum_smem[0];
+ }
+ }
+ // compute stddev along z-x direction
+ else if (axisMask == 5)
+ {
+ __shared__ float partialSum_smem[32];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if (hipBlockIdx_x >= lengthY)
+ return;
+
+ uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for (uint i = 0; i < lengthZ; i++)
+ {
+ uint tid_x = hipThreadIdx_x;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin;
+ while (tid_x < lengthX)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx + tid_x]) - mean);
+ accum += (val * val);
+ tid_x += hipBlockDim_x;
+ }
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // perform reduction on shared memory sums
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast<float>(lengthX * lengthZ));
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction
+ else if (axisMask == 7)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ uint xIndex = id_x % srcStridesNZY.z;
+ uint yIndex = id_x / srcStridesNZY.z;
+ if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads
+ int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin);
+
+ uint paramIndex = id_z * maxParamVolume;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+
+ if (xIndex + 8 > lengthX)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1];
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_stddev_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ float *meanTensor,
+ float *stdDevTensor,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint maxParamVolume,
+ uint tensorDims,
+ uint maxBufferLength)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = ¶mShapeTensor[id_z * tensorDims];
+ uint *paramStrides = ¶mStridesTensor[id_z * tensorDims];
+ uint srcIdx = id_z * maxBufferLength;
+ uint paramBase = id_z * maxParamVolume;
+ uint paramIndex = 0;
+
+ if (maxParamVolume > MAX_SHARED_MEMORY_SIZE)
+ {
+ if (id_x >= maxBufferLength)
+ return;
+
+ // validate if id_x is within the roi of input and compute paramIndex if valid
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+ float val = static_cast<float>(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex];
+ atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val));
+ }
+ else
+ {
+
+ if (id_x >= (hipBlockDim_x * hipGridDim_x))
+ return;
+
+ // if the number of means to compute fits within the max shared memory size,
+ // use shared memory for atomic addition to reduce global memory traffic
+ bool isValid = true;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ {
+ isValid = false;
+ break;
+ }
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ extern __shared__ float sh_mem[];
+ sh_mem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (isValid && id_x < maxBufferLength)
+ {
+ float val = static_cast<float>(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex];
+ atomicAdd(&sh_mem[paramIndex], (val * val));
+ }
+ __syncthreads();
+
+ if (hipThreadIdx_x < maxParamVolume)
+ atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]);
+ }
+}
+
+// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) --------------------
+
+__global__ void reduce_final_result_hip(float *partialSumTensor,
+ uint numPartialSums,
+ float *meanTensor,
+ float *stdDevTensor,
+ bool isMean,
+ uint *roiTensor,
+ uint axisMask,
+ uint tensorDims)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+
+ uint meanFactor;
+ if (tensorDims == 3)
+ {
+ uint lengthZ = roi[0];
+ uint lengthY = roi[1];
+ uint lengthX = roi[2];
+
+ if (axisMask == 3)
+ meanFactor = lengthZ * lengthY;
+ else if (axisMask == 6)
+ meanFactor = lengthY * lengthX;
+ else if (axisMask == 7)
+ meanFactor = lengthZ * lengthY * lengthX;
+ }
+ else if (tensorDims == 2)
+ {
+ uint lengthY = roi[0];
+ uint lengthX = roi[1];
+
+ if (axisMask == 2)
+ meanFactor = lengthX;
+ else if (axisMask == 3)
+ meanFactor = lengthY * lengthX;
+ }
+
+ __shared__ float partialSum_smem[16];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ float accum = 0.0f;
+ while(id_x < numPartialSums)
+ {
+ uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x;
+ accum += partialSumTensor[srcIdx];
+ id_x += hipBlockDim_x;
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ if (isMean)
+ meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor;
+ else
+ stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor);
+ }
+}
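+
+// reduce_final_result_hip collapses the per-block partial sums produced by the 2D/3D
+// stage 1 kernels; meanFactor is the number of input elements reduced per output value,
+// so mean = sum / meanFactor and stddev = sqrt(sum_of_squared_diffs / meanFactor).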
+
+__global__ void final_reduction_nd_hip_tensor(float *meanTensor,
+ float *stdDevTensor,
+ uint *paramShapeTensor,
+ uint *roiTensor,
+ uint tensorDims,
+ uint maxParamVolume,
+ bool isMean)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *paramShape = ¶mShapeTensor[id_z * tensorDims];
+ uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+
+ uint divisionFactor = 1;
+ uint paramVolume = 1;
+ for(int i = 0; i < tensorDims; i++)
+ {
+ paramVolume *= paramShape[i];
+ if (paramShape[i] == 1)
+ divisionFactor *= roi[i];
+ }
+
+ if (id_x >= paramVolume)
+ return;
+
+ uint paramIndex = id_z * maxParamVolume + id_x;
+ if (isMean)
+ meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor;
+ else
+ stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor);
+}
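+
+// divisionFactor is the product of the ROI lengths along the reduced axes (paramShape == 1),
+// i.e. the number of elements accumulated into each mean/stddev slot by the ND stage 1 kernels.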
+
+// -------------------- Set 7 - mean and stddev compute kernels launch helpers --------------------
+
+void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr,
+ int &globalThreads_x,
+ int &globalThreads_y,
+ int &globalThreads_z,
+ int &localThreads_x,
+ int &localThreads_y,
+ int &localThreads_z,
+ Rpp32u axisMask,
+ Rpp32f *partialSumArr,
+ rpp::Handle& handle)
+{
+ switch (axisMask)
+ {
+ // compute along Y direction
+ case 1:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along X direction
+ case 2:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256));
+ globalThreads_y = srcGenericDescPtr->dims[1];
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along XY direction
+ case 3:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ }
+}
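+
+// Note: the axis masks that rely on a stage 2 reduction (2 and 3) also zero the scratch
+// partialSumArr here so the stage 1 kernel writes per-block partial sums into clean memory.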
+
+void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr,
+ int &globalThreads_x,
+ int &globalThreads_y,
+ int &globalThreads_z,
+ int &localThreads_x,
+ int &localThreads_y,
+ int &localThreads_z,
+ Rpp32u axisMask,
+ Rpp32f *partialSumArr,
+ rpp::Handle& handle)
+{
+ switch (axisMask)
+ {
+ // compute along Z direction
+ case 1:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[3] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along Y direction
+ case 2:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[3] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along YZ direction
+ case 3:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along X direction
+ case 4:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along XZ direction
+ case 5:
+ {
+ localThreads_x = 32;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = srcGenericDescPtr->dims[2];
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along XY direction
+ case 6:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = srcGenericDescPtr->dims[1];
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along XYZ direction
+ case 7:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3;
+ globalThreads_x = static_cast<int>(ceil((float)numValues / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ }
+}
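+
+// As in the 2D helper, the axis masks that need a stage 2 reduction (3, 6 and 7) zero the
+// scratch partialSumArr before the stage 1 kernel launch.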
+
+// -------------------- Set 8 - mean and stddev compute kernels executor --------------------
+
+template <typename T>
+RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ bool isMean,
+ Rpp32u *roiTensor,
+ Rpp32u axisMask,
+ Rpp32u tensorDims,
+ Rpp32u maxParamVolume,
+ Rpp32u *paramShape,
+ Rpp32u *paramStrides,
+ rpp::Handle& handle)
+{
+ Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ Rpp32u partialSumArrLength, partialSumBlocksPerSample;
+
+ int globalThreads_x, globalThreads_y, globalThreads_z;
+ int localThreads_x, localThreads_y, localThreads_z;
+ // based on number of dimensions call the corresponding kernel
+ if (tensorDims == 2)
+ {
+ // set the block and grid configuration based on axisMask
+ set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z,
+ localThreads_x, localThreads_y, localThreads_z, axisMask,
+ partialSumArr, handle);
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_2d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ meanTensor,
+ partialSumArr,
+ roiTensor,
+ maxParamVolume,
+ axisMask);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_2d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ meanTensor,
+ stdDevTensor,
+ partialSumArr,
+ roiTensor,
+ maxParamVolume,
+ axisMask);
+ }
+
+ if (axisMask == 2)
+ {
+ partialSumBlocksPerSample = globalThreads_x;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ else if (axisMask == 3)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ }
+ else if (tensorDims == 3)
+ {
+ // set the block and grid configuration based on axisMask
+ set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z,
+ localThreads_x, localThreads_y, localThreads_z, axisMask,
+ partialSumArr, handle);
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_3d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ meanTensor,
+ roiTensor,
+ partialSumArr,
+ maxParamVolume,
+ axisMask);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_3d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ meanTensor,
+ stdDevTensor,
+ roiTensor,
+ partialSumArr,
+ maxParamVolume,
+ axisMask);
+ }
+
+ // perform final reduction on block wise sums for below cases
+ // reduce on YZ partial sums
+ if (axisMask == 3)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ // reduce on XY partial sums
+ if (axisMask == 6)
+ {
+ partialSumBlocksPerSample = globalThreads_x;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ // reduce on XYZ block partial sums
+ else if (axisMask == 7)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ }
+ else
+ {
+ // interpret the input as 1D tensor
+ globalThreads_x = srcGenericDescPtr->strides[0];
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ Rpp32u batchSize = globalThreads_z;
+
+ // allocate tensor for src strides
+ Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1];
+ Rpp32u *srcStrides = &srcGenericDescPtr->strides[1];
+
+ Rpp32u shared_memory_size = 0;
+ Rpp32u block_size = 1024;
+ if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE)
+ {
+ if (maxParamVolume <= 32)
+ shared_memory_size = 32;
+ else if (maxParamVolume <= 64)
+ shared_memory_size = 64;
+ else if (maxParamVolume <= 128)
+ shared_memory_size = 128;
+ else if (maxParamVolume <= 256)
+ shared_memory_size = 256;
+ else if (maxParamVolume <= 512)
+ shared_memory_size = 512;
+ else
+ shared_memory_size = MAX_SHARED_MEMORY_SIZE;
+ block_size = shared_memory_size;
+ }
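+        // shared_memory_size (and block_size) is rounded up to the smallest power of two that fits maxParamVolume,
+        // capped at MAX_SHARED_MEMORY_SIZE; e.g. maxParamVolume = 300 selects 512 when the cap allows it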
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(block_size, 1, 1),
+ shared_memory_size,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ meanTensor,
+ roiTensor,
+ paramShape,
+ paramStrides,
+ maxParamVolume,
+ tensorDims,
+ srcGenericDescPtr->strides[0]);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(block_size, 1, 1),
+ shared_memory_size,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ meanTensor,
+ stdDevTensor,
+ roiTensor,
+ paramShape,
+ paramStrides,
+ maxParamVolume,
+ tensorDims,
+ srcGenericDescPtr->strides[0]);
+ }
+ hipLaunchKernelGGL(final_reduction_nd_hip_tensor,
+ dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ meanTensor,
+ stdDevTensor,
+ paramShape,
+ roiTensor,
+ tensorDims,
+ maxParamVolume,
+ isMean);
+ }
+ hipStreamSynchronize(handle.GetStream());
+ return RPP_SUCCESS;
+}
+
+// -------------------- Set 9 - normalization kernel executor --------------------
+
+template <typename T>
+RppStatus hip_exec_normalize_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u batchSize = srcGenericDescPtr->dims[0];
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+
+ // create buffer for paramShape and paramStride needed for generic kernel
+ Rpp32u *paramShape, *paramStrides;
+ paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem;
+ paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims);
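+    // scratchBuf layout: the first (batchSize * tensorDims) uints hold the per-sample param shapes, the next (batchSize * tensorDims) hold the matching strides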
+
+    // initial preprocessing: compute maxParamVolume and fill the values for paramShape and paramStrides
+ Rpp32u maxParamVolume;
+ if (tensorDims == 2 || tensorDims == 3)
+ normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims,
+ axisMask, maxParamVolume);
+ else
+ normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask,
+ paramShape, paramStrides, maxParamVolume);
+
+    bool computeMean = computeMeanStddev & 1;   // bit 0 of computeMeanStddev: compute the mean internally when set
+    bool computeStdDev = computeMeanStddev & 2; // bit 1 of computeMeanStddev: compute the stddev internally when set
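+    // e.g. computeMeanStddev = 3 computes both internally, 1 computes only the mean, 0 uses the user-supplied meanTensor/stdDevTensor as-is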
+ if ((!computeMean) && (!computeStdDev))
+ maxParamVolume = 0;
+
+ // if computeMean is set, compute mean values by processing over input based on axisMask values
+ if (computeMean)
+ hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true,
+ roiTensor, axisMask, tensorDims, maxParamVolume,
+ paramShape, paramStrides, handle);
+
+ // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values
+ if (computeStdDev)
+ hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false,
+ roiTensor, axisMask, tensorDims, maxParamVolume,
+ paramShape, paramStrides, handle);
+
+ // based on number of dimensions call the corresponding kernel
+ if (tensorDims == 2)
+ {
+ // NHW
+ int globalThreads_x = dstGenericDescPtr->dims[2];
+ int globalThreads_y = dstGenericDescPtr->dims[1];
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(normalize_2d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ dstPtr,
+ make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]),
+ meanTensor,
+ stdDevTensor,
+ make_float2(scale, shift),
+ roiTensor,
+ make_uint2(maxParamVolume, axisMask),
+ computeStdDev);
+ }
+ else if (tensorDims == 3)
+ {
+ // NDHW
+ int globalThreads_x = dstGenericDescPtr->dims[3];
+ int globalThreads_y = dstGenericDescPtr->dims[2];
+ int globalThreads_z = dstGenericDescPtr->dims[1];
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(normalize_3d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ &meanTensor[batchCount * maxParamVolume],
+ &stdDevTensor[batchCount * maxParamVolume],
+ make_float2(scale, shift),
+ &roiTensor[batchCount * 6],
+ axisMask,
+ computeStdDev);
+ }
+ }
+ else
+ {
+ // interpret the input as 1D tensor
+ int globalThreads_x = dstGenericDescPtr->strides[0];
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ // allocate tensor for src strides
+ Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1];
+ Rpp32u *srcStrides = &srcGenericDescPtr->strides[1];
+ hipLaunchKernelGGL(normalize_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ dstPtr,
+ meanTensor,
+ stdDevTensor,
+ make_float2(scale, shift),
+ roiTensor,
+ paramShape,
+ paramStrides,
+ make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]),
+ tensorDims,
+ computeStdDev);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/slice.hpp b/src/modules/hip/kernel/slice.hpp
index 8deb52bbb..c1b7a6c41 100644
--- a/src/modules/hip/kernel/slice.hpp
+++ b/src/modules/hip/kernel/slice.hpp
@@ -2,24 +2,79 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_common.hpp"
+template <typename T>
+__global__ void fill_value_ncdhw_hip_tensor(T *dstPtr,
+ uint3 dstStridesCDH,
+ int channels,
+ uint3 dstDimsDHW,
+ T *fillValue)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z))
+ {
+ return;
+ }
+
+ uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
+ d_float8 val_f8;
+ val_f8.f4[0] = (float4)(*fillValue);
+ val_f8.f4[1] = val_f8.f4[0];
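+    // val_f8 now holds the scalar fill value broadcast across all 8 lanes; the same vector is stored once per channel below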
+ for(int c = 0; c < channels; c++)
+ {
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &val_f8);
+ dstIdx += dstStridesCDH.x;
+ }
+}
+
+
+template <typename T>
+__global__ void fill_value_ndhwc_hip_tensor(T *dstPtr,
+ uint2 dstStridesDH,
+ uint3 dstDimsDHW,
+ T *fillValue)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
+
+ if ((id_z >= dstDimsDHW.x) || (id_y >= dstDimsDHW.y) || (id_x >= dstDimsDHW.z))
+ {
+ return;
+ }
+
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+ d_float24 val_f24;
+ val_f24.f4[0] = (float4)(*fillValue);
+ val_f24.f4[1] = val_f24.f4[0];
+ val_f24.f4[2] = val_f24.f4[0];
+ val_f24.f4[3] = val_f24.f4[0];
+ val_f24.f4[4] = val_f24.f4[0];
+ val_f24.f4[5] = val_f24.f4[0];
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
+}
+
+
template <typename T>
__global__ void slice_ncdhw_hip_tensor(T *srcPtr,
uint3 srcStridesCDH,
T *dstPtr,
uint3 dstStridesCDH,
int channels,
- RpptROI3DPtr roiGenericSrc)
+ uint3 validShapeDHW)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // W - inner most dim vectorized
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
- if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth))
+ if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z))
{
return;
}
- uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesCDH.y) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesCDH.z) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x);
+ uint srcIdx = (id_z * srcStridesCDH.y) + (id_y * srcStridesCDH.z) + id_x;
uint dstIdx = (id_z * dstStridesCDH.y) + (id_y * dstStridesCDH.z) + id_x;
d_float8 val_f8;
@@ -32,77 +87,439 @@ __global__ void slice_ncdhw_hip_tensor(T *srcPtr,
}
}
+
template <typename T>
__global__ void slice_ndhwc_hip_tensor(T *srcPtr,
uint2 srcStridesDH,
T *dstPtr,
uint2 dstStridesDH,
- RpptROI3DPtr roiGenericSrc)
+ uint3 validShapeDHW)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // WC - inner most dim vectorized
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // H - second to inner
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // D - outer most dim
- if ((id_z >= roiGenericSrc->xyzwhdROI.roiDepth) || (id_y >= roiGenericSrc->xyzwhdROI.roiHeight) || (id_x >= roiGenericSrc->xyzwhdROI.roiWidth))
+ if ((id_z >= validShapeDHW.x) || (id_y >= validShapeDHW.y) || (id_x >= validShapeDHW.z))
{
return;
}
- uint srcIdx = ((id_z + roiGenericSrc->xyzwhdROI.xyz.z) * srcStridesDH.x) + ((id_y + roiGenericSrc->xyzwhdROI.xyz.y) * srcStridesDH.y) + (id_x + roiGenericSrc->xyzwhdROI.xyz.x) * 3;
- uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x * 3;
+ uint srcIdx = (id_z * srcStridesDH.x) + (id_y * srcStridesDH.y) + (id_x * 3);
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + (id_x * 3);
d_float24 val_f24;
rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &val_f24);
rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &val_f24);
}
+template <typename T>
+RppStatus hip_exec_fill_value_tensor(T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T *fillValue,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle,
+ Rpp32u numDims)
+{
+ if (numDims == 4)
+ {
+ // set the dimsOrder and globalthreads values required for NDHWC layout
+ Rpp32s dimsOrder[3] = {0, 1, 2};
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ // change the dimsOrder and globalthreads values if layout is NCDHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ }
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
+
+ // checking if padding is required
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]) ||
+ ((anchor[dimsOrder[2]] + shape[dimsOrder[2]]) > length[dimsOrder[2]]));
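+            // illustrative example for one axis: anchor = 10, shape = 20, length = 25 gives 15 valid elements,
+            // and since 10 + 20 > 25 the remaining 5 requested outputs along that axis get the fill value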
+
+ // if needPadding is set, launch kernel for filling the padded region with fill value specified
+ if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(maxDepth, maxHeight, maxWidth),
+ fillValue);
+ }
+ else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ make_uint3(maxDepth, maxHeight, maxWidth),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 3)
+ {
+ // set the dimsOrder and globalthreads values required for NHWC layout
+ Rpp32s dimsOrder[2] = {0, 1};
+ int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ // change the dimsOrder and globalthreads values if layout is NCHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ globalThreads_z = 1;
+ }
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+
+ // check if padding is needed
+ bool needPadding = (((anchor[dimsOrder[0]] + shape[dimsOrder[0]]) > length[dimsOrder[0]]) ||
+ ((anchor[dimsOrder[1]] + shape[dimsOrder[1]]) > length[dimsOrder[1]]));
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding && dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(1, shape[1], shape[2]),
+ fillValue);
+ }
+ else if (needPadding && dstGenericDescPtr->layout == RpptLayout::NHWC)
+ {
+ hipLaunchKernelGGL(fill_value_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(1, dstGenericDescPtr->strides[1]),
+ make_uint3(1, maxHeight, maxWidth),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+
+ // check if padding is needed
+ bool needPadding = (((anchor[0] + shape[0]) > length[0]) ||
+ ((anchor[1] + shape[1]) > length[1]));
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(0, 0, dstGenericDescPtr->strides[1]),
+ 1,
+ make_uint3(1, shape[0], shape[1]),
+ fillValue);
+ }
+ }
+ }
+ else if (numDims == 1)
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = 1;
+ int globalThreads_z = 1;
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+
+ // check if padding is needed
+ bool needPadding = ((anchor[0] + shape[0]) > length[0]);
+
+ // launch kernel for filling the padded region with fill value specified
+ if (needPadding)
+ {
+ hipLaunchKernelGGL(fill_value_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, 1, 1),
+ 0,
+ handle.GetStream(),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint3(0, 0, 1),
+ 1,
+ make_uint3(1, 1, shape[0]),
+ fillValue);
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
template <typename T>
RppStatus hip_exec_slice_tensor(T *srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
T *dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ T *fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rpp::Handle& handle)
{
- if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+
+    /* if enablePadding is set to true, launch a kernel to fill the output buffers with the specified fill value.
+       This is only done if shapeTensor[d] > roiTensor[d] for a dimension d */
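+    // e.g. a row of length 100 sliced with anchor 80 and shape 40 copies elements 80..99 into the first 20 outputs,
+    // and (when enablePadding is set) fills the remaining 20 of the 40 requested outputs with *fillValue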
+ if (enablePadding)
+ {
+ hip_exec_fill_value_tensor(dstPtr,
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+ fillValue,
+ roiTensor,
+ handle,
+ numDims);
+ hipStreamSynchronize(handle.GetStream());
+ }
+
+ if(numDims == 4)
{
- int globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
- int globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
- int globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ // set the dimsOrder and globalthreads values required for NDHWC layout
+ Rpp32s dimsOrder[3] = {0, 1, 2};
+ int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
+
+ // change the dimsOrder and globalthreads values if layout is NCDHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ dimsOrder[0] = 1; // depth
+ dimsOrder[1] = 2; // height
+ dimsOrder[2] = 3; // width
+ globalThreads_x = (dstGenericDescPtr->strides[3] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[3]; // H - height (y direction)
+ globalThreads_z = dstGenericDescPtr->dims[2]; // D - depth (z direction)
+ }
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxDepth = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[2]], length[dimsOrder[2]] - anchor[dimsOrder[2]]);
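+            // copy only the region that actually exists in the source past the anchor; any padded remainder
+            // was already filled by hip_exec_fill_value_tensor above when enablePadding is set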
+ if (dstGenericDescPtr->layout == RpptLayout::NCDHW)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2] * srcGenericDescPtr->strides[3] + anchor[3];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
+ dstPtrTemp,
+ make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(maxDepth, maxHeight, maxWidth));
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtrTemp,
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ make_uint3(maxDepth, maxHeight, maxWidth));
+ }
+ }
+ }
+ else if (numDims == 3)
+ {
+ // set the dimsOrder and globalthreads values required for NHWC layout
+ Rpp32s dimsOrder[2] = {0, 1};
+ int globalThreads_x = (dstGenericDescPtr->strides[1] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+
+ // change the dimsOrder and globalthreads values if layout is NCHW
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ dimsOrder[0] = 1; // height
+ dimsOrder[1] = 2; // width
+ globalThreads_x = (dstGenericDescPtr->strides[2] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
+ globalThreads_z = 1;
+ }
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[dimsOrder[0]], length[dimsOrder[0]] - anchor[dimsOrder[0]]);
+ Rpp32u maxWidth = std::min(shape[dimsOrder[1]], length[dimsOrder[1]] - anchor[dimsOrder[1]]);
+ if (dstGenericDescPtr->layout == RpptLayout::NCHW)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[1] * srcGenericDescPtr->strides[2] + anchor[2];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint3(srcGenericDescPtr->strides[1], 0, srcGenericDescPtr->strides[2]),
+ dstPtrTemp,
+ make_uint3(dstGenericDescPtr->strides[1], 0, dstGenericDescPtr->strides[2]),
+ dstGenericDescPtr->dims[1],
+ make_uint3(1, maxHeight, maxWidth));
+ }
+ else if (dstGenericDescPtr->layout == RpptLayout::NHWC)
+ {
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[1] + anchor[1];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+ hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), globalThreads_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
+ 0,
+ handle.GetStream(),
+ srcPtrTemp,
+ make_uint2(1, srcGenericDescPtr->strides[1]),
+ dstPtrTemp,
+ make_uint2(1, dstGenericDescPtr->strides[1]),
+ make_uint3(1, maxHeight, maxWidth));
+ }
+ }
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = (dstGenericDescPtr->strides[1] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = dstGenericDescPtr->dims[1]; // H - height (y direction)
+ int globalThreads_z = 1;
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxHeight = std::min(shape[0], length[0] - anchor[0]);
+ Rpp32u maxWidth = std::min(shape[1], length[1] - anchor[1]);
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0] * srcGenericDescPtr->strides[2] + anchor[1];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+
hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, 1),
0,
handle.GetStream(),
- srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
- make_uint3(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2], srcGenericDescPtr->strides[3]),
- dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
- make_uint3(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2], dstGenericDescPtr->strides[3]),
- dstGenericDescPtr->dims[1],
- &roiGenericPtrSrc[batchCount]);
+ srcPtrTemp,
+ make_uint3(0, 0, srcGenericDescPtr->strides[1]),
+ dstPtrTemp,
+ make_uint3(0, 0, dstGenericDescPtr->strides[1]),
+ 1,
+ make_uint3(1, maxHeight, maxWidth));
}
}
- else if (dstGenericDescPtr->layout == RpptLayout::NDHWC)
+ else if (numDims == 1)
{
- int globalThreads_x = (dstGenericDescPtr->strides[2] / 3 + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
- int globalThreads_y = dstGenericDescPtr->dims[2]; // H - height (y direction)
- int globalThreads_z = dstGenericDescPtr->dims[1]; // D - depth (z direction)
-
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; // W - width (x direction) - vectorized for 8 element loads/stores per channel
+ int globalThreads_y = 1;
+ int globalThreads_z = 1;
for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
{
- hipLaunchKernelGGL(slice_ndhwc_hip_tensor,
+ Rpp32s *anchor = &anchorTensor[batchCount * numDims];
+ Rpp32s *shape = &shapeTensor[batchCount * numDims];
+ Rpp32u *roi = roiTensor + batchCount * numDims * 2;
+            Rpp32s *length = reinterpret_cast<Rpp32s *>(&roi[numDims]);
+ Rpp32u maxLength = std::min(shape[0], length[0] - anchor[0]);
+ T *srcPtrTemp = srcPtr + (batchCount * srcGenericDescPtr->strides[0]) + anchor[0];
+ T *dstPtrTemp = dstPtr + (batchCount * dstGenericDescPtr->strides[0]);
+
+ hipLaunchKernelGGL(slice_ncdhw_hip_tensor,
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ dim3(LOCAL_THREADS_X, 1, 1),
0,
handle.GetStream(),
- srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
- make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
- dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
- make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
- &roiGenericPtrSrc[batchCount]);
+ srcPtrTemp,
+ make_uint3(0, 0, 1),
+ dstPtrTemp,
+ make_uint3(0, 0, 1),
+ 1,
+ make_uint3(1, 1, maxLength));
}
}
diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp
index e35e5c514..3285ee756 100644
--- a/src/modules/rppi_validate.hpp
+++ b/src/modules/rppi_validate.hpp
@@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels)
}
else if(layout == RpptLayout::NHWC || layout == RpptLayout::NDHWC)
{
- if (channels == 3) // PKD3
- {
- layoutParams.channelParam = 1;
- layoutParams.bufferMultiplier = 3;
- }
+ //PKD
+ layoutParams.channelParam = 1;
+ layoutParams.bufferMultiplier = channels;
}
return layoutParams;
}
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index e2c9b18f7..0267985e5 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -32,8 +32,8 @@ SOFTWARE.
RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
Rpp32s *srcLengthTensor,
- Rpp32f *detectedIndexTensor,
- Rpp32f *detectionLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
Rpp32f cutOffDB,
Rpp32s windowLength,
Rpp32f referencePower,
@@ -155,6 +155,88 @@ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr,
}
}
+/******************** spectrogram ********************/
+
+RppStatus rppt_spectrogram_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcLengthTensor,
+ bool centerWindows,
+ bool reflectPadding,
+ Rpp32f *windowFunction,
+ Rpp32s nfft,
+ Rpp32s power,
+ Rpp32s windowLength,
+ Rpp32s windowStep,
+ rppHandle_t rppHandle)
+{
+ if ((dstDescPtr->layout != RpptLayout::NFT) && (dstDescPtr->layout != RpptLayout::NTF)) return RPP_ERROR_INVALID_DST_LAYOUT;
+
+ if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        spectrogram_host_tensor(static_cast<Rpp32f*>(srcPtr),
+                                srcDescPtr,
+                                static_cast<Rpp32f*>(dstPtr),
+ dstDescPtr,
+ srcLengthTensor,
+ centerWindows,
+ reflectPadding,
+ windowFunction,
+ nfft,
+ power,
+ windowLength,
+ windowStep,
+ rpp::deref(rppHandle));
+
+ return RPP_SUCCESS;
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+}
+
+/******************** mel_filter_bank ********************/
+
+RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s* srcDimsTensor,
+ Rpp32f maxFreq,
+ Rpp32f minFreq,
+ RpptMelScaleFormula melFormula,
+ Rpp32s numFilter,
+ Rpp32f sampleRate,
+ bool normalize,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_SRC_LAYOUT;
+ if (dstDescPtr->layout != RpptLayout::NFT) return RPP_ERROR_INVALID_DST_LAYOUT;
+
+ if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        mel_filter_bank_host_tensor(static_cast<Rpp32f*>(srcPtr),
+                                    srcDescPtr,
+                                    static_cast<Rpp32f*>(dstPtr),
+ dstDescPtr,
+ srcDimsTensor,
+ maxFreq,
+ minFreq,
+ melFormula,
+ numFilter,
+ sampleRate,
+ normalize,
+ rpp::deref(rppHandle));
+ return RPP_SUCCESS;
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+}
+
/******************** resample ********************/
RppStatus rppt_resample_host(RppPtr_t srcPtr,
diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp
index 45a0d5221..d758aa676 100644
--- a/src/modules/rppt_tensor_geometric_augmentations.cpp
+++ b/src/modules/rppt_tensor_geometric_augmentations.cpp
@@ -1050,43 +1050,54 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
RppPtr_t dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ RppPtr_t fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rppHandle_t rppHandle)
{
+ if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
+ if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
+
RppLayoutParams layoutParams;
if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
-
- if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
- if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
+ else if ((srcGenericDescPtr->layout == RpptLayout::NCHW) && (dstGenericDescPtr->layout == RpptLayout::NCHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]);
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
-        slice_f32_f32_host_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
-                                  srcGenericDescPtr,
-                                  (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
- dstGenericDescPtr,
- roiGenericPtrSrc,
- roiType,
- layoutParams,
- rpp::deref(rppHandle));
+        slice_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                          srcGenericDescPtr,
+                          reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+                          static_cast<Rpp32f*>(fillValue),
+ enablePadding,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
}
else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
{
-        slice_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
-                                srcGenericDescPtr,
-                                static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
- dstGenericDescPtr,
- roiGenericPtrSrc,
- roiType,
- layoutParams,
- rpp::deref(rppHandle));
+        slice_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                          srcGenericDescPtr,
+                          static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ anchorTensor,
+ shapeTensor,
+                          static_cast<Rpp8u*>(fillValue),
+ enablePadding,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
}
return RPP_SUCCESS;
@@ -1848,24 +1859,29 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
RppPtr_t dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
- RpptROI3DPtr roiGenericPtrSrc,
- RpptRoi3DType roiType,
+ Rpp32s *anchorTensor,
+ Rpp32s *shapeTensor,
+ RppPtr_t fillValue,
+ bool enablePadding,
+ Rpp32u *roiTensor,
rppHandle_t rppHandle)
{
#ifdef HIP_COMPILE
- if ((srcGenericDescPtr->layout != RpptLayout::NCDHW) && (srcGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_SRC_LAYOUT;
- if ((dstGenericDescPtr->layout != RpptLayout::NCDHW) && (dstGenericDescPtr->layout != RpptLayout::NDHWC)) return RPP_ERROR_INVALID_DST_LAYOUT;
- if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_INVALID_ARGUMENTS;
if ((srcGenericDescPtr->dataType != RpptDataType::F32) && (srcGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_SRC_DATATYPE;
if ((dstGenericDescPtr->dataType != RpptDataType::F32) && (dstGenericDescPtr->dataType != RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ if (srcGenericDescPtr->layout != dstGenericDescPtr->layout) return RPP_ERROR_LAYOUT_MISMATCH;
if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
{
-        hip_exec_slice_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+        hip_exec_slice_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
                              srcGenericDescPtr,
-                              (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+                              reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
dstGenericDescPtr,
- roiGenericPtrSrc,
+ anchorTensor,
+ shapeTensor,
+                              static_cast<Rpp32f*>(fillValue),
+ enablePadding,
+ roiTensor,
rpp::deref(rppHandle));
}
else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
@@ -1874,7 +1890,11 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr,
srcGenericDescPtr,
                              static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
dstGenericDescPtr,
- roiGenericPtrSrc,
+ anchorTensor,
+ shapeTensor,
+                              static_cast<Rpp8u*>(fillValue),
+ enablePadding,
+ roiTensor,
rpp::deref(rppHandle));
}
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp
index f14e73a1a..dd151ec7d 100644
--- a/src/modules/rppt_tensor_statistical_operations.cpp
+++ b/src/modules/rppt_tensor_statistical_operations.cpp
@@ -241,6 +241,100 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** normalize_ND ********************/
+
+RppStatus rppt_normalize_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ Rpp32u tensorDim = srcGenericDescPtr->numDims - 1;
+ if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+ else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]);
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
+ {
+        normalize_generic_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                                      srcGenericDescPtr,
+                                      static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+        normalize_generic_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                      srcGenericDescPtr,
+                                      reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        normalize_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                      srcGenericDescPtr,
+                                      reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8))
+ {
+        normalize_generic_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                                      srcGenericDescPtr,
+                                      static_cast<Rpp8s*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
/******************** tensor_mean ********************/
@@ -402,6 +496,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr,
/********************************************************************************************************************/
#ifdef GPU_SUPPORT
+
/******************** tensor_sum ********************/
RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
@@ -480,7 +575,6 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
}
/******************** tensor_min ********************/
-
RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t imageMinArr,
@@ -609,6 +703,87 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr,
#endif // backend
}
+RppStatus rppt_normalize_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
+ {
+        hip_exec_normalize_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                                  srcGenericDescPtr,
+                                  static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+        hip_exec_normalize_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                  srcGenericDescPtr,
+                                  reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_normalize_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                  srcGenericDescPtr,
+                                  reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8))
+ {
+        hip_exec_normalize_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                                  srcGenericDescPtr,
+                                  static_cast<Rpp8s*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
/******************** tensor_mean ********************/
RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr,
@@ -761,5 +936,5 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr,
return RPP_ERROR_NOT_IMPLEMENTED;
#endif // backend
}
-#endif // GPU_SUPPORT
+#endif // GPU_SUPPORT
\ No newline at end of file
diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt
index 82ed65309..77052cabe 100644
--- a/utilities/test_suite/CMakeLists.txt
+++ b/utilities/test_suite/CMakeLists.txt
@@ -83,7 +83,7 @@ if(Python3_FOUND)
if(NIFTI_FOUND)
add_test(
NAME rpp_qa_tests_tensor_voxel_host_all
- COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3
+ COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endif(NIFTI_FOUND)
@@ -105,7 +105,7 @@ if(Python3_FOUND)
if(NIFTI_FOUND)
add_test(
NAME rpp_qa_tests_tensor_voxel_hip_all
- COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3
+ COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endif(NIFTI_FOUND)
diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt
index 26017065e..a0bd42fa0 100644
--- a/utilities/test_suite/HIP/CMakeLists.txt
+++ b/utilities/test_suite/HIP/CMakeLists.txt
@@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND)
link_directories(${ROCM_PATH}/lib /usr/local/lib)
add_executable(Tensor_hip Tensor_hip.cpp)
+ add_executable(Tensor_misc_hip Tensor_misc_hip.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17")
target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device)
+ target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device)
else()
message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}")
endif()
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 6008f33d3..fca310947 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -334,11 +334,20 @@ int main(int argc, char **argv)
bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f, for testCase 90, 91
else if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
+
CHECK_RETURN_STATUS(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
if(testCase == 91)
CHECK_RETURN_STATUS(hipHostMalloc(&mean, reductionFuncResultArrLength * bitDepthByteSize));
}
+ // create generic descriptor and params in case of slice
+ RpptGenericDesc descriptor3D;
+ RpptGenericDescPtr descriptorPtr3D = &descriptor3D;
+ Rpp32s *anchorTensor = NULL, *shapeTensor = NULL;
+ Rpp32u *roiTensor = NULL;
+ if(testCase == 92)
+ set_generic_descriptor_slice(srcDescPtr, descriptorPtr3D, batchSize);
+
// Allocate hip memory for src/dst
CHECK_RETURN_STATUS(hipMalloc(&d_input, inputBufferSize));
CHECK_RETURN_STATUS(hipMalloc(&d_output, outputBufferSize));
@@ -349,6 +358,7 @@ int main(int argc, char **argv)
if(testCase == 82)
CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 4 * sizeof(RpptROI)));
+ // create cropRoi and patchRoi in case of crop_and_patch
RpptROI *cropRoi, *patchRoi;
if(testCase == 33)
{
@@ -1168,6 +1178,28 @@ int main(int argc, char **argv)
break;
}
+ case 92:
+ {
+ testCaseName = "slice";
+ Rpp32u numDims = descriptorPtr3D->numDims - 1; // exclude batchSize from input dims
+ if(anchorTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&anchorTensor, batchSize * numDims * sizeof(Rpp32s)));
+ if(shapeTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&shapeTensor, batchSize * numDims * sizeof(Rpp32s)));
+ if(roiTensor == NULL)
+ CHECK_RETURN_STATUS(hipHostMalloc(&roiTensor, batchSize * numDims * 2 * sizeof(Rpp32u)));
+ bool enablePadding = false;
+ auto fillValue = 0;
+ init_slice(descriptorPtr3D, roiTensorPtrSrc, roiTensor, anchorTensor, shapeTensor);
+
+ startWallTime = omp_get_wtime();
+ if((inputBitDepth == 0 || inputBitDepth == 2) && srcDescPtr->layout == dstDescPtr->layout)
+ rppt_slice_gpu(d_input, descriptorPtr3D, d_output, descriptorPtr3D, anchorTensor, shapeTensor, &fillValue, enablePadding, roiTensor, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
{
missingFuncFlag = 1;
@@ -1262,6 +1294,42 @@ int main(int argc, char **argv)
refFile.close();
}
+ // if test case is slice and qaFlag is set, update the dstImgSizes with shapeTensor values
+    // for output display and comparison purposes
+ if (testCase == 92)
+ {
+ if (dstDescPtr->layout == RpptLayout::NCHW)
+ {
+ if (dstDescPtr->c == 3)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1 + 1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 2];
+ }
+ }
+ else
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 2;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+ else if (dstDescPtr->layout == RpptLayout::NHWC)
+ {
+ for(int i = 0; i < batchSize; i++)
+ {
+ int idx1 = i * 3;
+ dstImgSizes[i].height = shapeTensor[idx1];
+ dstImgSizes[i].width = shapeTensor[idx1 + 1];
+ }
+ }
+ }
+
/*Compare the output of the function with golden outputs only if
1.QA Flag is set
2.input bit depth 0 (Input U8 && Output U8)
@@ -1328,6 +1396,12 @@ int main(int argc, char **argv)
if(testCase == 91)
CHECK_RETURN_STATUS(hipHostFree(mean));
}
+ if(anchorTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(anchorTensor));
+ if(shapeTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
+ if(roiTensor != NULL)
+ CHECK_RETURN_STATUS(hipHostFree(roiTensor));
free(input);
free(input_second);
free(output);
diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
new file mode 100644
index 000000000..96197f432
--- /dev/null
+++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
@@ -0,0 +1,231 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "../rpp_test_suite_misc.h"
+
+int main(int argc, char **argv)
+{
+ // Handle inputs
+ const int MIN_ARG_COUNT = 9;
+ if (argc < MIN_ARG_COUNT)
+ {
+ printf("\nImproper Usage! Needs all arguments!\n");
+ printf("\nUsage: ./Tensor_misc_hip