diff --git a/include/rppdefs.h b/include/rppdefs.h index f7cd7a711..7d49c457d 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -1045,6 +1045,7 @@ typedef struct Rpp64u* dstBatchIndex; Rpp32u* inc; Rpp32u* dstInc; + hipMemRpp32u scratchBuf; } memGPU; /*! \brief RPP HIP-HOST memory management diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 3cb49a82b..7c2d3318b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HOST backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also has toggle variant(NHWC->NCHW) support for 3D. + * \param [in] srcPtr source tensor memory in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Normalize Generic augmentation on HIP backend + * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor. 
+ * \param [in] srcPtr source tensor memory in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor memory in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] axisMask axis along which normalization needs to be done + * \param [in] meanTensor values to be subtracted from input + * \param [in] stdDevTensor standard deviation values to scale the input + * \param [in] computeMeanStddev flag to represent internal computation of mean, stddev (Wherein 0th bit used to represent computeMean and 1st bit for computeStddev, 0- Externally provided) + * \param [in] scale value to be multiplied with data after subtracting from mean + * \param [in] shift value to be added finally + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 19121957b..6c327fdd6 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -2537,6 +2537,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x) return _mm_cvtss_f32(sum); } +/* Computes inverse square root */ +inline Rpp32f rpp_rsqrt_ps(Rpp32f x) +{ + __m128 X = _mm_set_ss(x); + __m128 tmp = _mm_rsqrt_ss(X); + Rpp32f y = _mm_cvtss_f32(tmp); + return y * (1.5f - x * 0.5f * y * y); +} + +/* Compute inverse square root */ +/* SSE matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul) +{ + Rpp64s i = 0; + __m128 rdivx4 = _mm_set1_ps(rdiv); + __m128 mulx4 = _mm_set1_ps(mul * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m128 epsx4 = _mm_set1_ps(eps); + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + x = _mm_add_ps(x, epsx4); + __m128 y = _mm_rsqrt_ps(x); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 4 <= numElements; i += 4) + { + __m128 x = _mm_loadu_ps(&input[i]); + x = _mm_mul_ps(x, rdivx4); + __m128 mask = _mm_cmpneq_ps(x, xmm_p0); + __m128 y = _mm_rsqrt_ps(x); + y = _mm_and_ps(y, mask); + __m128 y2 = _mm_mul_ps(y, y); + __m128 xy2 = _mm_mul_ps(x, y2); + __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2); + y = _mm_mul_ps(y, three_minus_xy2); + y = _mm_mul_ps(y, mulx4); + _mm_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? 
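+ // scalar tail: same single Newton-Raphson refinement as the vectorized body above,
+ // y = y * (1.5f - 0.5f * x * y * y), performed inside rpp_rsqrt_ps; zero inputs map to 0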
rpp_rsqrt_ps(x) * mul : 0; + } + } +} + +/* Compute inverse square root */ +/* AVX2 matches to 6 decimal places with raw C version due to newton rhapson approximation*/ +inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale) +{ + Rpp32s i = 0; + __m256 rdivx8 = _mm256_set1_ps(rdiv); + __m256 mulx8 = _mm256_set1_ps(scale * 0.5f); + if (eps) // epsilon is present - no need for masking, but we need to add it + { + __m256 epsx8 = _mm256_set1_ps(eps); + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + x = _mm256_add_ps(x, epsx8); + __m256 y = _mm256_rsqrt_ps(x); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + else + { + for (; i + 8 <= numElements; i += 8) + { + __m256 x = _mm256_loadu_ps(&input[i]); + x = _mm256_mul_ps(x, rdivx8); + __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ); + __m256 y = _mm256_rsqrt_ps(x); + y = _mm256_and_ps(y, mask); + __m256 y2 = _mm256_mul_ps(y, y); + __m256 xy2 = _mm256_mul_ps(x, y2); + __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2); + y = _mm256_mul_ps(y, three_minus_xy2); + y = _mm256_mul_ps(y, mulx8); + _mm256_storeu_ps(&input[i], y); + } + } + if (eps) + { + for (; i < numElements; i++) + input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale; + } + else + { + for (; i < numElements; i++) + { + Rpp32f x = input[i] * rdiv; + input[i] = x ? rpp_rsqrt_ps(x) * scale : 0; + } + } +} + static inline void fast_matmul4x4_sse(float *A, float *B, float *C) { __m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt index 1f47e8ee7..4338483a9 100644 --- a/src/modules/CMakeLists.txt +++ b/src/modules/CMakeLists.txt @@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP") set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add HIP specific preprocessor flags add_definitions(-DHIP_COMPILE) @@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL") list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP}) message("-- ${Green}OpenCL kernels added!${ColourReset}") set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs # Add OpenCL specific preprocessor flags add_definitions(-DOCL_COMPILE) @@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU") # Add CPU specific includes set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/) set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) + set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs endif() message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}") add_compile_options("-Wno-unused-result") diff --git 
a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp index 32b8b62b5..7ddc238fa 100644 --- a/src/modules/cpu/host_tensor_statistical_operations.hpp +++ b/src/modules/cpu/host_tensor_statistical_operations.hpp @@ -28,5 +28,6 @@ SOFTWARE. #include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/normalize.hpp" -#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP \ No newline at end of file +#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp new file mode 100644 index 000000000..dbe746d1a --- /dev/null +++ b/src/modules/cpu/kernel/normalize.hpp @@ -0,0 +1,882 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +// Computes strides +void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + +// Recursive reduction helper function to compute difference of input with mean and squares them up +template +void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_diff_square_sum(tmp1, input, inputStride, currElements, mean); + + // reduce second half and accumulate + compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + { + Rpp32f curr = (input[i * inputStride] - mean); + auto curSq = curr * curr; + tmp += curSq; + } + + // accumulate in target value + output += tmp; + } +} + +// Recursive reduction helper function to sum up input values +template +void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements) +{ + if (numElements > 32) + { + Rpp32s currElements = numElements >> 1; + Rpp32f tmp1 = 0, tmp2 = 0; + + // reduce first half and accumulate + compute_sum(tmp1, input, inputStride, currElements); + + // reduce second half and accumulate + compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements); + + tmp1 += tmp2; + output += tmp1; + } + else + { + // reduce to a temporary + Rpp32f tmp = 0; + for (Rpp32s i = 0; i < numElements; i++) + tmp += input[i * inputStride]; + + // accumulate in target value + output += tmp; + } +} + +// Computes mean for 2D inputs +void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = 1.0 / dims[1]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]); + srcPtrTemp += stride[1]; + meanPtr[i] *= normFactor; + } +} + +// Computes inverse stddev for 2D inputs +void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale) +{ + + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]); + srcPtrTemp += stride[1]; + } + rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale); +} + +// Computes mean for 3D inputs +void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = 1.0 / dims[2]; + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + meanPtr[index] = 0; + compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + meanPtr[index] *= normFactor; + } + srcPtrTemp += stride[2]; + } + } + else + { + Rpp32f normFactor = 1.0 / (dims[1] * dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + meanPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; 
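+ // non-consecutive reduction axes (isConsecutive == false, i.e. the 3D axisMask 5 case):
+ // accumulate dims[1] strided partial sums of dims[2] elements into a single mean per outer index i,
+ // then apply normFactor = 1 / (dims[1] * dims[2]) once the inner loop finishes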
+ for(Rpp32u j = 0; j < dims[1]; j++) + { + compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]); + srcPtrRow += stride[1]; + } + meanPtr[i] *= normFactor; + srcPtrTemp += stride[2]; + } + } +} + +// Computes inverse stddev for 3D inputs +void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true) +{ + Rpp32f *srcPtrTemp = srcPtr; + if(isConsecutive) + { + Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]); + for(Rpp32u i = 0; i < dims[0]; i++) + { + float *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + Rpp32u index = i * dims[1] + j; + stdDevPtr[index] = 0; + compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]); + srcPtrRow += stride[1]; + } + srcPtrTemp += stride[2]; + } + rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale); + } + else + { + Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2])); + for(Rpp32u i = 0; i < dims[0]; i++) + { + stdDevPtr[i] = 0; + Rpp32f *srcPtrRow = srcPtrTemp; + for(Rpp32u j = 0; j < dims[1]; j++) + { + compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]); + srcPtrRow += stride[1]; + } + srcPtrTemp += stride[2]; + } + rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale); + } +} + +// Computes mean for ND inputs +template +void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis) +{ + if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeSum when last dimension is to be normalized + compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]); + else if(level == tensorDim) // Calls computeSum when only 1st axis need to be normalized + compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]); + else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes inverse stddev for ND inputs +template +void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis) +{ + if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeDiffSumSquare when last dimension is to be normalized + compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]); + else if(level == tensorDim) // Calls computeDiffSumSquare when only 1st axis need to be normalized + compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]); + else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store 
stddev + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis); + } + else if(axis[level] && (level == lastNormAxis)) // Increment level alone if its last axis to be normalized + compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + else if(axis[level]) // Called when axis at present level needs to be normalized + { + for(Rpp32u i = 0; i < dims[level]; i++) + compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis); + } +} + +// Computes normalize for 3D non toggle variants +void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32s paramIdx = 0; + Rpp32s idx1 = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr; + Rpp32f *dstPtrRow = dstPtr; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + idx1 = paramIdx; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + if(k < length[2] - 1) + paramIdx += paramStride[2]; + srcPtrRowTemp++; + dstPtrRowTemp++; + } + if(j < length[1] - 1) + paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + if(i < length[0] - 1) + paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0]; + srcPtr += srcGenericDescPtr->strides[1]; + dstPtr += dstGenericDescPtr->strides[1]; + } +} + +// Computes normalize for 3D toggle variants when axis mask is set to 3 +void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length) +{ + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp[length[2]]; + dstPtrTemp[0] = dstPtr; + for(Rpp32u i = 1; i < length[2]; i++) + dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1]; + Rpp32s paramIdx = 0; + + for(Rpp32u i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] = dstPtrTemp[l]; + for(Rpp32u j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp[length[2]]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRowTemp[l] = dstPtrRow[l]; + for(Rpp32u k = 0; k < length[2]; k++) + { + *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift; + paramIdx += paramStride[2]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtrRow += srcGenericDescPtr->strides[2]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrRow[l] += dstGenericDescPtr->strides[3]; + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + for(Rpp32u l = 0; l < length[2]; l++) + dstPtrTemp[l] += dstGenericDescPtr->strides[2]; + } +} + +// Computes normalize for 3D non toggle variants, optimized with AVX when axis mask set to 3 and 16 channel normalize +void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr, + Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length) +{ + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (bufferLength / 16) * 16; + Rpp32u outerDim = length[0]; + + // set shift, mean and stddev + __m256 pShift = _mm256_set1_ps(shift); + __m256 pMean1 = _mm256_loadu_ps(meanPtr); + __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8); + __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr); + __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8); + + for(Rpp32u i = 0; i < outerDim; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1]; + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp); + __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8); + __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift); + __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst1); + _mm256_storeu_ps(dstPtrTemp + 8, pDst2); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + } +} + +// Computes normalize for ND non toggle variants for i8 dataype +void normalize_ND_tensor_nontoggle(Rpp32s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtr++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? 
idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for ND non toggle variants +template +void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr, + Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx) +{ + Rpp32u idx1 = 0; + if(tensorDim == 1) + { + T1 *srcPtrTemp = srcPtr; + T2 *dstPtrTemp = dstPtr; + + for(Rpp32u k = 0; k < length[level]; k++) + { + *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift; + if(k < length[level] - 1) + idx += paramStride[level]; + srcPtrTemp++; + dstPtrTemp++; + } + } + else + { + idx1 = idx; + for (Rpp32u i = 0; i < length[level]; i++) + { + normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx); + if(i < length[level] - 1) + idx = (!paramStride[level]) ? idx1 : idx + paramStride[level]; + dstPtr += srcStride[level]; + srcPtr += srcStride[level]; + } + } +} + +// Computes normalize for 2D +void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr, + Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride) +{ + if (paramStride[1]) // Optimized with AVX when axis mask set to 2 + { + Rpp32u vectorIncrement = 8; + Rpp32u bufferLength = dims[1]; + Rpp32u alignedLength = (bufferLength / 8) * 8; + + __m256 pShift = _mm256_set1_ps(shift); + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1]; + Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1]; + + // set mean and stddev + Rpp32f mean = meanPtr[i]; + Rpp32f invStdDev = invStdDevPtr[i]; + __m256 pMean, pInvStdDev; + pMean = _mm256_set1_ps(mean); + pInvStdDev = _mm256_set1_ps(invStdDev); + + Rpp32u vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_loadu_ps(srcPtrTemp); + __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift); + _mm256_storeu_ps(dstPtrTemp, pDst); + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } + for(; vectorLoopCount < dims[1] ; vectorLoopCount ++) + *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift; + } + } + else + { + Rpp32s paramIdx = 0; + for(Rpp32u i = 0; i < dims[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr; + Rpp32f *dstPtrTemp = dstPtr; + for(Rpp32u j = 0; j < dims[1]; j++) + { + *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift; + paramIdx += paramStride[0]; + } + paramIdx = (!paramStride[1]) ? 
0 : paramIdx + paramStride[1]; + srcPtr += srcDescPtr->strides[1]; + dstPtr += dstDescPtr->strides[1]; + } + } +} + +// Performs collapse axis operation wherein continuous axis that require normalization are combined together +void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis) +{ + int skipped = 0, prev = -2, k = 0; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(axis[i]) + { + int temp = i - skipped; + if(temp != prev + 1) + { + newAxis[k] = 1; + newDims[k] = length[i]; + prev = i; + k++; + } + else if(prev >= 0) + { + newDims[prev] *= length[i]; + skipped++; + } + } + else + { + newDims[k] = length[i]; + k++; + } + } + *tensorDim -= skipped; + for(Rpp32u i = 0; i < *tensorDim; i++) + { + if(newAxis[i]) + *lastNormAxis = i; + } +} + +RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + // Compute maxSize as length of input tensors differ based on axisMask and tensorDims + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 
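+ // externally supplied stddev values are pre-converted to multipliers (scale / stddev),
+ // with a zero stddev mapped to a multiplier of 1.0f to avoid dividing by zero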
1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + Rpp32f *srcPtrChannel = srcPtrTemp; + + if(tensorDims == 2) // Called for audio testcase and for any other 2D case + { + Rpp32u paramStride[2]; + Rpp32u srcReductionDims[2], srcStride[2]; + if (axisMask == 3) + { + srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0] * length[1]; + paramStride[0] = paramStride[1] = 0; + } + else if (axisMask == 1) + { + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + paramStride[0] = 1; + paramStride[1] = 0; + } + else if (axisMask == 2) + { + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + paramStride[0] = 0; + paramStride[1] = 1; + } + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale); + + normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride); + } + else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel + { + Rpp32u paramStride[3]; + Rpp32u srcReductionDims[3], srcStride[3]; + Rpp32u reductionDims; + bool isConsecutive = true; + switch(axisMask) + { + case 1: // Normalize axes 0 + { + reductionDims = length[1] * length[2]; + paramStride[0] = 0; + paramStride[1] = paramStride[2] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0]; + srcStride[0] = srcGenericDescPtr->strides[1]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[2]; + break; + } + case 2: // Normalize axes 1 + { + reductionDims = length[0] * length[2]; + paramStride[1] = 0; + paramStride[0] = paramStride[2] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 3: // Normalize axes 0, 1 + { + reductionDims = length[2]; + paramStride[0] = paramStride[1] = 0; + paramStride[2] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[2]; + srcReductionDims[2] = length[0] * length[1]; + srcStride[0] = srcGenericDescPtr->strides[2]; + srcStride[1] = srcGenericDescPtr->strides[3]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 4: // Normalize across 2 
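+ // axisMask is a bitmask over the tensor dimensions: bit i set means dimension i is reduced
+ // when computing mean/stddev. Mask 4 (binary 100) reduces only the innermost of the three
+ // dims, leaving one mean/stddev value per (dim0, dim1) pair.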
+ { + reductionDims = length[0] * length[1]; + paramStride[2] = 0; + paramStride[0] = paramStride[1] = 1; + srcReductionDims[0] = length[0]; + srcReductionDims[1] = length[1]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[2]; + srcStride[2] = srcGenericDescPtr->strides[1]; + break; + } + case 5: // Normalize across 0, 2 + { + reductionDims = length[1]; + paramStride[0] = paramStride[2] = 0; + paramStride[1] = 1; + srcReductionDims[0] = length[1]; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[2]; + isConsecutive = false; + break; + } + case 6: // Normalize across 1, 2 + { + reductionDims = length[0]; + paramStride[1] = paramStride[2] = 0; + paramStride[0] = 1; + srcReductionDims[0] = 1; + srcReductionDims[1] = length[0]; + srcReductionDims[2] = length[1] * length[2]; + srcStride[0] = srcGenericDescPtr->strides[3]; + srcStride[1] = srcGenericDescPtr->strides[1]; + srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + case 7: // Normalize across 0, 1, 2 + { + reductionDims = 1; + paramStride[0] = paramStride[1] = paramStride[2] = 0; + srcReductionDims[0] = 1; + srcReductionDims[1] = 1; + srcReductionDims[2] = length[0] * length[1] * length[2]; + srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3]; + break; + } + default: + { + std::cout<<"Invalid Axis mask"<strides[i]; + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive); + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive); + + if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16)) + normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length); + else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC)) + normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW)) + normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length); + } + else // Handle any other ND tensor is passed to kernel + { + // Compute length of input tensors as they differ based on axisMask and tensorDims + int size = 1; + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i]; + + Rpp32u totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, tensorDims * sizeof(Rpp32u)); + memset(newDims, 0, tensorDims * sizeof(Rpp32u)); + + for(Rpp32u i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(Rpp32u i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + } + + return RPP_SUCCESS; +} +template +RppStatus normalize_generic_host_tensor(T1 *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T2 *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensorPtr, + Rpp32f *stdDevTensorPtr, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + Rpp32u maxSize = 1; + for(int batch = 0; batch < batchSize; batch++) + { + Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i]; + maxSize = std::max(maxSize, size); + } + if(!computeMeanStddev) + { + for(Rpp32u i = 0; i < maxSize; i++) + stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i]; + maxSize = 0; + } + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + int size = 1; + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + + for(int i = 0; i < tensorDims; i++) + size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 
1 : length[i]; + + T1 *srcPtrTemp; + T2 *dstPtrTemp; + Rpp32f *meanTensor, *stdDevTensor; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + meanTensor = meanTensorPtr + batchCount * maxSize; + stdDevTensor = stdDevTensorPtr + batchCount * maxSize; + + // Set all values in dst buffer to 0.0 + for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++) + dstPtrTemp[cnt] = 0.0f; + + T1 *srcPtrChannel = srcPtrTemp; + + int totalElements = 1; + Rpp32u lastNormAxis = 0; + Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims]; + // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis + memset(newAxis, 0, sizeof(newAxis)); + memset(newDims, 0, sizeof(newDims)); + + for(int i = 0; i < tensorDims; i++) + { + axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + totalElements *= axis[i] ? length[i] : 1; + srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1]; + } + + Rpp32u paramStride[tensorDims], srcStride[tensorDims]; + collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis); + compute_strides(srcStride, newDims, tensorDims); + + if(computeMeanStddev & 1) // Check if mean is to be computed internally + { + compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = 1.0 / totalElements; + for(int i = 0; i < size; i++) + meanTensor[i] *= normFactor; + } + if(computeMeanStddev & 2) // Check if stddev is to be computed internally + { + compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis); + Rpp32f normFactor = (Rpp32f)(1.0 / totalElements); + rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale); + } + + for(int i = 0; i < tensorDims; i++) + paramStride[i] = !newAxis[i]; + + Rpp32u idx = 0; + normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp index c79e0a951..6923b9a3f 100644 --- a/src/modules/hip/hip_tensor_statistical_operations.hpp +++ b/src/modules/hip/hip_tensor_statistical_operations.hpp @@ -27,5 +27,6 @@ SOFTWARE. 
#include "kernel/tensor_sum.hpp" #include "kernel/tensor_min.hpp" #include "kernel/tensor_max.hpp" +#include "kernel/normalize.hpp" #endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp new file mode 100644 index 000000000..384454ca7 --- /dev/null +++ b/src/modules/hip/kernel/normalize.hpp @@ -0,0 +1,1921 @@ +#include +#include "rpp_hip_common.hpp" + +#define MAX_SHARED_MEMORY_SIZE 1024 + +// -------------------- Set 0 - normalization kernels device helpers -------------------- + +__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst) +{ + outVal = fmax(fminf(outVal, 255), 0); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst) +{ + outVal = fmax(fminf(outVal, 127), -128); + *dst = static_cast(outVal); +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst) +{ + *dst = outVal; +} + +__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst) +{ + *dst = static_cast(outVal); +} + +// -------------------- Set 1 - normalization kernel host helpers -------------------- + +// setup function needed for 2D/3D optimized kernel variants when mean and stddev needs to be internally computed +void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, + Rpp32u axisMask, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the max param volume + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + for(uint j = 0; j < tensorDims; j++) + paramVolume *= (axisSet[j]) ? 1 : roi[j]; + maxParamVolume = std::max(maxParamVolume, paramVolume); + } +} + +// setup function needed for ND generic kernel variants +void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask, + Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume) +{ + maxParamVolume = 1; + uint axisSet[RPPT_MAX_DIMS]; + for(int i = 0; i < tensorDims; i++) + axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0; + + for(uint i = 0; i < batchSize; i++) + { + // calculate the param shape and param volume based on the axis mask + Rpp32u paramVolume = 1; + Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims]; + Rpp32u *paramShape = ¶mShapeTensor[i * tensorDims]; + for(uint j = 0; j < tensorDims; j++) + { + paramShape[j] = (axisSet[j]) ? 
1 : roi[j]; + paramVolume *= paramShape[j]; + } + maxParamVolume = std::max(maxParamVolume, paramVolume); + + // calculate the param strides from the param shape + Rpp32u *paramStrides = ¶mStridesTensor[i * tensorDims]; + Rpp32u val = 1; + for(uint j = tensorDims - 1; j > 0; j--) + { + paramStrides[j] = val; + val *= paramShape[j]; + } + paramStrides[0] = val; + } +} + +// -------------------- Set 2 - normalization kernels -------------------- + +template +__global__ void normalize_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint2 maxParamVolumeAndAxisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint maxParamVolume = maxParamVolumeAndAxisMask.x; + uint axisMask = maxParamVolumeAndAxisMask.y; + uint paramIndex = id_z * maxParamVolume; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_x; + else if (axisMask == 2) + paramIndex += id_y; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
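+ // stddev supplied by the caller is inverted and scaled directly here (zero maps to 1.0f),
+ // whereas a device-computed stddev goes through the rsqrtf(stdDev * stdDev) path above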
(scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + T *dstPtr, + uint2 dstStridesDH, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint axisMask, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint paramIndex = 0; + // update paramIndex based on axisMask value + if (axisMask == 1) + paramIndex += id_y * lengthX + id_x; + else if (axisMask == 2) + paramIndex += id_z * lengthX + id_x; + else if (axisMask == 4) + paramIndex += id_z * lengthY + id_y; + else if (axisMask == 3) + paramIndex += id_x; + else if (axisMask == 5) + paramIndex += id_y; + else if (axisMask == 6) + paramIndex += id_z; + + uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f; + } + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +template +__global__ void normalize_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + T *dstPtr, + float *meanTensor, + float *stdDevTensor, + float2 scaleAndShift, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint2 maxParamVolumeAndBufferLength, + uint tensorDims, + bool computeStdDev) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint maxBufferLength = maxParamVolumeAndBufferLength.y; + + if (id_x >= maxBufferLength) + return; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint maxParamVolume = maxParamVolumeAndBufferLength.x; + uint srcIdx = id_z * maxBufferLength; + + uint paramIndex = id_z * maxParamVolume; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + float mean = meanTensor[paramIndex]; + float stdDev = stdDevTensor[paramIndex]; + float scale = scaleAndShift.x; + float shift = scaleAndShift.y; + float invStdDev; + if (computeStdDev) + { + float stdDevSquare = stdDev * stdDev; + invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0; + } + else + { + invStdDev = (stdDev) ? 
(scale * (1.0f / stdDev)) : 1.0f; + } + uint dstIdx = id_z * maxBufferLength + id_x; + float outVal = fmaf((static_cast(srcPtr[srcIdx]) - mean), invStdDev, shift); + normalize_check_and_store(outVal, &dstPtr[dstIdx]); +} + +// -------------------- Set 3 - mean and stddev compute kernels device helpers -------------------- + +__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem) +{ + for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } +} + +// -------------------- Set 4 - mean compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_mean_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise mean + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_x; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNH.y; + } + meanTensor[dstIdx] = accum / static_cast(height); + } + } + // compute partial sums needed for row wise mean + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial sums need for computing mean over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using 
all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_mean_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute mean along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.y; + } + meanTensor[dstIdx] = accum / static_cast(lengthZ); + } + // compute mean along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + accum += static_cast(srcPtr[srcIdx]); + srcIdx += srcStridesNZY.z; + } + meanTensor[dstIdx] = accum / static_cast(lengthY); + } + // compute mean along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i 
+= 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x; + meanTensor[dstIdx] = accum_f8.f1[0] / static_cast(lengthX); + } + // compute partial sums required for computing mean along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + partialSumRowPtr_smem[hipThreadIdx_x] = static_cast(srcPtr[srcIdx]); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial sums required for computing mean along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + 
hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute mean along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + accum += static_cast(srcPtr[srcIdx + tid_x]); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + meanTensor[dstIdx] = partialSum_smem[0] / static_cast(lengthX * lengthZ); + } + // compute partial sums required for computing mean along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_mean_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * 
tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + atomicAdd(&meanTensor[paramBase + paramIndex], static_cast(srcPtr[srcIdx])); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + atomicAdd(&sh_mem[paramIndex], static_cast(srcPtr[srcIdx])); + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 5 - stddev compute kernels (reduction stage 1) -------------------- + +template +__global__ void compute_stddev_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + float *meanTensor, + float *stdDevTensor, + float *partialSumTensor, + uint *roiTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 4]; + uint yBegin = roi[0]; + uint xBegin = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + // compute column wise stdDev + if (axisMask == 1) + { + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_x; + float mean = meanTensor[paramIndex]; + if (id_x < width) + { + float accum = 0.0f; + for(int i = 0; i < height; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNH.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(height)); + } + } + // compute partial mean subtracted squared sums needed for row wise stdDev + else if (axisMask == 2) + { + id_x *= 8; + __shared__ float partialRowSum_smem[256]; + partialRowSum_smem[hipThreadIdx_x] = 0.0f; + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + 
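/* The two calls that follow are the second pass of a two-pass standard deviation:
   the mean produced by the earlier mean kernels is subtracted from each loaded value,
   the difference is squared, and the squared deviations are block-reduced so that a
   later reduction stage can take sqrt(sum / count). A minimal scalar sketch of the
   same math, using a hypothetical helper name purely for illustration (not an RPP API):

       inline float stddev_reference(const float *x, int n, float mean)
       {
           float sumSq = 0.0f;
           for (int k = 0; k < n; k++)
           {
               float d = x[k] - mean;   // pass 2: subtract the mean found in pass 1
               sumSq += d * d;          // accumulate squared deviations
           }
           return sqrtf(sumSq / static_cast<float>(n));   // population standard deviation (divide by N)
       }
*/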
rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialRowSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x; + partialSumTensor[paramIndex] = partialRowSum_smem[0]; + } + } + // compute partial mean subtracted squared sums need for computing stdDev over entire rows and columns + else if (axisMask == 3) + { + id_x *= 8; + __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_y >= height) || (id_x >= width)) + { + return; + } + + int xAlignedLength = width & ~7; // alignedLength for vectorized global loads + int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin); + + float mean = meanTensor[id_z]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (id_x + 8 > width) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0]; + } + } +} + +template +__global__ void compute_stddev_3d_hip_tensor(T *srcPtr, + uint3 srcStridesNZY, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + float *partialSumTensor, + uint maxParamVolume, + uint axisMask) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + 
hipThreadIdx_z; + + uint *roi = &roiTensor[id_z * 6]; + uint zBegin = roi[0]; + uint yBegin = roi[1]; + uint xBegin = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + // compute stddev along z direction + if (axisMask == 1) + { + if (id_x >= lengthX || id_y >= lengthY) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthZ; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.y; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthZ)); + } + // compute stddev along y direction + else if (axisMask == 2) + { + if (id_x >= lengthX || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin); + uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for(uint i = 0; i < lengthY; i++) + { + float val = (static_cast(srcPtr[srcIdx]) - mean); + accum += (val * val); + srcIdx += srcStridesNZY.z; + } + stdDevTensor[paramIndex] = sqrtf(accum / static_cast(lengthY)); + } + // compute stddev along x direction + else if (axisMask == 4) + { + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = i + 8 - lengthX; + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + + stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast(lengthX)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y direction + else if (axisMask == 3) + { + for(uint xIndex = 0; xIndex < lengthX; xIndex++) + { + __shared__ float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + if ((id_x >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + uint paramIndex = id_z * maxParamVolume + xIndex; + float mean = meanTensor[paramIndex]; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex); + float val = static_cast(srcPtr[srcIdx]) - mean; + partialSumRowPtr_smem[hipThreadIdx_x] = (val * val); + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; 
threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax]; + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in z dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + __syncthreads(); + } + } + // compute partial mean subtracted squared sums required for computing stdDev along y-x direction + else if (axisMask == 6) + { + __shared__ float partialSum_smem[256]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (id_x >= lengthY || id_y >= lengthZ) + return; + + uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y; + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin; + + uint paramIndex = id_z * maxParamVolume + id_y; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 accum_f8; + accum_f8.f4[0] = (float4)0.0f; + accum_f8.f4[1] = (float4)0.0f; + for(int i = 0; i < lengthX; i += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + if (i + 8 > lengthX) + { + int xDiff = lengthX - i; + for(int j = xDiff; j < 8; j++) + src_f8.f1[j] = 0.0f; + } + accum_f8.f4[0] += src_f8.f4[0]; + accum_f8.f4[1] += src_f8.f4[1]; + srcIdx += 8; + } + accum_f8.f4[0] += accum_f8.f4[1]; + accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]); + partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0]; + __syncthreads(); + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + { + uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x; + partialSumTensor[dstIdx] = partialSum_smem[0]; + } + } + // compute stddev along z-x direction + else if (axisMask == 5) + { + __shared__ float partialSum_smem[32]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + if (hipBlockIdx_x >= lengthY) + return; + + uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x; + float mean = meanTensor[paramIndex]; + float accum = 0.0f; + for (uint i = 0; i < lengthZ; i++) + { + uint tid_x = hipThreadIdx_x; + uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin; + while (tid_x < lengthX) + { + float val = (static_cast(srcPtr[srcIdx + tid_x]) - mean); + accum += (val * val); + tid_x += hipBlockDim_x; + } + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // perform reduction on shared memory sums + reduction_sum_x_hip(partialSum_smem); + + if (hipThreadIdx_x == 0) + stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast(lengthX * lengthZ)); + } + // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction + else if (axisMask == 7) + { + id_x *= 8; + __shared__ 
float partialSum_smem[16][16]; + float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared + partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads + + uint xIndex = id_x % srcStridesNZY.z; + uint yIndex = id_x / srcStridesNZY.z; + if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ)) + { + return; + } + + int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads + int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength + uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin); + + uint paramIndex = id_z * maxParamVolume; + float mean = meanTensor[paramIndex]; + float4 mean_f4 = static_cast(mean); + + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4); + rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8); + + if (xIndex + 8 > lengthX) + { + for(int i = xDiff; i < 8; i++) + src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f + } + src_f8.f4[0] += src_f8.f4[1]; + partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] + + src_f8.f1[1] + + src_f8.f1[2] + + src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared + __syncthreads(); // syncthreads after Shared load + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + reduction_sum_x_hip(partialSumRowPtr_smem); + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment]; + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + { + uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x); + partialSumTensor[dstIdx] = partialSumRowPtr_smem[0]; + } + } + } +} + +template +__global__ void compute_stddev_nd_hip_tensor(T *srcPtr, + uint *srcMaxDims, + uint *srcStrides, + float *meanTensor, + float *stdDevTensor, + uint *roiTensor, + uint *paramShapeTensor, + uint *paramStridesTensor, + uint maxParamVolume, + uint tensorDims, + uint maxBufferLength) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *begin = &roiTensor[id_z * tensorDims * 2]; + uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *paramStrides = ¶mStridesTensor[id_z * tensorDims]; + uint srcIdx = id_z * maxBufferLength; + uint paramBase = id_z * maxParamVolume; + uint paramIndex = 0; + + if (maxParamVolume > MAX_SHARED_MEMORY_SIZE) + { + if (id_x >= maxBufferLength) + return; + + // validate if id_x is within the roi of input and compute paramIndex if valid + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + return; + paramIndex += (maxParamVolume > 1) ? 
((coord % paramShape[i]) * paramStrides[i]) : 0; + } + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val)); + } + else + { + + if (id_x >= (hipBlockDim_x * hipGridDim_x)) + return; + + // if number of means needed to compute is within in the max shared memory size + // use shared memory for atomic addition to reduce global memory traffic + bool isValid = true; + for (int i = 0; i < tensorDims; i++) + { + uint coord = id_x / srcStrides[i] % srcMaxDims[i]; + srcIdx += ((begin[i] + coord) * srcStrides[i]); + if (coord >= length[i]) + { + isValid = false; + break; + } + paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0; + } + + extern __shared__ float sh_mem[]; + sh_mem[hipThreadIdx_x] = 0.0f; + __syncthreads(); + + if (isValid && id_x < maxBufferLength) + { + float val = static_cast(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex]; + atomicAdd(&sh_mem[paramIndex], (val * val)); + } + __syncthreads(); + + if (hipThreadIdx_x < maxParamVolume) + atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]); + } +} + +// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) -------------------- + +__global__ void reduce_final_result_hip(float *partialSumTensor, + uint numPartialSums, + float *meanTensor, + float *stdDevTensor, + bool isMean, + uint *roiTensor, + uint axisMask, + uint tensorDims) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint meanFactor; + if (tensorDims == 3) + { + uint lengthZ = roi[0]; + uint lengthY = roi[1]; + uint lengthX = roi[2]; + + if (axisMask == 3) + meanFactor = lengthZ * lengthY; + else if (axisMask == 6) + meanFactor = lengthY * lengthX; + else if (axisMask == 7) + meanFactor = lengthZ * lengthY * lengthX; + } + else if (tensorDims == 2) + { + uint lengthY = roi[0]; + uint lengthX = roi[1]; + + if (axisMask == 2) + meanFactor = lengthX; + else if (axisMask == 3) + meanFactor = lengthY * lengthX; + } + + __shared__ float partialSum_smem[16]; + partialSum_smem[hipThreadIdx_x] = 0.0f; + + float accum = 0.0f; + while(id_x < numPartialSums) + { + uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x; + accum += partialSumTensor[srcIdx]; + id_x += hipBlockDim_x; + } + partialSum_smem[hipThreadIdx_x] = accum; + __syncthreads(); + + // Now do block level reduction sum + reduction_sum_x_hip(partialSum_smem); + + // Final store to dst + if (hipThreadIdx_x == 0) + { + if (isMean) + meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor; + else + stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor); + } +} + +__global__ void final_reduction_nd_hip_tensor(float *meanTensor, + float *stdDevTensor, + uint *paramShapeTensor, + uint *roiTensor, + uint tensorDims, + uint maxParamVolume, + bool isMean) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + uint *paramShape = ¶mShapeTensor[id_z * tensorDims]; + uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims]; + + uint divisionFactor = 1; + uint paramVolume = 1; + for(int i = 0; i < tensorDims; i++) + { + paramVolume *= paramShape[i]; + if (paramShape[i] == 1) + divisionFactor *= roi[i]; + } + + if 
(id_x >= paramVolume) + return; + + uint paramIndex = id_z * maxParamVolume + id_x; + if (isMean) + meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor; + else + stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor); +} + +// -------------------- Set 7 - mean and stddev compute kernels launch helpers -------------------- + +void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Y direction + case 1: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along X direction + case 2: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XY direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr, + int &globalThreads_x, + int &globalThreads_y, + int &globalThreads_z, + int &localThreads_x, + int &localThreads_y, + int &localThreads_z, + Rpp32u axisMask, + Rpp32f *partialSumArr, + rpp::Handle& handle) +{ + switch (axisMask) + { + // compute along Z direction + case 1: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along Y direction + case 2: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[3] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along YZ direction + case 3: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * 
globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along X direction + case 4: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XZ direction + case 5: + { + localThreads_x = 32; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = srcGenericDescPtr->dims[2]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + break; + } + // compute along XY direction + case 6: + { + localThreads_x = 256; + localThreads_y = 1; + localThreads_z = 1; + globalThreads_x = static_cast (ceil((float)srcGenericDescPtr->dims[2] / localThreads_x)); + globalThreads_y = srcGenericDescPtr->dims[1]; + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + // compute along XYZ direction + case 7: + { + localThreads_x = 16; + localThreads_y = 16; + localThreads_z = 1; + Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3; + globalThreads_x = static_cast (ceil((float)numValues / localThreads_x)); + globalThreads_y = static_cast (ceil((float)srcGenericDescPtr->dims[1] / localThreads_y)); + globalThreads_z = srcGenericDescPtr->dims[0]; + + Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z; + hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream()); + hipStreamSynchronize(handle.GetStream()); + break; + } + } +} + +// -------------------- Set 8 - mean and stddev compute kernels executor -------------------- + +template +RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + bool isMean, + Rpp32u *roiTensor, + Rpp32u axisMask, + Rpp32u tensorDims, + Rpp32u maxParamVolume, + Rpp32u *paramShape, + Rpp32u *paramStrides, + rpp::Handle& handle) +{ + Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32u partialSumArrLength, partialSumBlocksPerSample; + + int globalThreads_x, globalThreads_y, globalThreads_z; + int localThreads_x, localThreads_y, localThreads_z; + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_2d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, 
localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + partialSumArr, + roiTensor, + maxParamVolume, + axisMask); + } + + if (axisMask == 2) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + else if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else if (tensorDims == 3) + { + // set the block and grid configuration based on axisMask + set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z, + localThreads_x, localThreads_y, localThreads_z, axisMask, + partialSumArr, handle); + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + else + { + hipLaunchKernelGGL(compute_stddev_3d_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + meanTensor, + stdDevTensor, + roiTensor, + partialSumArr, + maxParamVolume, + axisMask); + } + + // perform final reduction on block wise sums for below cases + // reduce on YZ partial sums + if (axisMask == 3) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XY partial sums + if (axisMask == 6) + { + partialSumBlocksPerSample = globalThreads_x; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + // reduce on XYZ block partial sums + else if (axisMask == 7) + { + partialSumBlocksPerSample = globalThreads_x * globalThreads_y; + hipLaunchKernelGGL(reduce_final_result_hip, + dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]), + dim3(16, 1, 1), + 0, + handle.GetStream(), + partialSumArr, + partialSumBlocksPerSample, + meanTensor, + stdDevTensor, + isMean, + roiTensor, + axisMask, + tensorDims); + } + } + else + { + // interpret the input as 1D 
tensor + globalThreads_x = srcGenericDescPtr->strides[0]; + globalThreads_y = 1; + globalThreads_z = srcGenericDescPtr->dims[0]; + Rpp32u batchSize = globalThreads_z; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + + Rpp32u shared_memory_size = 0; + Rpp32u block_size = 1024; + if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE) + { + if (maxParamVolume <= 32) + shared_memory_size = 32; + else if (maxParamVolume <= 64) + shared_memory_size = 64; + else if (maxParamVolume <= 128) + shared_memory_size = 128; + else if (maxParamVolume <= 256) + shared_memory_size = 256; + else if (maxParamVolume <= 512) + shared_memory_size = 512; + else + shared_memory_size = MAX_SHARED_MEMORY_SIZE; + block_size = shared_memory_size; + } + + if (isMean) + { + hipLaunchKernelGGL(compute_mean_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + else + { + hipLaunchKernelGGL(compute_stddev_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(block_size, 1, 1), + shared_memory_size, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + meanTensor, + stdDevTensor, + roiTensor, + paramShape, + paramStrides, + maxParamVolume, + tensorDims, + srcGenericDescPtr->strides[0]); + } + hipLaunchKernelGGL(final_reduction_nd_hip_tensor, + dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + meanTensor, + stdDevTensor, + paramShape, + roiTensor, + tensorDims, + maxParamVolume, + isMean); + } + hipStreamSynchronize(handle.GetStream()); + return RPP_SUCCESS; +} + +// -------------------- Set 9 - normalization kernel executor -------------------- + +template +RppStatus hip_exec_normalize_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u batchSize = srcGenericDescPtr->dims[0]; + Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + + // create buffer for paramShape and paramStride needed for generic kernel + Rpp32u *paramShape, *paramStrides; + paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem; + paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims); + + // do initial preprocessing, compute maxParamVolue and fill the values for paramShape and paramStrides + Rpp32u maxParamVolume; + if (tensorDims == 2 || tensorDims == 3) + normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims, + axisMask, maxParamVolume); + else + normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask, + paramShape, paramStrides, maxParamVolume); + + bool computeMean = computeMeanStddev & 1; // if 0th bit in computeMeanStddev is set, computeMean is set to true. Otherwise it is set to false + bool computeStdDev = computeMeanStddev & 2; // if 1st bit in computeMeanStddev is set, computeStdDev is set to true. 
Otherwise it is set to false + if ((!computeMean) && (!computeStdDev)) + maxParamVolume = 0; + + // if computeMean is set, compute mean values by processing over input based on axisMask values + if (computeMean) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values + if (computeStdDev) + hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false, + roiTensor, axisMask, tensorDims, maxParamVolume, + paramShape, paramStrides, handle); + + // based on number of dimensions call the corresponding kernel + if (tensorDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(normalize_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + make_uint2(maxParamVolume, axisMask), + computeStdDev); + } + else if (tensorDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(normalize_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &meanTensor[batchCount * maxParamVolume], + &stdDevTensor[batchCount * maxParamVolume], + make_float2(scale, shift), + &roiTensor[batchCount * 6], + axisMask, + computeStdDev); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = dstGenericDescPtr->strides[0]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + // allocate tensor for src strides + Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1]; + Rpp32u *srcStrides = &srcGenericDescPtr->strides[1]; + hipLaunchKernelGGL(normalize_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)), + dim3(1024, 1, 1), + 0, + handle.GetStream(), + srcPtr, + srcMaxDims, + srcStrides, + dstPtr, + meanTensor, + stdDevTensor, + make_float2(scale, shift), + roiTensor, + paramShape, + paramStrides, + make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]), + tensorDims, + computeStdDev); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp index e35e5c514..3285ee756 100644 --- a/src/modules/rppi_validate.hpp +++ 
b/src/modules/rppi_validate.hpp @@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels) } else if(layout == RpptLayout::NHWC || layout == RpptLayout::NDHWC) { - if (channels == 3) // PKD3 - { - layoutParams.channelParam = 1; - layoutParams.bufferMultiplier = 3; - } + //PKD + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = channels; } return layoutParams; } diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp index 28313a88f..ef69a49bb 100644 --- a/src/modules/rppt_tensor_statistical_operations.cpp +++ b/src/modules/rppt_tensor_statistical_operations.cpp @@ -241,13 +241,109 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** normalize_ND ********************/ + +RppStatus rppt_normalize_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams; + Rpp32u tensorDim = srcGenericDescPtr->numDims - 1; + if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]); + else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]); + else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]); + else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC)) + layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]); + + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + normalize_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + normalize_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + 
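/* Each data-type branch of this dispatcher, including this I8 path, applies the same
   element-wise transform: subtract the (provided or internally computed) mean, divide
   by the standard deviation, multiply by scale, and add shift. Bit 0 of
   computeMeanStddev requests internal mean computation and bit 1 requests internal
   stddev computation. A minimal per-element sketch, with hypothetical names and shown
   only for clarity:

       inline float normalize_element(float in, float mean, float stdDev,
                                      float scale, float shift)
       {
           return ((in - mean) / stdDev) * scale + shift;
       }

       // decoding the computeMeanStddev flag
       bool computeMean   = computeMeanStddev & 1;   // bit 0
       bool computeStdDev = computeMeanStddev & 2;   // bit 1
*/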
normalize_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ +#ifdef GPU_SUPPORT + /******************** tensor_sum ********************/ -#ifdef HIP_COMPILE + RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, @@ -256,6 +352,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (tensorSumArrLength < srcDescPtr->n) // sum of single channel @@ -317,10 +414,12 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } /******************** tensor_min ********************/ - RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMinArr, @@ -329,6 +428,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (imageMinArrLength < srcDescPtr->n) // min of single channel @@ -378,6 +478,9 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } /******************** tensor_max ********************/ @@ -390,6 +493,7 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptRoiType roiType, rppHandle_t rppHandle) { +#ifdef HIP_COMPILE if (srcDescPtr->c == 1) { if (imageMaxArrLength < srcDescPtr->n) // max of single channel @@ -439,5 +543,90 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, } return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend } + +RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u axisMask, + Rpp32f *meanTensor, + Rpp32f *stdDevTensor, + Rpp8u computeMeanStddev, + Rpp32f scale, + Rpp32f shift, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if 
((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_normalize_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_normalize_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + axisMask, + meanTensor, + stdDevTensor, + computeMeanStddev, + scale, + shift, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; #endif // backend +} + +#endif // GPU_SUPPORT diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 82ed65309..77052cabe 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -83,7 +83,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) @@ -105,7 +105,7 @@ if(Python3_FOUND) if(NIFTI_FOUND) add_test( NAME rpp_qa_tests_tensor_voxel_hip_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3 + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index 26017065e..a0bd42fa0 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND) link_directories(${ROCM_PATH}/lib /usr/local/lib) add_executable(Tensor_hip Tensor_hip.cpp) + add_executable(Tensor_misc_hip Tensor_misc_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17") target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) + target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device) else() message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}") endif() diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp new file mode 100644 index 000000000..96197f432 --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -0,0 +1,231 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "../rpp_test_suite_misc.h" + +int main(int argc, char **argv) +{ + // Handle inputs + const int MIN_ARG_COUNT = 9; + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_misc_hip