diff --git a/include/rppdefs.h b/include/rppdefs.h
index f7cd7a711..7d49c457d 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -1045,6 +1045,7 @@ typedef struct
Rpp64u* dstBatchIndex;
Rpp32u* inc;
Rpp32u* dstInc;
+ hipMemRpp32u scratchBuf;
} memGPU;
/*! \brief RPP HIP-HOST memory management
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 3cb49a82b..7c2d3318b 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -150,6 +150,49 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t imageMaxArr, Rpp32u imageMaxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Normalize Generic augmentation on HOST backend
+ * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. Also supports a toggle variant (NHWC->NCHW) for 3D inputs.
+ * \param [in] srcPtr source tensor memory in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] axisMask bitmask of the axes along which normalization is performed
+ * \param [in] meanTensor mean values to be subtracted from the input
+ * \param [in] stdDevTensor standard deviation values used to scale the input
+ * \param [in] computeMeanStddev flag for internal computation of mean and stddev (bit 0 enables computeMean, bit 1 enables computeStddev; 0 means both are provided externally)
+ * \param [in] scale value by which the data is multiplied after mean subtraction
+ * \param [in] shift value added after scaling
+ * \param [in] roiTensor ROI values (begin and length per dimension) for each tensor in the batch
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_normalize_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle);
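+// Usage note (illustrative, not normative): axisMask is a bitmask over the tensor axes, so for a
+// 3D HWC input axisMask = 3 (binary 011) reduces over axes 0 and 1 and yields one mean/stddev pair
+// per channel; computeMeanStddev = 3 asks RPP to compute both values internally, while 0 uses the
+// externally supplied meanTensor and stdDevTensor.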
+
+#ifdef GPU_SUPPORT
+/*! \brief Normalize Generic augmentation on HIP backend
+ * \details Normalizes the input generic ND buffer by removing the mean and dividing by the standard deviation for a given ND Tensor.
+ * \param [in] srcPtr source tensor memory in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor memory in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] axisMask bitmask of the axes along which normalization is performed
+ * \param [in] meanTensor mean values to be subtracted from the input
+ * \param [in] stdDevTensor standard deviation values used to scale the input
+ * \param [in] computeMeanStddev flag for internal computation of mean and stddev (bit 0 enables computeMean, bit 1 enables computeStddev; 0 means both are provided externally)
+ * \param [in] scale value by which the data is multiplied after mean subtraction
+ * \param [in] shift value added after scaling
+ * \param [in] roiTensor ROI values (begin and length per dimension) for each tensor in the batch
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u axisMask, Rpp32f *meanTensor, Rpp32f *stdDevTensor, Rpp8u computeMeanStddev, Rpp32f scale, Rpp32f shift, Rpp32u *roiTensor, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index 19121957b..6c327fdd6 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -2537,6 +2537,127 @@ inline Rpp32f rpp_hsum_ps(__m256 x)
return _mm_cvtss_f32(sum);
}
+/* Computes inverse square root */
+inline Rpp32f rpp_rsqrt_ps(Rpp32f x)
+{
+ __m128 X = _mm_set_ss(x);
+ __m128 tmp = _mm_rsqrt_ss(X);
+ Rpp32f y = _mm_cvtss_f32(tmp);
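+ // refine the hardware rsqrt estimate with one Newton-Raphson step: y1 = y0 * (1.5 - 0.5 * x * y0 * y0)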
+ return y * (1.5f - x * 0.5f * y * y);
+}
+
+/* Compute inverse square root */
+/* The SSE version matches the raw C version to 6 decimal places due to the Newton-Raphson approximation */
+inline void rpp_rsqrt_sse(Rpp32f *input, Rpp64s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f mul)
+{
+ Rpp64s i = 0;
+ __m128 rdivx4 = _mm_set1_ps(rdiv);
+ __m128 mulx4 = _mm_set1_ps(mul * 0.5f);
+ if (eps) // epsilon is present - no need for masking, but we need to add it
+ {
+ __m128 epsx4 = _mm_set1_ps(eps);
+ for (; i + 4 <= numElements; i += 4)
+ {
+ __m128 x = _mm_loadu_ps(&input[i]);
+ x = _mm_mul_ps(x, rdivx4);
+ x = _mm_add_ps(x, epsx4);
+ __m128 y = _mm_rsqrt_ps(x);
+ __m128 y2 = _mm_mul_ps(y, y);
+ __m128 xy2 = _mm_mul_ps(x, y2);
+ __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2);
+ y = _mm_mul_ps(y, three_minus_xy2);
+ y = _mm_mul_ps(y, mulx4);
+ _mm_storeu_ps(&input[i], y);
+ }
+ }
+ else
+ {
+ for (; i + 4 <= numElements; i += 4)
+ {
+ __m128 x = _mm_loadu_ps(&input[i]);
+ x = _mm_mul_ps(x, rdivx4);
+ __m128 mask = _mm_cmpneq_ps(x, xmm_p0);
+ __m128 y = _mm_rsqrt_ps(x);
+ y = _mm_and_ps(y, mask);
+ __m128 y2 = _mm_mul_ps(y, y);
+ __m128 xy2 = _mm_mul_ps(x, y2);
+ __m128 three_minus_xy2 = _mm_sub_ps(xmm_p3, xy2);
+ y = _mm_mul_ps(y, three_minus_xy2);
+ y = _mm_mul_ps(y, mulx4);
+ _mm_storeu_ps(&input[i], y);
+ }
+ }
+ if (eps)
+ {
+ for (; i < numElements; i++)
+ input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * mul;
+ }
+ else
+ {
+ for (; i < numElements; i++)
+ {
+ Rpp32f x = input[i] * rdiv;
+ input[i] = x ? rpp_rsqrt_ps(x) * mul : 0;
+ }
+ }
+}
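+// Usage note (illustrative): rpp_rsqrt_sse and rpp_rsqrt_avx overwrite each input[i] with
+// mul * 1/sqrt(input[i] * rdiv + eps), where mul is named scale in the AVX variant. The stddev
+// helpers below pass the accumulated sums of squared differences with rdiv = 1/N and mul = scale,
+// so the buffer ends up holding scale / stddev, the multiplier applied during normalization.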
+
+/* Compute inverse square root */
+/* The AVX2 version matches the raw C version to 6 decimal places due to the Newton-Raphson approximation */
+inline void rpp_rsqrt_avx(Rpp32f *input, Rpp32s numElements, Rpp32f eps, Rpp32f rdiv, Rpp32f scale)
+{
+ Rpp32s i = 0;
+ __m256 rdivx8 = _mm256_set1_ps(rdiv);
+ __m256 mulx8 = _mm256_set1_ps(scale * 0.5f);
+ if (eps) // epsilon is present - no need for masking, but we need to add it
+ {
+ __m256 epsx8 = _mm256_set1_ps(eps);
+ for (; i + 8 <= numElements; i += 8)
+ {
+ __m256 x = _mm256_loadu_ps(&input[i]);
+ x = _mm256_mul_ps(x, rdivx8);
+ x = _mm256_add_ps(x, epsx8);
+ __m256 y = _mm256_rsqrt_ps(x);
+ __m256 y2 = _mm256_mul_ps(y, y);
+ __m256 xy2 = _mm256_mul_ps(x, y2);
+ __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2);
+ y = _mm256_mul_ps(y, three_minus_xy2);
+ y = _mm256_mul_ps(y, mulx8);
+ _mm256_storeu_ps(&input[i], y);
+ }
+ }
+ else
+ {
+ for (; i + 8 <= numElements; i += 8)
+ {
+ __m256 x = _mm256_loadu_ps(&input[i]);
+ x = _mm256_mul_ps(x, rdivx8);
+ __m256 mask = _mm256_cmp_ps(x, avx_p0, _CMP_NEQ_OQ);
+ __m256 y = _mm256_rsqrt_ps(x);
+ y = _mm256_and_ps(y, mask);
+ __m256 y2 = _mm256_mul_ps(y, y);
+ __m256 xy2 = _mm256_mul_ps(x, y2);
+ __m256 three_minus_xy2 = _mm256_sub_ps(avx_p3, xy2);
+ y = _mm256_mul_ps(y, three_minus_xy2);
+ y = _mm256_mul_ps(y, mulx8);
+ _mm256_storeu_ps(&input[i], y);
+ }
+ }
+ if (eps)
+ {
+ for (; i < numElements; i++)
+ input[i] = rpp_rsqrt_ps(input[i] * rdiv + eps) * scale;
+ }
+ else
+ {
+ for (; i < numElements; i++)
+ {
+ Rpp32f x = input[i] * rdiv;
+ input[i] = x ? rpp_rsqrt_ps(x) * scale : 0;
+ }
+ }
+}
+
static inline void fast_matmul4x4_sse(float *A, float *B, float *C)
{
__m128 row1 = _mm_load_ps(&B[0]); // Row 0 of B
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
index 1f47e8ee7..4338483a9 100644
--- a/src/modules/CMakeLists.txt
+++ b/src/modules/CMakeLists.txt
@@ -97,6 +97,7 @@ if( "${BACKEND}" STREQUAL "HIP")
set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS}")
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
# Add HIP specific preprocessor flags
add_definitions(-DHIP_COMPILE)
@@ -111,6 +112,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
list(APPEND Rpp_Source ${CPPFILES} ${MOD_CL_CPP})
message("-- ${Green}OpenCL kernels added!${ColourReset}")
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
# Add OpenCL specific preprocessor flags
add_definitions(-DOCL_COMPILE)
@@ -125,6 +127,7 @@ elseif( "${BACKEND}" STREQUAL "CPU")
# Add CPU specific includes
set(INCLUDE_LIST ${CMAKE_SOURCE_DIR}/src/include/common/)
set_source_files_properties(rppt_tensor_audio_augmentations.cpp PROPERTIES COMPILE_FLAGS -mno-fma)
+ set_source_files_properties(rppt_tensor_statistical_operations.cpp PROPERTIES COMPILE_FLAGS -mno-fma) # no-fma flag added to get the exact output match with golden outputs
endif()
message("-- ${White}AMD RPP ${PROJECT_NAME} -- Include Directories:${INCLUDE_LIST}${ColourReset}")
add_compile_options("-Wno-unused-result")
diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp
index 32b8b62b5..7ddc238fa 100644
--- a/src/modules/cpu/host_tensor_statistical_operations.hpp
+++ b/src/modules/cpu/host_tensor_statistical_operations.hpp
@@ -28,5 +28,6 @@ SOFTWARE.
#include "kernel/tensor_sum.hpp"
#include "kernel/tensor_min.hpp"
#include "kernel/tensor_max.hpp"
+#include "kernel/normalize.hpp"
-#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
+#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp
new file mode 100644
index 000000000..dbe746d1a
--- /dev/null
+++ b/src/modules/cpu/kernel/normalize.hpp
@@ -0,0 +1,882 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+// Computes strides
+void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim)
+{
+ if (tensorDim > 0)
+ {
+ Rpp32u v = 1;
+ for (Rpp32u i = tensorDim - 1; i > 0; i--)
+ {
+ strides[i] = v;
+ v *= shape[i];
+ }
+ strides[0] = v;
+ }
+}
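+// Example (illustrative): for shape = {2, 3, 4}, compute_strides fills strides = {12, 4, 1},
+// i.e. the row-major element strides of the tensor.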
+
+// Recursive reduction helper function that accumulates the sum of squared differences of the input from the mean
+template <typename T>
+void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean)
+{
+ if (numElements > 32)
+ {
+ Rpp32s currElements = numElements >> 1;
+ Rpp32f tmp1 = 0, tmp2 = 0;
+
+ // reduce first half and accumulate
+ compute_diff_square_sum(tmp1, input, inputStride, currElements, mean);
+
+ // reduce second half and accumulate
+ compute_diff_square_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements, mean);
+
+ tmp1 += tmp2;
+ output += tmp1;
+ }
+ else
+ {
+ // reduce to a temporary
+ Rpp32f tmp = 0;
+ for (Rpp32s i = 0; i < numElements; i++)
+ {
+ Rpp32f curr = (input[i * inputStride] - mean);
+ auto curSq = curr * curr;
+ tmp += curSq;
+ }
+
+ // accumulate in target value
+ output += tmp;
+ }
+}
+
+// Recursive reduction helper function to sum up input values
+template <typename T>
+void compute_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements)
+{
+ if (numElements > 32)
+ {
+ Rpp32s currElements = numElements >> 1;
+ Rpp32f tmp1 = 0, tmp2 = 0;
+
+ // reduce first half and accumulate
+ compute_sum(tmp1, input, inputStride, currElements);
+
+ // reduce second half and accumulate
+ compute_sum(tmp2, input + currElements * inputStride, inputStride, numElements - currElements);
+
+ tmp1 += tmp2;
+ output += tmp1;
+ }
+ else
+ {
+ // reduce to a temporary
+ Rpp32f tmp = 0;
+ for (Rpp32s i = 0; i < numElements; i++)
+ tmp += input[i * inputStride];
+
+ // accumulate in target value
+ output += tmp;
+ }
+}
+
+// Computes mean for 2D inputs
+void compute_2D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f normFactor = 1.0 / dims[1];
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ meanPtr[i] = 0;
+ compute_sum(meanPtr[i], srcPtrTemp, stride[0], dims[1]);
+ srcPtrTemp += stride[1];
+ meanPtr[i] *= normFactor;
+ }
+}
+
+// Computes inverse stddev for 2D inputs
+void compute_2D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale)
+{
+
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f normFactor = (Rpp32f)(1.0 / dims[1]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ stdDevPtr[i] = 0;
+ compute_diff_square_sum(stdDevPtr[i], srcPtrTemp, stride[0], dims[1], meanPtr[i]);
+ srcPtrTemp += stride[1];
+ }
+ rpp_rsqrt_sse(stdDevPtr, (Rpp32s)dims[0], 0, normFactor, scale);
+}
+
+// Computes mean for 3D inputs
+void compute_3D_mean(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, bool isConsecutive = true)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ if(isConsecutive)
+ {
+ Rpp32f normFactor = 1.0 / dims[2];
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ float *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ Rpp32u index = i * dims[1] + j;
+ meanPtr[index] = 0;
+ compute_sum(meanPtr[index], srcPtrRow, stride[0], dims[2]);
+ srcPtrRow += stride[1];
+ meanPtr[index] *= normFactor;
+ }
+ srcPtrTemp += stride[2];
+ }
+ }
+ else
+ {
+ Rpp32f normFactor = 1.0 / (dims[1] * dims[2]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ meanPtr[i] = 0;
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ compute_sum(meanPtr[i], srcPtrRow, stride[0], dims[2]);
+ srcPtrRow += stride[1];
+ }
+ meanPtr[i] *= normFactor;
+ srcPtrTemp += stride[2];
+ }
+ }
+}
+
+// Computes inverse stddev for 3D inputs
+void compute_3D_inv_std_dev(Rpp32f *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32f scale, bool isConsecutive = true)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ if(isConsecutive)
+ {
+ Rpp32f normFactor = (Rpp32f)(1.0 / dims[2]);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ float *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ Rpp32u index = i * dims[1] + j;
+ stdDevPtr[index] = 0;
+ compute_diff_square_sum(stdDevPtr[index], srcPtrRow, stride[0], dims[2], meanPtr[index]);
+ srcPtrRow += stride[1];
+ }
+ srcPtrTemp += stride[2];
+ }
+ rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0] * dims[1]), 0, normFactor, scale);
+ }
+ else
+ {
+ Rpp32f normFactor = (Rpp32f)(1.0 / (dims[1] * dims[2]));
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ stdDevPtr[i] = 0;
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ compute_diff_square_sum(stdDevPtr[i], srcPtrRow, stride[0], dims[2], meanPtr[i]);
+ srcPtrRow += stride[1];
+ }
+ srcPtrTemp += stride[2];
+ }
+ rpp_rsqrt_avx(stdDevPtr, (Rpp32s)(dims[0]), 0, normFactor, scale);
+ }
+}
+
+// Computes mean for ND inputs
+template <typename T>
+void compute_ND_mean(T *srcPtr, Rpp32f *meanPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+ if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeSum when last dimension is to be normalized
+ compute_sum(meanPtr[index], srcPtr, stride[level], dims[level]);
+ else if(level == tensorDim) // Calls compute_sum when only the 1st axis needs to be normalized
+ compute_sum(meanPtr[index], srcPtr, stride[norm], dims[norm]);
+ else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store mean
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+ }
+ else if(axis[level] && (level == lastNormAxis)) // Increment level alone if it is the last axis to be normalized
+ compute_ND_mean(srcPtr, meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ else if(axis[level]) // Called when axis at present level needs to be normalized
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_mean(srcPtr + (i * stride[level]), meanPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ }
+}
+
+// Computes inverse stddev for ND inputs
+template <typename T>
+void compute_ND_stddev(T *srcPtr, Rpp32f *meanPtr, Rpp32f *stdDevPtr, Rpp32u *dims, Rpp32u *stride, Rpp32u *axis, Rpp32u tensorDim, Rpp32u level, Rpp32u index, Rpp32u size, Rpp32u norm, Rpp32u lastNormAxis)
+{
+ if((level == (tensorDim - 1)) && axis[tensorDim - 1]) // Calls computeDiffSumSquare when last dimension is to be normalized
+ compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[level], dims[level], meanPtr[index]);
+ else if(level == tensorDim) // Calls compute_diff_square_sum when only the 1st axis needs to be normalized
+ compute_diff_square_sum(stdDevPtr[index], srcPtr, stride[norm], dims[norm], meanPtr[index]);
+ else if (!axis[level]) // When that axis at present level isn't normalized, split srcPtr and modify index to store stddev
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index + (i * (size / dims[level])), size / dims[level], norm, lastNormAxis);
+ }
+ else if(axis[level] && (level == lastNormAxis)) // Increment level alone if it is the last axis to be normalized
+ compute_ND_stddev(srcPtr, meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ else if(axis[level]) // Called when axis at present level needs to be normalized
+ {
+ for(Rpp32u i = 0; i < dims[level]; i++)
+ compute_ND_stddev(srcPtr + (i * stride[level]), meanPtr, stdDevPtr, dims, stride, axis, tensorDim, level + 1, index, size, level, lastNormAxis);
+ }
+}
+
+// Computes normalization for 3D non-toggle variants
+void normalize_3D_tensor_nontoggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length)
+{
+ Rpp32s paramIdx = 0;
+ Rpp32s idx1 = 0;
+
+ for(Rpp32u i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtr;
+ Rpp32f *dstPtrRow = dstPtr;
+ for(Rpp32u j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrRow;
+ Rpp32f *dstPtrRowTemp = dstPtrRow;
+ idx1 = paramIdx;
+ for(Rpp32u k = 0; k < length[2]; k++)
+ {
+ *dstPtrRowTemp = ((*srcPtrRowTemp - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift;
+ if(k < length[2] - 1)
+ paramIdx += paramStride[2];
+ srcPtrRowTemp++;
+ dstPtrRowTemp++;
+ }
+ if(j < length[1] - 1)
+ paramIdx = (!paramStride[1]) ? idx1 : paramIdx + paramStride[1];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ if(i < length[0] - 1)
+ paramIdx = (!paramStride[0]) ? 0 : paramIdx + paramStride[0];
+ srcPtr += srcGenericDescPtr->strides[1];
+ dstPtr += dstGenericDescPtr->strides[1];
+ }
+}
+
+// Computes normalization for 3D toggle variants when the axis mask is set to 3
+void normalize_3D_tensor_axis3_toggle(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u *length)
+{
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f *dstPtrTemp[length[2]];
+ dstPtrTemp[0] = dstPtr;
+ for(Rpp32u i = 1; i < length[2]; i++)
+ dstPtrTemp[i] = dstPtrTemp[i-1] + dstGenericDescPtr->strides[1];
+ Rpp32s paramIdx = 0;
+
+ for(Rpp32u i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ Rpp32f *dstPtrRow[length[2]];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRow[l] = dstPtrTemp[l];
+ for(Rpp32u j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrRow;
+ Rpp32f *dstPtrRowTemp[length[2]];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRowTemp[l] = dstPtrRow[l];
+ for(Rpp32u k = 0; k < length[2]; k++)
+ {
+ *dstPtrRowTemp[k]++ = ((*srcPtrRowTemp++ - meanPtr[paramIdx]) * multiplierPtr[paramIdx]) + shift;
+ paramIdx += paramStride[2];
+ }
+ paramIdx = (!paramStride[1]) ? 0 : paramIdx + paramStride[1];
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrRow[l] += dstGenericDescPtr->strides[3];
+ }
+ srcPtrTemp += srcGenericDescPtr->strides[1];
+ for(Rpp32u l = 0; l < length[2]; l++)
+ dstPtrTemp[l] += dstGenericDescPtr->strides[2];
+ }
+}
+
+// Computes normalization for 3D non-toggle variants, optimized with AVX when the axis mask is set to 3 and the channel count is 16
+void normalize_3D_tensor_avx_axis3(Rpp32f *srcPtr, RpptGenericDescPtr srcGenericDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32f *meanPtr, Rpp32f *multiplierPtr, Rpp32f shift, Rpp32u *paramStride, Rpp32u bufferLength, Rpp32u *length)
+{
+ Rpp32u vectorIncrement = 16;
+ Rpp32u alignedLength = (bufferLength / 16) * 16;
+ Rpp32u outerDim = length[0];
+
+ // set shift, mean and stddev
+ __m256 pShift = _mm256_set1_ps(shift);
+ __m256 pMean1 = _mm256_loadu_ps(meanPtr);
+ __m256 pMean2 = _mm256_loadu_ps(meanPtr + 8);
+ __m256 pMultiplier1 = _mm256_loadu_ps(multiplierPtr);
+ __m256 pMultiplier2 = _mm256_loadu_ps(multiplierPtr + 8);
+
+ for(Rpp32u i = 0; i < outerDim; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + i * srcGenericDescPtr->strides[1];
+ Rpp32f *dstPtrTemp = dstPtr + i * dstGenericDescPtr->strides[1];
+
+ Rpp32u vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc1 = _mm256_loadu_ps(srcPtrTemp);
+ __m256 pSrc2 = _mm256_loadu_ps(srcPtrTemp + 8);
+ __m256 pDst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc1, pMean1), pMultiplier1), pShift);
+ __m256 pDst2 = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc2, pMean2), pMultiplier2), pShift);
+ _mm256_storeu_ps(dstPtrTemp, pDst1);
+ _mm256_storeu_ps(dstPtrTemp + 8, pDst2);
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ }
+}
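+// Note (illustrative): this AVX path assumes exactly 16 innermost (channel) values, so the 16
+// per-channel means and multipliers are loaded once into two 8-wide registers and reused for
+// every 16-element chunk of the buffer.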
+
+// Computes normalization for ND non-toggle variants for the i8 datatype
+void normalize_ND_tensor_nontoggle(Rpp32s *srcPtr, Rpp32u *srcStride, Rpp32f *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr,
+ Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx)
+{
+ Rpp32u idx1 = 0;
+ if(tensorDim == 1)
+ {
+ for(Rpp32u k = 0; k < length[level]; k++)
+ {
+ *dstPtr++ = (((Rpp32f)(*srcPtr + 128) - meanPtr[idx]) * multiplierPtr[idx]) + shift;
+ if(k < length[level] - 1)
+ idx += paramStride[level];
+ srcPtr++;
+ }
+ }
+ else
+ {
+ idx1 = idx;
+ for (Rpp32u i = 0; i < length[level]; i++)
+ {
+ normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx);
+ if(i < length[level] - 1)
+ idx = (!paramStride[level]) ? idx1 : idx + paramStride[level];
+ dstPtr += srcStride[level];
+ srcPtr += srcStride[level];
+ }
+ }
+}
+
+// Computes normalization for ND non-toggle variants
+template <typename T1, typename T2>
+void normalize_ND_tensor_nontoggle(T1 *srcPtr, Rpp32u *srcStride, T2 *dstPtr, Rpp32f *meanPtr, Rpp32f *multiplierPtr,
+ Rpp32f shift, Rpp32u *paramStride, Rpp32u *length, Rpp32u tensorDim, Rpp32u level, Rpp32u& idx)
+{
+ Rpp32u idx1 = 0;
+ if(tensorDim == 1)
+ {
+ T1 *srcPtrTemp = srcPtr;
+ T2 *dstPtrTemp = dstPtr;
+
+ for(Rpp32u k = 0; k < length[level]; k++)
+ {
+ *dstPtrTemp = (((T2)*srcPtrTemp - meanPtr[idx]) * multiplierPtr[idx]) + shift;
+ if(k < length[level] - 1)
+ idx += paramStride[level];
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ }
+ else
+ {
+ idx1 = idx;
+ for (Rpp32u i = 0; i < length[level]; i++)
+ {
+ normalize_ND_tensor_nontoggle(srcPtr, srcStride, dstPtr, meanPtr, multiplierPtr, shift, paramStride, length + 1, tensorDim - 1, level + 1, idx);
+ if(i < length[level] - 1)
+ idx = (!paramStride[level]) ? idx1 : idx + paramStride[level];
+ dstPtr += srcStride[level];
+ srcPtr += srcStride[level];
+ }
+ }
+}
+
+// Computes normalization for 2D
+void normalize_2D_tensor(Rpp32f *srcPtr, RpptGenericDescPtr srcDescPtr, Rpp32f *dstPtr, RpptGenericDescPtr dstDescPtr,
+ Rpp32f *meanPtr, Rpp32f *invStdDevPtr, Rpp32f shift, Rpp32u *dims, Rpp32u *paramStride)
+{
+ if (paramStride[1]) // Optimized with AVX when axis mask set to 2
+ {
+ Rpp32u vectorIncrement = 8;
+ Rpp32u bufferLength = dims[1];
+ Rpp32u alignedLength = (bufferLength / 8) * 8;
+
+ __m256 pShift = _mm256_set1_ps(shift);
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr + i * srcDescPtr->strides[1];
+ Rpp32f *dstPtrTemp = dstPtr + i * dstDescPtr->strides[1];
+
+ // set mean and stddev
+ Rpp32f mean = meanPtr[i];
+ Rpp32f invStdDev = invStdDevPtr[i];
+ __m256 pMean, pInvStdDev;
+ pMean = _mm256_set1_ps(mean);
+ pInvStdDev = _mm256_set1_ps(invStdDev);
+
+ Rpp32u vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength ; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc = _mm256_loadu_ps(srcPtrTemp);
+ __m256 pDst = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(pSrc, pMean), pInvStdDev), pShift);
+ _mm256_storeu_ps(dstPtrTemp, pDst);
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+ for(; vectorLoopCount < dims[1] ; vectorLoopCount ++)
+ *dstPtrTemp++ = (*srcPtrTemp++ - mean) * invStdDev + shift;
+ }
+ }
+ else
+ {
+ Rpp32s paramIdx = 0;
+ for(Rpp32u i = 0; i < dims[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr;
+ Rpp32f *dstPtrTemp = dstPtr;
+ for(Rpp32u j = 0; j < dims[1]; j++)
+ {
+ *dstPtrTemp++ = (*srcPtrTemp++ - meanPtr[paramIdx]) * invStdDevPtr[paramIdx] + shift;
+ paramIdx += paramStride[0];
+ }
+ paramIdx = (!paramStride[1]) ? 0 : paramIdx + paramStride[1];
+ srcPtr += srcDescPtr->strides[1];
+ dstPtr += dstDescPtr->strides[1];
+ }
+ }
+}
+
+// Collapses contiguous axes that require normalization into a single axis
+void collapse_axis(Rpp32u *tensorDim, Rpp32u *axis, Rpp32u *length, Rpp32u *newAxis, Rpp32u *newDims, Rpp32u *lastNormAxis)
+{
+ int skipped = 0, prev = -2, k = 0;
+ for(Rpp32u i = 0; i < *tensorDim; i++)
+ {
+ if(axis[i])
+ {
+ int temp = i - skipped;
+ if(temp != prev + 1)
+ {
+ newAxis[k] = 1;
+ newDims[k] = length[i];
+ prev = i;
+ k++;
+ }
+ else if(prev >= 0)
+ {
+ newDims[prev] *= length[i];
+ skipped++;
+ }
+ }
+ else
+ {
+ newDims[k] = length[i];
+ k++;
+ }
+ }
+ *tensorDim -= skipped;
+ for(Rpp32u i = 0; i < *tensorDim; i++)
+ {
+ if(newAxis[i])
+ *lastNormAxis = i;
+ }
+}
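+// Example (illustrative): for a 4D tensor with length = {2, 3, 4, 5} and axis = {1, 1, 0, 1},
+// the two leading normalized axes are merged, giving tensorDim = 3, newDims = {6, 4, 5},
+// newAxis = {1, 0, 1} and lastNormAxis = 2.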
+
+RppStatus normalize_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensorPtr,
+ Rpp32f *stdDevTensorPtr,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1;
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ Rpp32u maxSize = 1;
+ // Compute maxSize, since the number of mean/stddev values per tensor differs based on axisMask and tensorDims
+ for(int batch = 0; batch < batchSize; batch++)
+ {
+ Rpp32u size = 1;
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i];
+ maxSize = std::max(maxSize, size);
+ }
+
+ if(!computeMeanStddev)
+ {
+ for(Rpp32u i = 0; i < maxSize; i++)
+ stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i];
+ maxSize = 0;
+ }
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+
+ Rpp32f *srcPtrTemp, *dstPtrTemp, *meanTensor, *stdDevTensor;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ meanTensor = meanTensorPtr + batchCount * maxSize;
+ stdDevTensor = stdDevTensorPtr + batchCount * maxSize;
+
+ // Set all values in dst buffer to 0.0
+ for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++)
+ dstPtrTemp[cnt] = 0.0f;
+
+ Rpp32f *srcPtrChannel = srcPtrTemp;
+
+ if(tensorDims == 2) // Called for audio testcase and for any other 2D case
+ {
+ Rpp32u paramStride[2];
+ Rpp32u srcReductionDims[2], srcStride[2];
+ if (axisMask == 3)
+ {
+ srcStride[0] = srcStride[1] = srcGenericDescPtr->strides[2];
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[0] * length[1];
+ paramStride[0] = paramStride[1] = 0;
+ }
+ else if (axisMask == 1)
+ {
+ srcStride[0] = srcGenericDescPtr->strides[1];
+ srcStride[1] = srcGenericDescPtr->strides[2];
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[0];
+ paramStride[0] = 1;
+ paramStride[1] = 0;
+ }
+ else if (axisMask == 2)
+ {
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[1];
+ paramStride[0] = 0;
+ paramStride[1] = 1;
+ }
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ compute_2D_mean(srcPtrTemp, meanTensor, srcReductionDims, srcStride);
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ compute_2D_inv_std_dev(srcPtrTemp, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale);
+
+ normalize_2D_tensor(srcPtrTemp, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, length, paramStride);
+ }
+ else if(tensorDims == 3) // Called when a 3D tensor is passed to kernel
+ {
+ Rpp32u paramStride[3];
+ Rpp32u srcReductionDims[3], srcStride[3];
+ Rpp32u reductionDims;
+ bool isConsecutive = true;
+ switch(axisMask)
+ {
+ case 1: // Normalize axes 0
+ {
+ reductionDims = length[1] * length[2];
+ paramStride[0] = 0;
+ paramStride[1] = paramStride[2] = 1;
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[0];
+ srcStride[0] = srcGenericDescPtr->strides[1];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[2];
+ break;
+ }
+ case 2: // Normalize axes 1
+ {
+ reductionDims = length[0] * length[2];
+ paramStride[1] = 0;
+ paramStride[0] = paramStride[2] = 1;
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[1];
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[1];
+ break;
+ }
+ case 3: // Normalize axes 0, 1
+ {
+ reductionDims = length[2];
+ paramStride[0] = paramStride[1] = 0;
+ paramStride[2] = 1;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[2];
+ srcReductionDims[2] = length[0] * length[1];
+ srcStride[0] = srcGenericDescPtr->strides[2];
+ srcStride[1] = srcGenericDescPtr->strides[3];
+ srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ case 4: // Normalize across 2
+ {
+ reductionDims = length[0] * length[1];
+ paramStride[2] = 0;
+ paramStride[0] = paramStride[1] = 1;
+ srcReductionDims[0] = length[0];
+ srcReductionDims[1] = length[1];
+ srcReductionDims[2] = length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[2];
+ srcStride[2] = srcGenericDescPtr->strides[1];
+ break;
+ }
+ case 5: // Normalize across 0, 2
+ {
+ reductionDims = length[1];
+ paramStride[0] = paramStride[2] = 0;
+ paramStride[1] = 1;
+ srcReductionDims[0] = length[1];
+ srcReductionDims[1] = length[0];
+ srcReductionDims[2] = length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcStride[2] = srcGenericDescPtr->strides[2];
+ isConsecutive = false;
+ break;
+ }
+ case 6: // Normalize across 1, 2
+ {
+ reductionDims = length[0];
+ paramStride[1] = paramStride[2] = 0;
+ paramStride[0] = 1;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = length[0];
+ srcReductionDims[2] = length[1] * length[2];
+ srcStride[0] = srcGenericDescPtr->strides[3];
+ srcStride[1] = srcGenericDescPtr->strides[1];
+ srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ case 7: // Normalize across 0, 1, 2
+ {
+ reductionDims = 1;
+ paramStride[0] = paramStride[1] = paramStride[2] = 0;
+ srcReductionDims[0] = 1;
+ srcReductionDims[1] = 1;
+ srcReductionDims[2] = length[0] * length[1] * length[2];
+ srcStride[0] = srcStride[1] = srcStride[2] = srcGenericDescPtr->strides[3];
+ break;
+ }
+ default:
+ {
+ std::cout<<"Invalid Axis mask"<strides[i];
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ compute_3D_mean(srcPtrChannel, meanTensor, srcReductionDims, srcStride, isConsecutive);
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ compute_3D_inv_std_dev(srcPtrChannel, meanTensor, stdDevTensor, srcReductionDims, srcStride, scale, isConsecutive);
+
+ if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC) && (srcGenericDescPtr->dims[3] == 16))
+ normalize_3D_tensor_avx_axis3(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length[1] * layoutParams.bufferMultiplier, length);
+ else if((srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NHWC))
+ normalize_3D_tensor_nontoggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length);
+ else if((axisMask == 3) && (srcGenericDescPtr->layout == RpptLayout::NHWC) && (dstGenericDescPtr->layout == RpptLayout::NCHW))
+ normalize_3D_tensor_axis3_toggle(srcPtrChannel, srcGenericDescPtr, dstPtrTemp, dstGenericDescPtr, meanTensor, stdDevTensor, shift, paramStride, length);
+ }
+ else // Handles any other ND tensor passed to the kernel
+ {
+ // Compute length of input tensors as they differ based on axisMask and tensorDims
+ int size = 1;
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i];
+
+ Rpp32u totalElements = 1;
+ Rpp32u lastNormAxis = 0;
+ Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims];
+ // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis
+ memset(newAxis, 0, tensorDims * sizeof(Rpp32u));
+ memset(newDims, 0, tensorDims * sizeof(Rpp32u));
+
+ for(Rpp32u i = 0; i < tensorDims; i++)
+ {
+ axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+ totalElements *= axis[i] ? length[i] : 1;
+ srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1];
+ }
+
+ Rpp32u paramStride[tensorDims], srcStride[tensorDims];
+ collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis);
+ compute_strides(srcStride, newDims, tensorDims);
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ {
+ compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = 1.0 / totalElements;
+ for(int i = 0; i < size; i++)
+ meanTensor[i] *= normFactor;
+ }
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ {
+ compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = (Rpp32f)(1.0 / totalElements);
+ rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale);
+ }
+
+ for(Rpp32u i = 0; i < tensorDims; i++)
+ paramStride[i] = !newAxis[i];
+
+ Rpp32u idx = 0;
+ normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+template <typename T1, typename T2>
+RppStatus normalize_generic_host_tensor(T1 *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T2 *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensorPtr,
+ Rpp32f *stdDevTensorPtr,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ Rpp32u maxSize = 1;
+ for(int batch = 0; batch < batchSize; batch++)
+ {
+ Rpp32u size = 1; // length of input tensors differ based on axisMask and tensorDims
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : roiTensor[(tensorDims * 2 * batch) + tensorDims + i];
+ maxSize = std::max(maxSize, size);
+ }
+ if(!computeMeanStddev)
+ {
+ for(Rpp32u i = 0; i < maxSize; i++)
+ stdDevTensorPtr[i] = (!stdDevTensorPtr[i])? 1.0f : scale / stdDevTensorPtr[i];
+ maxSize = 0;
+ }
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ int size = 1;
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+
+ for(int i = 0; i < tensorDims; i++)
+ size *= ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : length[i];
+
+ T1 *srcPtrTemp;
+ T2 *dstPtrTemp;
+ Rpp32f *meanTensor, *stdDevTensor;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+ meanTensor = meanTensorPtr + batchCount * maxSize;
+ stdDevTensor = stdDevTensorPtr + batchCount * maxSize;
+
+ // Set all values in dst buffer to 0.0
+ for(int cnt = 0; cnt < dstGenericDescPtr->strides[0]; cnt++)
+ dstPtrTemp[cnt] = 0.0f;
+
+ T1 *srcPtrChannel = srcPtrTemp;
+
+ int totalElements = 1;
+ Rpp32u lastNormAxis = 0;
+ Rpp32u axis[tensorDims], newAxis[tensorDims], newDims[tensorDims];
+ // Initialize newAxis and newDims used to store final Axis and Dims after removing redundant axis
+ memset(newAxis, 0, sizeof(newAxis));
+ memset(newDims, 0, sizeof(newDims));
+
+ for(int i = 0; i < tensorDims; i++)
+ {
+ axis[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+ totalElements *= axis[i] ? length[i] : 1;
+ srcPtrChannel += begin[i] * srcGenericDescPtr->strides[i + 1];
+ }
+
+ Rpp32u paramStride[tensorDims], srcStride[tensorDims];
+ collapse_axis(&tensorDims, axis, length, newAxis, newDims, &lastNormAxis);
+ compute_strides(srcStride, newDims, tensorDims);
+
+ if(computeMeanStddev & 1) // Check if mean is to be computed internally
+ {
+ compute_ND_mean(srcPtrChannel, meanTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = 1.0 / totalElements;
+ for(int i = 0; i < size; i++)
+ meanTensor[i] *= normFactor;
+ }
+ if(computeMeanStddev & 2) // Check if stddev is to be computed internally
+ {
+ compute_ND_stddev(srcPtrChannel, meanTensor, stdDevTensor, newDims, srcStride, newAxis, tensorDims, 0, 0, size, 0, lastNormAxis);
+ Rpp32f normFactor = (Rpp32f)(1.0 / totalElements);
+ rpp_rsqrt_avx(stdDevTensor, (Rpp32s)size, 0, normFactor, scale);
+ }
+
+ for(int i = 0; i < tensorDims; i++)
+ paramStride[i] = !newAxis[i];
+
+ Rpp32u idx = 0;
+ normalize_ND_tensor_nontoggle(srcPtrChannel, srcStride, dstPtrTemp, meanTensor, stdDevTensor, shift, paramStride, newDims, tensorDims, 0, idx);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp
index c79e0a951..6923b9a3f 100644
--- a/src/modules/hip/hip_tensor_statistical_operations.hpp
+++ b/src/modules/hip/hip_tensor_statistical_operations.hpp
@@ -27,5 +27,6 @@ SOFTWARE.
#include "kernel/tensor_sum.hpp"
#include "kernel/tensor_min.hpp"
#include "kernel/tensor_max.hpp"
+#include "kernel/normalize.hpp"
#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/normalize.hpp b/src/modules/hip/kernel/normalize.hpp
new file mode 100644
index 000000000..384454ca7
--- /dev/null
+++ b/src/modules/hip/kernel/normalize.hpp
@@ -0,0 +1,1921 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+#define MAX_SHARED_MEMORY_SIZE 1024
+
+// -------------------- Set 0 - normalization kernels device helpers --------------------
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, uchar* dst)
+{
+ outVal = fmax(fminf(outVal, 255), 0);
+ *dst = static_cast<uchar>(outVal);
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, schar* dst)
+{
+ outVal = fmax(fminf(outVal, 127), -128);
+ *dst = static_cast<schar>(outVal);
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, float* dst)
+{
+ *dst = outVal;
+}
+
+__device__ __forceinline__ void normalize_check_and_store(float outVal, half* dst)
+{
+ *dst = static_cast<half>(outVal);
+}
+
+// -------------------- Set 1 - normalization kernel host helpers --------------------
+
+// setup function needed for 2D/3D optimized kernel variants when mean and stddev need to be computed internally
+void normalize_setup_2d_and_3d(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims,
+ Rpp32u axisMask, Rpp32u &maxParamVolume)
+{
+ maxParamVolume = 1;
+ uint axisSet[RPPT_MAX_DIMS];
+ for(int i = 0; i < tensorDims; i++)
+ axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+
+ for(uint i = 0; i < batchSize; i++)
+ {
+ // calculate the max param volume
+ Rpp32u paramVolume = 1;
+ Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims];
+ for(uint j = 0; j < tensorDims; j++)
+ paramVolume *= (axisSet[j]) ? 1 : roi[j];
+ maxParamVolume = std::max(maxParamVolume, paramVolume);
+ }
+}
+
+// setup function needed for ND generic kernel variants
+void normalize_setup_nd(Rpp32u *roiTensor, Rpp32u batchSize, Rpp32u tensorDims, Rpp32u axisMask,
+ Rpp32u *paramShapeTensor, Rpp32u *paramStridesTensor, Rpp32u &maxParamVolume)
+{
+ maxParamVolume = 1;
+ uint axisSet[RPPT_MAX_DIMS];
+ for(int i = 0; i < tensorDims; i++)
+ axisSet[i] = ((axisMask & (int)(pow(2, i))) >= 1) ? 1 : 0;
+
+ for(uint i = 0; i < batchSize; i++)
+ {
+ // calculate the param shape and param volume based on the axis mask
+ Rpp32u paramVolume = 1;
+ Rpp32u *roi = &roiTensor[tensorDims * 2 * i + tensorDims];
+ Rpp32u *paramShape = &paramShapeTensor[i * tensorDims];
+ for(uint j = 0; j < tensorDims; j++)
+ {
+ paramShape[j] = (axisSet[j]) ? 1 : roi[j];
+ paramVolume *= paramShape[j];
+ }
+ maxParamVolume = std::max(maxParamVolume, paramVolume);
+
+ // calculate the param strides from the param shape
+ Rpp32u *paramStrides = &paramStridesTensor[i * tensorDims];
+ Rpp32u val = 1;
+ for(uint j = tensorDims - 1; j > 0; j--)
+ {
+ paramStrides[j] = val;
+ val *= paramShape[j];
+ }
+ paramStrides[0] = val;
+ }
+}
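+// Example (illustrative): for a sample with roi lengths {2, 3, 4} and axisMask = 5 (axes 0 and 2),
+// normalize_setup_nd gives paramShape = {1, 3, 1}, paramVolume = 3 and paramStrides = {3, 1, 1},
+// i.e. one mean/stddev value per index of the non-normalized axis 1.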
+
+// -------------------- Set 2 - normalization kernels --------------------
+
+template <typename T>
+__global__ void normalize_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint2 maxParamVolumeAndAxisMask,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // width
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ if (id_x >= width || id_y >= height)
+ return;
+
+ uint maxParamVolume = maxParamVolumeAndAxisMask.x;
+ uint axisMask = maxParamVolumeAndAxisMask.y;
+ uint paramIndex = id_z * maxParamVolume;
+ // update paramIndex based on axisMask value
+ if (axisMask == 1)
+ paramIndex += id_x;
+ else if (axisMask == 2)
+ paramIndex += id_y;
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + id_x + xBegin;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+template <typename T>
+__global__ void normalize_3d_hip_tensor(T *srcPtr,
+ uint2 srcStridesDH,
+ T *dstPtr,
+ uint2 dstStridesDH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint axisMask,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ
+
+ uint *roi = roiTensor;
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ)
+ return;
+
+ uint paramIndex = 0;
+ // update paramIndex based on axisMask value
+ if (axisMask == 1)
+ paramIndex += id_y * lengthX + id_x;
+ else if (axisMask == 2)
+ paramIndex += id_z * lengthX + id_x;
+ else if (axisMask == 4)
+ paramIndex += id_z * lengthY + id_y;
+ else if (axisMask == 3)
+ paramIndex += id_x;
+ else if (axisMask == 5)
+ paramIndex += id_y;
+ else if (axisMask == 6)
+ paramIndex += id_z;
+
+ uint srcIdx = ((id_z + zBegin) * srcStridesDH.x) + ((id_y + yBegin) * srcStridesDH.y) + id_x + xBegin;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x;
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+template <typename T>
+__global__ void normalize_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ T *dstPtr,
+ float *meanTensor,
+ float *stdDevTensor,
+ float2 scaleAndShift,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint2 maxParamVolumeAndBufferLength,
+ uint tensorDims,
+ bool computeStdDev)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint maxBufferLength = maxParamVolumeAndBufferLength.y;
+
+ if (id_x >= maxBufferLength)
+ return;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = &paramShapeTensor[id_z * tensorDims];
+ uint *paramStrides = &paramStridesTensor[id_z * tensorDims];
+ uint maxParamVolume = maxParamVolumeAndBufferLength.x;
+ uint srcIdx = id_z * maxBufferLength;
+
+ uint paramIndex = id_z * maxParamVolume;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume != 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ float mean = meanTensor[paramIndex];
+ float stdDev = stdDevTensor[paramIndex];
+ float scale = scaleAndShift.x;
+ float shift = scaleAndShift.y;
+ float invStdDev;
+ if (computeStdDev)
+ {
+ float stdDevSquare = stdDev * stdDev;
+ invStdDev = stdDevSquare ? rsqrtf(stdDevSquare) * scale : 0;
+ }
+ else
+ {
+ invStdDev = (stdDev) ? (scale * (1.0f / stdDev)) : 1.0f;
+ }
+ uint dstIdx = id_z * maxBufferLength + id_x;
+ float outVal = fmaf((static_cast<float>(srcPtr[srcIdx]) - mean), invStdDev, shift);
+ normalize_check_and_store(outVal, &dstPtr[dstIdx]);
+}
+
+// -------------------- Set 3 - mean and stddev compute kernels device helpers --------------------
+
+__device__ __forceinline__ void reduction_sum_x_hip(float *partialSum_smem)
+{
+ for(uint threadMax = hipBlockDim_x / 2; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSum_smem[hipThreadIdx_x] += partialSum_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+}
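+// Note (illustrative): reduction_sum_x_hip is a shared-memory tree reduction; each pass halves the
+// number of active threads and adds the upper half of the buffer into the lower half, leaving the
+// block-wide sum in partialSum_smem[0].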
+
+// -------------------- Set 4 - mean compute kernels (reduction stage 1) --------------------
+
+template <typename T>
+__global__ void compute_mean_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ float *meanTensor,
+ float *partialSumTensor,
+ uint *roiTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ // compute column wise mean
+ if (axisMask == 1)
+ {
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_x;
+ if (id_x < width)
+ {
+ float accum = 0.0f;
+ for(int i = 0; i < height; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNH.y;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(height);
+ }
+ }
+ // compute partial sums needed for row wise mean
+ else if (axisMask == 2)
+ {
+ id_x *= 8;
+ __shared__ float partialRowSum_smem[256];
+ partialRowSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 256 threads and store in Shared
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialRowSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x;
+ partialSumTensor[paramIndex] = partialRowSum_smem[0];
+ }
+ }
+ // compute partial sums needed for computing mean over entire rows and columns
+ else if (axisMask == 3)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_mean_3d_hip_tensor(T *srcPtr,
+ uint3 srcStridesNZY,
+ float *meanTensor,
+ uint *roiTensor,
+ float *partialSumTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 6];
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ // compute mean along z direction
+ if (axisMask == 1)
+ {
+ if (id_x >= lengthX || id_y >= lengthY)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthZ; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNZY.y;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(lengthZ);
+ }
+ // compute mean along y direction
+ else if (axisMask == 2)
+ {
+ if (id_x >= lengthX || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthY; i++)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx]);
+ srcIdx += srcStridesNZY.z;
+ }
+ meanTensor[dstIdx] = accum / static_cast<float>(lengthY);
+ }
+ // compute mean along x direction
+ else if (axisMask == 4)
+ {
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i;
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f;
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ uint dstIdx = id_z * maxParamVolume + id_y * lengthY + id_x;
+ meanTensor[dstIdx] = accum_f8.f1[0] / static_cast<float>(lengthX);
+ }
+ // compute partial sums required for computing mean along z-y direction
+ else if (axisMask == 3)
+ {
+ for(uint xIndex = 0; xIndex < lengthX; xIndex++)
+ {
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_x >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex);
+ partialSumRowPtr_smem[hipThreadIdx_x] = static_cast<float>(srcPtr[srcIdx]);
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in z dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ __syncthreads();
+ }
+ }
+ // compute partial sums required for computing mean along y-x direction
+ else if (axisMask == 6)
+ {
+ __shared__ float partialSum_smem[256];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i;
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f;
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0];
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
+ partialSumTensor[dstIdx] = partialSum_smem[0];
+ }
+ }
+ // compute mean along z-x direction
+ else if (axisMask == 5)
+ {
+ __shared__ float partialSum_smem[32];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if (hipBlockIdx_x >= lengthY)
+ return;
+
+ uint dstIdx = id_z * maxParamVolume + hipBlockIdx_x;
+ float accum = 0.0f;
+ for (uint i = 0; i < lengthZ; i++)
+ {
+ uint tid_x = hipThreadIdx_x;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin;
+ while (tid_x < lengthX)
+ {
+ accum += static_cast<float>(srcPtr[srcIdx + tid_x]);
+ tid_x += hipBlockDim_x;
+ }
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // perform reduction on shared memory sums
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ meanTensor[dstIdx] = partialSum_smem[0] / static_cast<float>(lengthX * lengthZ);
+ }
+ // compute partial sums required for computing mean along z-y-x direction
+ else if (axisMask == 7)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ uint xIndex = id_x % srcStridesNZY.z;
+ uint yIndex = id_x / srcStridesNZY.z;
+ if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads
+ int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ if (xIndex + 8 > lengthX)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1];
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_mean_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ float *meanTensor,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint maxParamVolume,
+ uint tensorDims,
+ uint maxBufferLength)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = &paramShapeTensor[id_z * tensorDims];
+ uint *paramStrides = &paramStridesTensor[id_z * tensorDims];
+ uint srcIdx = id_z * maxBufferLength;
+ uint paramBase = id_z * maxParamVolume;
+ uint paramIndex = 0;
+
+ if (maxParamVolume > MAX_SHARED_MEMORY_SIZE)
+ {
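+ // too many outputs to stage in shared memory, so accumulate each valid element directly into the global meanTensor with atomics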
+ if (id_x >= maxBufferLength)
+ return;
+
+ // validate if id_x is within the roi of input and compute paramIndex if valid
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+ atomicAdd(&meanTensor[paramBase + paramIndex], static_cast<float>(srcPtr[srcIdx]));
+ }
+ else
+ {
+
+ if (id_x >= (hipBlockDim_x * hipGridDim_x))
+ return;
+
+ // if the number of means to be computed fits within the max shared memory size,
+ // use shared memory for the atomic additions to reduce global memory traffic
+ bool isValid = true;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ {
+ isValid = false;
+ break;
+ }
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ extern __shared__ float sh_mem[];
+ sh_mem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (isValid && id_x < maxBufferLength)
+ atomicAdd(&sh_mem[paramIndex], static_cast<float>(srcPtr[srcIdx]));
+ __syncthreads();
+
+ if (hipThreadIdx_x < maxParamVolume)
+ atomicAdd(&meanTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]);
+ }
+}
+
+// -------------------- Set 5 - stddev compute kernels (reduction stage 1) --------------------
+
+template <typename T>
+__global__ void compute_stddev_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ float *meanTensor,
+ float *stdDevTensor,
+ float *partialSumTensor,
+ uint *roiTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint yBegin = roi[0];
+ uint xBegin = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
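+ // stdDev computed below is the population standard deviation: sqrt(sum((x - mean)^2) / N)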
+
+ // compute column wise stdDev
+ if (axisMask == 1)
+ {
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + (yBegin * srcStridesNH.y) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_x;
+ float mean = meanTensor[paramIndex];
+ if (id_x < width)
+ {
+ float accum = 0.0f;
+ for(int i = 0; i < height; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNH.y;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(height));
+ }
+ }
+ // compute partial mean subtracted squared sums needed for row wise stdDev
+ else if (axisMask == 2)
+ {
+ id_x *= 8;
+ __shared__ float partialRowSum_smem[256];
+ partialRowSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ uint paramIndex = id_z * maxParamVolume + id_y;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialRowSum_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialRowSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ uint paramIndex = (id_z * hipGridDim_y * hipGridDim_x) + (id_y * hipGridDim_x) + hipBlockIdx_x;
+ partialSumTensor[paramIndex] = partialRowSum_smem[0];
+ }
+ }
+ // compute partial mean subtracted squared sums needed for computing stdDev over entire rows and columns
+ else if (axisMask == 3)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_y >= height) || (id_x >= width))
+ {
+ return;
+ }
+
+ int xAlignedLength = width & ~7; // alignedLength for vectorized global loads
+ int xDiff = width - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + yBegin) * srcStridesNH.y) + (id_x + xBegin);
+
+ float mean = meanTensor[id_z];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (id_x + 8 > width)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1]; // perform small work of vectorized float4 addition
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ partialSumTensor[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialSumRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_stddev_3d_hip_tensor(T *srcPtr,
+ uint3 srcStridesNZY,
+ float *meanTensor,
+ float *stdDevTensor,
+ uint *roiTensor,
+ float *partialSumTensor,
+ uint maxParamVolume,
+ uint axisMask)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *roi = &roiTensor[id_z * 6];
+ uint zBegin = roi[0];
+ uint yBegin = roi[1];
+ uint xBegin = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ // compute stddev along z direction
+ if (axisMask == 1)
+ {
+ if (id_x >= lengthX || id_y >= lengthY)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + (zBegin * srcStridesNZY.y) + ((id_y + yBegin) * srcStridesNZY.z) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthZ; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNZY.y;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(lengthZ));
+ }
+ // compute stddev along y direction
+ else if (axisMask == 2)
+ {
+ if (id_x >= lengthX || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + (yBegin * srcStridesNZY.z) + (id_x + xBegin);
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthX + id_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for(uint i = 0; i < lengthY; i++)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx]) - mean);
+ accum += (val * val);
+ srcIdx += srcStridesNZY.z;
+ }
+ stdDevTensor[paramIndex] = sqrtf(accum / static_cast<float>(lengthY));
+ }
+ // compute stddev along x direction
+ else if (axisMask == 4)
+ {
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+ uint paramIndex = id_z * maxParamVolume + id_y * lengthY + id_x;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i; // number of valid elements left in this partial vector
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f; // zero the out-of-range lanes so they do not contribute to the sum
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+
+ stdDevTensor[paramIndex] = sqrtf(accum_f8.f1[0] / static_cast<float>(lengthX));
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along z-y direction
+ else if (axisMask == 3)
+ {
+ for(uint xIndex = 0; xIndex < lengthX; xIndex++)
+ {
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ if ((id_x >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ uint paramIndex = id_z * maxParamVolume + xIndex;
+ float mean = meanTensor[paramIndex];
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + (xBegin + xIndex);
+ float val = static_cast<float>(srcPtr[srcIdx]) - mean;
+ partialSumRowPtr_smem[hipThreadIdx_x] = (val * val);
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialSumRowPtr_smem[hipThreadIdx_x] += partialSumRowPtr_smem[hipThreadIdx_x + threadMax];
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in z dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * srcStridesNZY.z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x) + (xIndex * hipGridDim_y * hipGridDim_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ __syncthreads();
+ }
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along y-x direction
+ else if (axisMask == 6)
+ {
+ __shared__ float partialSum_smem[256];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (id_x >= lengthY || id_y >= lengthZ)
+ return;
+
+ uint maxLengthZ = srcStridesNZY.x / srcStridesNZY.y;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((id_x + yBegin) * srcStridesNZY.z) + xBegin;
+
+ uint paramIndex = id_z * maxParamVolume + id_y;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 accum_f8;
+ accum_f8.f4[0] = (float4)0.0f;
+ accum_f8.f4[1] = (float4)0.0f;
+ for(int i = 0; i < lengthX; i += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+ if (i + 8 > lengthX)
+ {
+ int xDiff = lengthX - i;
+ for(int j = xDiff; j < 8; j++)
+ src_f8.f1[j] = 0.0f;
+ }
+ accum_f8.f4[0] += src_f8.f4[0];
+ accum_f8.f4[1] += src_f8.f4[1];
+ srcIdx += 8;
+ }
+ accum_f8.f4[0] += accum_f8.f4[1];
+ accum_f8.f1[0] = (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);
+ partialSum_smem[hipThreadIdx_x] = accum_f8.f1[0];
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ uint dstIdx = (id_z * maxLengthZ * hipGridDim_x) + hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x;
+ partialSumTensor[dstIdx] = partialSum_smem[0];
+ }
+ }
+ // compute stddev along z-x direction
+ else if (axisMask == 5)
+ {
+ __shared__ float partialSum_smem[32];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ if (hipBlockIdx_x >= lengthY)
+ return;
+
+ uint paramIndex = id_z * maxParamVolume + hipBlockIdx_x;
+ float mean = meanTensor[paramIndex];
+ float accum = 0.0f;
+ for (uint i = 0; i < lengthZ; i++)
+ {
+ uint tid_x = hipThreadIdx_x;
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((i + zBegin) * srcStridesNZY.y) + ((hipBlockIdx_x + yBegin) * srcStridesNZY.z) + xBegin;
+ while (tid_x < lengthX)
+ {
+ float val = (static_cast<float>(srcPtr[srcIdx + tid_x]) - mean);
+ accum += (val * val);
+ tid_x += hipBlockDim_x;
+ }
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // perform reduction on shared memory sums
+ reduction_sum_x_hip(partialSum_smem);
+
+ if (hipThreadIdx_x == 0)
+ stdDevTensor[paramIndex] = sqrtf(partialSum_smem[0] / static_cast<float>(lengthX * lengthZ));
+ }
+ // compute partial mean subtracted squared sums required for computing stdDev along z-y-x direction
+ else if (axisMask == 7)
+ {
+ id_x *= 8;
+ __shared__ float partialSum_smem[16][16];
+ float *partialSumRowPtr_smem = &partialSum_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in Shared
+ partialSumRowPtr_smem[hipThreadIdx_x] = 0.0f; // initialization of Shared to 0.0f using all 16 x 16 threads
+
+ uint xIndex = id_x % srcStridesNZY.z;
+ uint yIndex = id_x / srcStridesNZY.z;
+ if ((xIndex >= lengthX) || (yIndex >= lengthY) || (id_y >= lengthZ))
+ {
+ return;
+ }
+
+ int xAlignedLength = lengthX & ~7; // alignedLength for vectorized global loads
+ int xDiff = lengthX - xAlignedLength; // difference between roiWidth and alignedLength
+ uint srcIdx = (id_z * srcStridesNZY.x) + ((id_y + zBegin) * srcStridesNZY.y) + ((yIndex + yBegin) * srcStridesNZY.z) + (xIndex + xBegin);
+
+ uint paramIndex = id_z * maxParamVolume;
+ float mean = meanTensor[paramIndex];
+ float4 mean_f4 = static_cast<float4>(mean);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_subtract8_const(&src_f8, &src_f8, mean_f4);
+ rpp_hip_math_multiply8(&src_f8, &src_f8, &src_f8);
+
+ if (xIndex + 8 > lengthX)
+ {
+ for(int i = xDiff; i < 8; i++)
+ src_f8.f1[i] = 0.0f; // local memory reset of invalid values (from the vectorized global load) to 0.0f
+ }
+ src_f8.f4[0] += src_f8.f4[1];
+ partialSumRowPtr_smem[hipThreadIdx_x] = (src_f8.f1[0] +
+ src_f8.f1[1] +
+ src_f8.f1[2] +
+ src_f8.f1[3]); // perform small work of reducing float4s to float using 16 x 16 threads and store in Shared
+ __syncthreads(); // syncthreads after Shared load
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ reduction_sum_x_hip(partialSumRowPtr_smem);
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialSumRowPtr_smem[0] += partialSumRowPtr_smem[increment];
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ uint dstIdx = (id_z * hipGridDim_y * hipGridDim_x) + (hipBlockIdx_y * hipGridDim_x + hipBlockIdx_x);
+ partialSumTensor[dstIdx] = partialSumRowPtr_smem[0];
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void compute_stddev_nd_hip_tensor(T *srcPtr,
+ uint *srcMaxDims,
+ uint *srcStrides,
+ float *meanTensor,
+ float *stdDevTensor,
+ uint *roiTensor,
+ uint *paramShapeTensor,
+ uint *paramStridesTensor,
+ uint maxParamVolume,
+ uint tensorDims,
+ uint maxBufferLength)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *begin = &roiTensor[id_z * tensorDims * 2];
+ uint *length = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+ uint *paramShape = &paramShapeTensor[id_z * tensorDims];
+ uint *paramStrides = &paramStridesTensor[id_z * tensorDims];
+ uint srcIdx = id_z * maxBufferLength;
+ uint paramBase = id_z * maxParamVolume;
+ uint paramIndex = 0;
+
+ if (maxParamVolume > MAX_SHARED_MEMORY_SIZE)
+ {
+ if (id_x >= maxBufferLength)
+ return;
+
+ // validate if id_x is within the roi of input and compute paramIndex if valid
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ return;
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+ float val = static_cast<float>(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex];
+ atomicAdd(&stdDevTensor[paramBase + paramIndex], (val * val));
+ }
+ else
+ {
+
+ if (id_x >= (hipBlockDim_x * hipGridDim_x))
+ return;
+
+ // if the number of stddev outputs to be computed fits within the max shared memory size,
+ // use shared memory for the atomic additions to reduce global memory traffic
+ bool isValid = true;
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint coord = id_x / srcStrides[i] % srcMaxDims[i];
+ srcIdx += ((begin[i] + coord) * srcStrides[i]);
+ if (coord >= length[i])
+ {
+ isValid = false;
+ break;
+ }
+ paramIndex += (maxParamVolume > 1) ? ((coord % paramShape[i]) * paramStrides[i]) : 0;
+ }
+
+ extern __shared__ float sh_mem[];
+ sh_mem[hipThreadIdx_x] = 0.0f;
+ __syncthreads();
+
+ if (isValid && id_x < maxBufferLength)
+ {
+ float val = static_cast<float>(srcPtr[srcIdx]) - meanTensor[paramBase + paramIndex];
+ atomicAdd(&sh_mem[paramIndex], (val * val));
+ }
+ __syncthreads();
+
+ if (hipThreadIdx_x < maxParamVolume)
+ atomicAdd(&stdDevTensor[paramBase + hipThreadIdx_x], sh_mem[hipThreadIdx_x]);
+ }
+}
+
+// -------------------- Set 6 - mean and stddev compute kernels (reduction stage 2) --------------------
+
+__global__ void reduce_final_result_hip(float *partialSumTensor,
+ uint numPartialSums,
+ float *meanTensor,
+ float *stdDevTensor,
+ bool isMean,
+ uint *roiTensor,
+ uint axisMask,
+ uint tensorDims)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+
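+ // meanFactor: number of input elements reduced into each output value (the divisor for the final mean / variance)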
+ uint meanFactor;
+ if (tensorDims == 3)
+ {
+ uint lengthZ = roi[0];
+ uint lengthY = roi[1];
+ uint lengthX = roi[2];
+
+ if (axisMask == 3)
+ meanFactor = lengthZ * lengthY;
+ else if (axisMask == 6)
+ meanFactor = lengthY * lengthX;
+ else if (axisMask == 7)
+ meanFactor = lengthZ * lengthY * lengthX;
+ }
+ else if (tensorDims == 2)
+ {
+ uint lengthY = roi[0];
+ uint lengthX = roi[1];
+
+ if (axisMask == 2)
+ meanFactor = lengthX;
+ else if (axisMask == 3)
+ meanFactor = lengthY * lengthX;
+ }
+
+ __shared__ float partialSum_smem[16];
+ partialSum_smem[hipThreadIdx_x] = 0.0f;
+
+ float accum = 0.0f;
+ while(id_x < numPartialSums)
+ {
+ uint srcIdx = (id_z * hipGridDim_y * numPartialSums) + (id_y * numPartialSums) + id_x;
+ accum += partialSumTensor[srcIdx];
+ id_x += hipBlockDim_x;
+ }
+ partialSum_smem[hipThreadIdx_x] = accum;
+ __syncthreads();
+
+ // Now do block level reduction sum
+ reduction_sum_x_hip(partialSum_smem);
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ if (isMean)
+ meanTensor[id_z * hipGridDim_y + id_y] = partialSum_smem[0] / meanFactor;
+ else
+ stdDevTensor[id_z * hipGridDim_y + id_y] = sqrtf(partialSum_smem[0] / meanFactor);
+ }
+}
+
+__global__ void final_reduction_nd_hip_tensor(float *meanTensor,
+ float *stdDevTensor,
+ uint *paramShapeTensor,
+ uint *roiTensor,
+ uint tensorDims,
+ uint maxParamVolume,
+ bool isMean)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ uint *paramShape = ¶mShapeTensor[id_z * tensorDims];
+ uint *roi = &roiTensor[id_z * tensorDims * 2 + tensorDims];
+
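+ // divisionFactor: product of roi lengths along the reduced axes (where paramShape[i] == 1), i.e. the element count behind each output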
+ uint divisionFactor = 1;
+ uint paramVolume = 1;
+ for(int i = 0; i < tensorDims; i++)
+ {
+ paramVolume *= paramShape[i];
+ if (paramShape[i] == 1)
+ divisionFactor *= roi[i];
+ }
+
+ if (id_x >= paramVolume)
+ return;
+
+ uint paramIndex = id_z * maxParamVolume + id_x;
+ if (isMean)
+ meanTensor[paramIndex] = meanTensor[paramIndex] / divisionFactor;
+ else
+ stdDevTensor[paramIndex] = sqrtf(stdDevTensor[paramIndex] / divisionFactor);
+}
+
+// -------------------- Set 7 - mean and stddev compute kernels launch helpers --------------------
+
+void set_kernel_launch_config_2d(RpptGenericDescPtr srcGenericDescPtr,
+ int &globalThreads_x,
+ int &globalThreads_y,
+ int &globalThreads_z,
+ int &localThreads_x,
+ int &localThreads_y,
+ int &localThreads_z,
+ Rpp32u axisMask,
+ Rpp32f *partialSumArr,
+ rpp::Handle& handle)
+{
+ switch (axisMask)
+ {
+ // compute along Y direction
+ case 1:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along X direction
+ case 2:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / 256));
+ globalThreads_y = srcGenericDescPtr->dims[1];
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = srcGenericDescPtr->dims[0] * srcGenericDescPtr->dims[1] * globalThreads_x;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along XY direction
+ case 3:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)((srcGenericDescPtr->dims[2] + 7) >> 3) / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ }
+}
+
+void set_kernel_launch_config_3d(RpptGenericDescPtr srcGenericDescPtr,
+ int &globalThreads_x,
+ int &globalThreads_y,
+ int &globalThreads_z,
+ int &localThreads_x,
+ int &localThreads_y,
+ int &localThreads_z,
+ Rpp32u axisMask,
+ Rpp32f *partialSumArr,
+ rpp::Handle& handle)
+{
+ switch (axisMask)
+ {
+ // compute along Z direction
+ case 1:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[3] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along Y direction
+ case 2:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[3] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along YZ direction
+ case 3:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along X direction
+ case 4:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along XZ direction
+ case 5:
+ {
+ localThreads_x = 32;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = srcGenericDescPtr->dims[2];
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ break;
+ }
+ // compute along XY direction
+ case 6:
+ {
+ localThreads_x = 256;
+ localThreads_y = 1;
+ localThreads_z = 1;
+ globalThreads_x = static_cast<int>(ceil((float)srcGenericDescPtr->dims[2] / localThreads_x));
+ globalThreads_y = srcGenericDescPtr->dims[1];
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ // compute along XYZ direction
+ case 7:
+ {
+ localThreads_x = 16;
+ localThreads_y = 16;
+ localThreads_z = 1;
+ Rpp32u numValues = (srcGenericDescPtr->dims[2] * srcGenericDescPtr->dims[3] + 7) >> 3;
+ globalThreads_x = static_cast<int>(ceil((float)numValues / localThreads_x));
+ globalThreads_y = static_cast<int>(ceil((float)srcGenericDescPtr->dims[1] / localThreads_y));
+ globalThreads_z = srcGenericDescPtr->dims[0];
+
+ Rpp32u partialSumArrLength = globalThreads_x * globalThreads_y * globalThreads_z;
+ hipMemsetAsync(partialSumArr, 0, partialSumArrLength * sizeof(Rpp32f), handle.GetStream());
+ hipStreamSynchronize(handle.GetStream());
+ break;
+ }
+ }
+}
+
+// -------------------- Set 8 - mean and stddev compute kernels executor --------------------
+
+template <typename T>
+RppStatus hip_exec_compute_mean_stddev_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ bool isMean,
+ Rpp32u *roiTensor,
+ Rpp32u axisMask,
+ Rpp32u tensorDims,
+ Rpp32u maxParamVolume,
+ Rpp32u *paramShape,
+ Rpp32u *paramStrides,
+ rpp::Handle& handle)
+{
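+ // per-block partial sums are staged in the handle's float scratch buffer and reduced again in a second kernel pass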
+ Rpp32f *partialSumArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ Rpp32u partialSumArrLength, partialSumBlocksPerSample;
+
+ int globalThreads_x, globalThreads_y, globalThreads_z;
+ int localThreads_x, localThreads_y, localThreads_z;
+ // based on number of dimensions call the corresponding kernel
+ if (tensorDims == 2)
+ {
+ // set the block and grid configuration based on axisMask
+ set_kernel_launch_config_2d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z,
+ localThreads_x, localThreads_y, localThreads_z, axisMask,
+ partialSumArr, handle);
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_2d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ meanTensor,
+ partialSumArr,
+ roiTensor,
+ maxParamVolume,
+ axisMask);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_2d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ meanTensor,
+ stdDevTensor,
+ partialSumArr,
+ roiTensor,
+ maxParamVolume,
+ axisMask);
+ }
+
+ if (axisMask == 2)
+ {
+ partialSumBlocksPerSample = globalThreads_x;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ else if (axisMask == 3)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), 1, globalThreads_z),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ }
+ else if (tensorDims == 3)
+ {
+ // set the block and grid configuration based on axisMask
+ set_kernel_launch_config_3d(srcGenericDescPtr, globalThreads_x, globalThreads_y, globalThreads_z,
+ localThreads_x, localThreads_y, localThreads_z, axisMask,
+ partialSumArr, handle);
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_3d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ meanTensor,
+ roiTensor,
+ partialSumArr,
+ maxParamVolume,
+ axisMask);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_3d_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(localThreads_x, localThreads_y, localThreads_z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ meanTensor,
+ stdDevTensor,
+ roiTensor,
+ partialSumArr,
+ maxParamVolume,
+ axisMask);
+ }
+
+ // perform final reduction on block wise sums for below cases
+ // reduce on YZ partial sums
+ if (axisMask == 3)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[3], srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ // reduce on XY partial sums
+ if (axisMask == 6)
+ {
+ partialSumBlocksPerSample = globalThreads_x;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), srcGenericDescPtr->dims[1], srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ // reduce on XYZ block partial sums
+ else if (axisMask == 7)
+ {
+ partialSumBlocksPerSample = globalThreads_x * globalThreads_y;
+ hipLaunchKernelGGL(reduce_final_result_hip,
+ dim3(ceil((float)partialSumBlocksPerSample/16), 1, srcGenericDescPtr->dims[0]),
+ dim3(16, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialSumArr,
+ partialSumBlocksPerSample,
+ meanTensor,
+ stdDevTensor,
+ isMean,
+ roiTensor,
+ axisMask,
+ tensorDims);
+ }
+ }
+ else
+ {
+ // interpret the input as 1D tensor
+ globalThreads_x = srcGenericDescPtr->strides[0];
+ globalThreads_y = 1;
+ globalThreads_z = srcGenericDescPtr->dims[0];
+ Rpp32u batchSize = globalThreads_z;
+
+ // allocate tensor for src strides
+ Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1];
+ Rpp32u *srcStrides = &srcGenericDescPtr->strides[1];
+
+ Rpp32u shared_memory_size = 0;
+ Rpp32u block_size = 1024;
+ if (maxParamVolume <= MAX_SHARED_MEMORY_SIZE)
+ {
+ if (maxParamVolume <= 32)
+ shared_memory_size = 32;
+ else if (maxParamVolume <= 64)
+ shared_memory_size = 64;
+ else if (maxParamVolume <= 128)
+ shared_memory_size = 128;
+ else if (maxParamVolume <= 256)
+ shared_memory_size = 256;
+ else if (maxParamVolume <= 512)
+ shared_memory_size = 512;
+ else
+ shared_memory_size = MAX_SHARED_MEMORY_SIZE;
+ block_size = shared_memory_size;
+ }
+
+ if (isMean)
+ {
+ hipLaunchKernelGGL(compute_mean_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(block_size, 1, 1),
+ shared_memory_size,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ meanTensor,
+ roiTensor,
+ paramShape,
+ paramStrides,
+ maxParamVolume,
+ tensorDims,
+ srcGenericDescPtr->strides[0]);
+ }
+ else
+ {
+ hipLaunchKernelGGL(compute_stddev_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/block_size), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(block_size, 1, 1),
+ shared_memory_size,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ meanTensor,
+ stdDevTensor,
+ roiTensor,
+ paramShape,
+ paramStrides,
+ maxParamVolume,
+ tensorDims,
+ srcGenericDescPtr->strides[0]);
+ }
+ hipLaunchKernelGGL(final_reduction_nd_hip_tensor,
+ dim3(ceil((float)maxParamVolume/1024), 1, globalThreads_z),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ meanTensor,
+ stdDevTensor,
+ paramShape,
+ roiTensor,
+ tensorDims,
+ maxParamVolume,
+ isMean);
+ }
+ hipStreamSynchronize(handle.GetStream());
+ return RPP_SUCCESS;
+}
+
+// -------------------- Set 9 - normalization kernel executor --------------------
+
+template <typename T>
+RppStatus hip_exec_normalize_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u batchSize = srcGenericDescPtr->dims[0];
+ Rpp32u tensorDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+
+ // create buffer for paramShape and paramStride needed for generic kernel
+ Rpp32u *paramShape, *paramStrides;
+ paramShape = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem;
+ paramStrides = handle.GetInitHandle()->mem.mgpu.scratchBuf.uintmem + (batchSize * tensorDims);
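+ // scratchBuf layout: first batchSize * tensorDims entries hold paramShape, the next batchSize * tensorDims hold paramStrides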
+
+ // do initial preprocessing, compute maxParamVolume and fill the values for paramShape and paramStrides
+ Rpp32u maxParamVolume;
+ if (tensorDims == 2 || tensorDims == 3)
+ normalize_setup_2d_and_3d(roiTensor, batchSize, tensorDims,
+ axisMask, maxParamVolume);
+ else
+ normalize_setup_nd(roiTensor, batchSize, tensorDims, axisMask,
+ paramShape, paramStrides, maxParamVolume);
+
+ bool computeMean = computeMeanStddev & 1; // if 0th bit in computeMeanStddev is set, computeMean is set to true. Otherwise it is set to false
+ bool computeStdDev = computeMeanStddev & 2; // if 1st bit in computeMeanStddev is set, computeStdDev is set to true. Otherwise it is set to false
+ if ((!computeMean) && (!computeStdDev))
+ maxParamVolume = 0;
+
+ // if computeMean is set, compute mean values by processing over input based on axisMask values
+ if (computeMean)
+ hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, true,
+ roiTensor, axisMask, tensorDims, maxParamVolume,
+ paramShape, paramStrides, handle);
+
+ // if computeStdDev is set, compute stdDev values by processing over input based on axisMask values
+ if (computeStdDev)
+ hip_exec_compute_mean_stddev_tensor(srcPtr, srcGenericDescPtr, meanTensor, stdDevTensor, false,
+ roiTensor, axisMask, tensorDims, maxParamVolume,
+ paramShape, paramStrides, handle);
+
+ // based on number of dimensions call the corresponding kernel
+ if (tensorDims == 2)
+ {
+ // NHW
+ int globalThreads_x = dstGenericDescPtr->dims[2];
+ int globalThreads_y = dstGenericDescPtr->dims[1];
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(normalize_2d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ dstPtr,
+ make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]),
+ meanTensor,
+ stdDevTensor,
+ make_float2(scale, shift),
+ roiTensor,
+ make_uint2(maxParamVolume, axisMask),
+ computeStdDev);
+ }
+ else if (tensorDims == 3)
+ {
+ // NDHW
+ int globalThreads_x = dstGenericDescPtr->dims[3];
+ int globalThreads_y = dstGenericDescPtr->dims[2];
+ int globalThreads_z = dstGenericDescPtr->dims[1];
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(normalize_3d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ &meanTensor[batchCount * maxParamVolume],
+ &stdDevTensor[batchCount * maxParamVolume],
+ make_float2(scale, shift),
+ &roiTensor[batchCount * 6],
+ axisMask,
+ computeStdDev);
+ }
+ }
+ else
+ {
+ // interpret the input as 1D tensor
+ int globalThreads_x = dstGenericDescPtr->strides[0];
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ // allocate tensor for src strides
+ Rpp32u *srcMaxDims = &srcGenericDescPtr->dims[1];
+ Rpp32u *srcStrides = &srcGenericDescPtr->strides[1];
+ hipLaunchKernelGGL(normalize_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y), ceil((float)globalThreads_z)),
+ dim3(1024, 1, 1),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcMaxDims,
+ srcStrides,
+ dstPtr,
+ meanTensor,
+ stdDevTensor,
+ make_float2(scale, shift),
+ roiTensor,
+ paramShape,
+ paramStrides,
+ make_uint2(maxParamVolume, srcGenericDescPtr->strides[0]),
+ tensorDims,
+ computeStdDev);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp
index e35e5c514..3285ee756 100644
--- a/src/modules/rppi_validate.hpp
+++ b/src/modules/rppi_validate.hpp
@@ -56,11 +56,9 @@ inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels)
}
else if(layout == RpptLayout::NHWC || layout == RpptLayout::NDHWC)
{
- if (channels == 3) // PKD3
- {
- layoutParams.channelParam = 1;
- layoutParams.bufferMultiplier = 3;
- }
+ //PKD
+ layoutParams.channelParam = 1;
+ layoutParams.bufferMultiplier = channels;
}
return layoutParams;
}
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp
index 28313a88f..ef69a49bb 100644
--- a/src/modules/rppt_tensor_statistical_operations.cpp
+++ b/src/modules/rppt_tensor_statistical_operations.cpp
@@ -241,13 +241,109 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** normalize_ND ********************/
+
+RppStatus rppt_normalize_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams;
+ Rpp32u tensorDim = srcGenericDescPtr->numDims - 1;
+ if (tensorDim == 3 && (srcGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[3]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NCDHW) && (dstGenericDescPtr->layout == RpptLayout::NCDHW))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[1]);
+ else if ((srcGenericDescPtr->layout == RpptLayout::NDHWC) && (dstGenericDescPtr->layout == RpptLayout::NDHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[4]);
+ else if(tensorDim == 2 && (srcGenericDescPtr->layout == RpptLayout::NHWC))
+ layoutParams = get_layout_params(srcGenericDescPtr->layout, srcGenericDescPtr->dims[2]);
+
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
+ {
+ normalize_generic_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+ normalize_generic_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ normalize_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8))
+ {
+ normalize_generic_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
+#ifdef GPU_SUPPORT
+
/******************** tensor_sum ********************/
-#ifdef HIP_COMPILE
+
RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t tensorSumArr,
@@ -256,6 +352,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
+#ifdef HIP_COMPILE
if (srcDescPtr->c == 1)
{
if (tensorSumArrLength < srcDescPtr->n) // sum of single channel
@@ -317,10 +414,12 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
}
return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
}
/******************** tensor_min ********************/
-
RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t imageMinArr,
@@ -329,6 +428,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
+#ifdef HIP_COMPILE
if (srcDescPtr->c == 1)
{
if (imageMinArrLength < srcDescPtr->n) // min of single channel
@@ -378,6 +478,9 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
}
return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
}
/******************** tensor_max ********************/
@@ -390,6 +493,7 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
+#ifdef HIP_COMPILE
if (srcDescPtr->c == 1)
{
if (imageMaxArrLength < srcDescPtr->n) // max of single channel
@@ -439,5 +543,90 @@ RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr,
}
return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
}
+
+RppStatus rppt_normalize_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u axisMask,
+ Rpp32f *meanTensor,
+ Rpp32f *stdDevTensor,
+ Rpp8u computeMeanStddev,
+ Rpp32f scale,
+ Rpp32f shift,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_normalize_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_normalize_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_normalize_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_normalize_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ axisMask,
+ meanTensor,
+ stdDevTensor,
+ computeMeanStddev,
+ scale,
+ shift,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
#endif // backend
+}
+
+#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt
index 82ed65309..77052cabe 100644
--- a/utilities/test_suite/CMakeLists.txt
+++ b/utilities/test_suite/CMakeLists.txt
@@ -83,7 +83,7 @@ if(Python3_FOUND)
if(NIFTI_FOUND)
add_test(
NAME rpp_qa_tests_tensor_voxel_host_all
- COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runTests_voxel.py --qa_mode 1 --batch_size 3
+ COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runVoxelTests.py --qa_mode 1 --batch_size 3
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endif(NIFTI_FOUND)
@@ -105,7 +105,7 @@ if(Python3_FOUND)
if(NIFTI_FOUND)
add_test(
NAME rpp_qa_tests_tensor_voxel_hip_all
- COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runTests_voxel.py --qa_mode 1 --batch_size 3
+ COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runVoxelTests.py --qa_mode 1 --batch_size 3
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endif(NIFTI_FOUND)
diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt
index 26017065e..a0bd42fa0 100644
--- a/utilities/test_suite/HIP/CMakeLists.txt
+++ b/utilities/test_suite/HIP/CMakeLists.txt
@@ -83,8 +83,10 @@ if (hip_FOUND AND OpenCV_FOUND)
link_directories(${ROCM_PATH}/lib /usr/local/lib)
add_executable(Tensor_hip Tensor_hip.cpp)
+ add_executable(Tensor_misc_hip Tensor_misc_hip.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPU_SUPPORT=1 -DRPP_BACKEND_HIP=1 -std=gnu++17")
target_link_libraries(Tensor_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device)
+ target_link_libraries(Tensor_misc_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device)
else()
message(FATAL_ERROR "-- ${Red}Error: OpenCV and hip must be installed to install ${PROJECT_NAME} successfully!${ColourReset}")
endif()
diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
new file mode 100644
index 000000000..96197f432
--- /dev/null
+++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
@@ -0,0 +1,231 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "../rpp_test_suite_misc.h"
+
+int main(int argc, char **argv)
+{
+ // Handle inputs
+ const int MIN_ARG_COUNT = 9;
+ if (argc < MIN_ARG_COUNT)
+ {
+ printf("\nImproper Usage! Needs all arguments!\n");
+ printf("\nUsage: ./Tensor_misc_hip