Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Water - HOST Tensor AVX2 Support and Vectorized HIP support #126

Merged
merged 26 commits into from
Jul 18, 2023
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
52b83c6
added water HOST and HIP codes
sampath1117 May 4, 2023
8dafad2
added water case in test suite
sampath1117 May 4, 2023
890437a
added golden outputs for water
sampath1117 May 4, 2023
643281c
added omp thread changes for water augmentation
sampath1117 May 22, 2023
336188d
experimental changes
sampath1117 May 22, 2023
87ab19f
fixed output issue with AVX2 instructions
sampath1117 May 23, 2023
94c8340
added AVX2 support for PKD3 load function
sampath1117 May 23, 2023
56190d8
nwc commit - added avx2 changes for u8 layout toggle variants but nee…
sampath1117 May 29, 2023
3b18a58
Add Avx2 implementation for F32 and U8 toggle variants
HazarathKumarM Jun 7, 2023
754e353
Add AVX2 support for u8 pkd3-pln3 and i8 pkd3-pln3 for water augmenta…
HazarathKumarM Jun 19, 2023
c4f69a9
change F32 load and store logic
HazarathKumarM Jul 3, 2023
e9e74a2
optimized the store function for F32 PLN3-PKD3
sampath1117 Jul 11, 2023
6c8fa57
Merge branch 'master' into water_avx_exp
sampath1117 Jul 12, 2023
fb1fdb4
reverted back irrelevant changes
sampath1117 Jul 12, 2023
4ffe9f6
minor change
sampath1117 Jul 12, 2023
6e0756a
optimized load and store functions for water U8 and F32 variants in host
sampath1117 Jul 12, 2023
0cf2626
merge with master
sampath1117 Jul 13, 2023
81553d3
removed golden outputs for water
sampath1117 Jul 13, 2023
a5567e6
minor changes
sampath1117 Jul 13, 2023
89380a5
renamed few functions and removed unused functions
sampath1117 Jul 13, 2023
27b318b
fixed bug in i8 load function
sampath1117 Jul 13, 2023
d3943b5
changed cast to c++ style
sampath1117 Jul 14, 2023
31d1624
added golden outputs for water
sampath1117 May 4, 2023
2729b1e
updated golden outputs with latest changes
sampath1117 Jul 14, 2023
8b763ad
modified the u8, i8 pkd3-pln3 function and added comments for the vec…
sampath1117 Jul 14, 2023
b418e24
fixed minor bug in I8 variants
sampath1117 Jul 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions include/rppt_tensor_effects_augmentations.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,29 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes
RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *stdDevTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/******************** water ********************/

// *param[in] srcPtr source tensor memory
// *param[in] srcDescPtr source tensor descriptor
// *param[out] dstPtr destination tensor memory
// *param[in] dstDescPtr destination tensor descriptor
// *param[in] amplitudeXTensor amplitudeX values for water effect (1D tensor of size batchSize)
// *param[in] amplitudeYTensor amplitudeY values for water effect (1D tensor of size batchSize)
// *param[in] freqXTensor freqX values for water effect (1D tensor of size batchSize)
// *param[in] freqYTensor freqY values for water effect (1D tensor of size batchSize)
// *param[in] phaseXTensor amplitudeY values for water effect (1D tensor of size batchSize)
// *param[in] phaseYTensor amplitudeY values for water effect (1D tensor of size batchSize)
// *param[in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
// *param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
// *returns a RppStatus enumeration.
// *retval RPP_SUCCESS : succesful completion
// *retval RPP_ERROR : Error

RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *amplitudeXTensor, Rpp32f *amplitudeYTensor, Rpp32f *frequencyXTensor, Rpp32f *frequencyYTensor, Rpp32f *phaseXTensor, Rpp32f *phaseYTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
RppStatus rppt_water_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *amplitudeXTensor, Rpp32f *amplitudeYTensor, Rpp32f *frequencyXTensor, Rpp32f *frequencyYTensor, Rpp32f *phaseXTensor, Rpp32f *phaseYTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

#ifdef __cplusplus
}
#endif
Expand Down
14 changes: 14 additions & 0 deletions src/include/cpu/rpp_cpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5044,6 +5044,20 @@ inline void compute_generic_nn_srclocs_and_validate_sse(__m128 pSrcY, __m128 pSr
_mm_storeu_si128((__m128i*) srcLoc, pxSrcLoc);
}

inline void compute_generic_nn_srclocs_and_validate_avx(__m256 pSrcY, __m256 pSrcX, __m256 *pRoiLTRB, __m256 pSrcStrideH, Rpp32s *srcLoc, Rpp32s *invalidLoad, bool hasRGBChannels = false)
{
pSrcY = _mm256_round_ps(pSrcY, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); // Nearest Neighbor Y location vector
pSrcX = _mm256_round_ps(pSrcX, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); // Nearest Neighbor X location vector
_mm256_storeu_si256((__m256i*) invalidLoad, _mm256_cvtps_epi32(_mm256_or_ps( // Vectorized ROI boundary check
_mm256_or_ps(_mm256_cmp_ps(pSrcX, pRoiLTRB[0], _CMP_LT_OQ), _mm256_cmp_ps(pSrcY, pRoiLTRB[1],_CMP_LT_OQ)),
_mm256_or_ps(_mm256_cmp_ps(pSrcX, pRoiLTRB[2], _CMP_GT_OQ), _mm256_cmp_ps(pSrcY, pRoiLTRB[3], _CMP_GT_OQ))
)));
if (hasRGBChannels)
pSrcX = _mm256_mul_ps(pSrcX, avx_p3);
__m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pSrcY, pSrcStrideH, pSrcX));
_mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc);
}

template <typename T>
inline void compute_generic_nn_interpolation_pkd3_to_pln3(Rpp32f srcY, Rpp32f srcX, RpptROI *roiLTRB, T *dstPtrTempR, T *dstPtrTempG, T *dstPtrTempB, T *srcPtrChannel, RpptDescPtr srcDescPtr)
{
Expand Down
259 changes: 245 additions & 14 deletions src/include/cpu/rpp_cpu_simd.hpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/modules/cpu/host_tensor_effects_augmentations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@ THE SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
#include "kernel/water.hpp"

#endif // HOST_TENSOR_EFFECTS_AUGMENTATIONS_HPP
977 changes: 977 additions & 0 deletions src/modules/cpu/kernel/water.hpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/modules/hip/hip_tensor_effects_augmentations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@ THE SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
#include "kernel/water.hpp"

#endif // HIP_TENSOR_EFFECTS_AUGMENTATIONS_HPP
315 changes: 315 additions & 0 deletions src/modules/hip/kernel/water.hpp

Large diffs are not rendered by default.

159 changes: 159 additions & 0 deletions src/modules/rppt_tensor_effects_augmentations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,95 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1,
return RPP_SUCCESS;
}

/******************** water ********************/

RppStatus rppt_water_host(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
RpptDescPtr dstDescPtr,
Rpp32f *amplitudeXTensor,
Rpp32f *amplitudeYTensor,
Rpp32f *frequencyXTensor,
Rpp32f *frequencyYTensor,
Rpp32f *phaseXTensor,
Rpp32f *phaseYTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
{
water_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
amplitudeXTensor,
amplitudeYTensor,
frequencyXTensor,
frequencyYTensor,
phaseXTensor,
phaseYTensor,
roiTensorPtrSrc,
roiType,
layoutParams,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
{
water_f16_f16_host_tensor((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
srcDescPtr,
(Rpp16f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
amplitudeXTensor,
amplitudeYTensor,
frequencyXTensor,
frequencyYTensor,
phaseXTensor,
phaseYTensor,
roiTensorPtrSrc,
roiType,
layoutParams,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
{
water_f32_f32_host_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
srcDescPtr,
(Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
amplitudeXTensor,
amplitudeYTensor,
frequencyXTensor,
frequencyYTensor,
phaseXTensor,
phaseYTensor,
roiTensorPtrSrc,
roiType,
layoutParams,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
{
water_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
amplitudeXTensor,
amplitudeYTensor,
frequencyXTensor,
frequencyYTensor,
phaseXTensor,
phaseYTensor,
roiTensorPtrSrc,
roiType,
layoutParams,
rpp::deref(rppHandle));
}

return RPP_SUCCESS;
}

/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
Expand Down Expand Up @@ -1013,4 +1102,74 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1,
#endif // backend
}

RppStatus rppt_water_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
RpptDescPtr dstDescPtr,
Rpp32f *amplitudeXTensor,
Rpp32f *amplitudeYTensor,
Rpp32f *frequencyXTensor,
Rpp32f *frequencyYTensor,
Rpp32f *phaseXTensor,
Rpp32f *phaseYTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
#ifdef HIP_COMPILE
Rpp32u paramIndex = 0;
copy_param_float(amplitudeXTensor, rpp::deref(rppHandle), paramIndex++);
copy_param_float(amplitudeYTensor, rpp::deref(rppHandle), paramIndex++);
copy_param_float(frequencyXTensor, rpp::deref(rppHandle), paramIndex++);
copy_param_float(frequencyYTensor, rpp::deref(rppHandle), paramIndex++);
copy_param_float(phaseXTensor, rpp::deref(rppHandle), paramIndex++);
copy_param_float(phaseYTensor, rpp::deref(rppHandle), paramIndex);

if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
{
hip_exec_water_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
{
hip_exec_water_tensor((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
srcDescPtr,
(half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
{
hip_exec_water_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
srcDescPtr,
(Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
{
hip_exec_water_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}

return RPP_SUCCESS;
#elif defined(OCL_COMPILE)
return RPP_ERROR_NOT_IMPLEMENTED;
#endif // backend
}

#endif // GPU_SUPPORT
29 changes: 29 additions & 0 deletions utilities/test_suite/HIP/Tensor_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,35 @@ int main(int argc, char **argv)

break;
}
case 29:
{
testCaseName = "water";

Rpp32f amplX[batchSize];
Rpp32f amplY[batchSize];
Rpp32f freqX[batchSize];
Rpp32f freqY[batchSize];
Rpp32f phaseX[batchSize];
Rpp32f phaseY[batchSize];

for (i = 0; i < images; i++)
{
amplX[i] = 2.0f;
amplY[i] = 5.0f;
freqX[i] = 5.8f;
freqY[i] = 1.2f;
phaseX[i] = 10.0f;
phaseY[i] = 15.0f;
}

startWallTime = omp_get_wtime();
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_water_gpu(d_input, srcDescPtr, d_output, dstDescPtr, amplX, amplY, freqX, freqY, phaseX, phaseY, roiTensorPtrSrc, roiTypeSrc, handle);
else
missingFuncFlag = 1;

break;
}
case 31:
{
testCaseName = "color_cast";
Expand Down
2 changes: 1 addition & 1 deletion utilities/test_suite/HIP/runTests.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ def rpp_test_suite_parser_and_validator():
print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv")

# print the results of qa tests
supportedCaseList = ['0', '1', '2', '4', '13', '31', '34', '36', '37', '38','84']
supportedCaseList = ['0', '1', '2', '4', '13', '29', '31', '34', '36', '37', '38','84']
supportedCases = 0
for num in caseList:
if num in supportedCaseList:
Expand Down
29 changes: 29 additions & 0 deletions utilities/test_suite/HOST/Tensor_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,35 @@ int main(int argc, char **argv)

break;
}
case 29:
{
testCaseName = "water";

Rpp32f amplX[batchSize];
Rpp32f amplY[batchSize];
Rpp32f freqX[batchSize];
Rpp32f freqY[batchSize];
Rpp32f phaseX[batchSize];
Rpp32f phaseY[batchSize];

for (i = 0; i < batchSize; i++)
{
amplX[i] = 2.0f;
amplY[i] = 5.0f;
freqX[i] = 5.8f;
freqY[i] = 1.2f;
phaseX[i] = 10.0f;
phaseY[i] = 15.0f;
}

startWallTime = omp_get_wtime();
if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
rppt_water_host(input, srcDescPtr, output, dstDescPtr, amplX, amplY, freqX, freqY, phaseX, phaseY, roiTensorPtrSrc, roiTypeSrc, handle);
else
missingFuncFlag = 1;

break;
}
case 31:
{
testCaseName = "color_cast";
Expand Down
2 changes: 1 addition & 1 deletion utilities/test_suite/HOST/runTests.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["./testAllScript.sh", srcPath1, args.input_path2, str(testType), str(numRuns), str(qaMode), str(decoderType), str(preserveOutput), str(batchSize), " ".join(caseList)]) # nosec

# print the results of qa tests
supportedCaseList = ['0', '1', '2', '4', '13', '31', '34', '36', '37', '38', '84']
supportedCaseList = ['0', '1', '2', '4', '13', '29', '31', '34', '36', '37', '38', '84']
supportedCases = 0
for num in caseList:
if num in supportedCaseList:
Expand Down
1 change: 1 addition & 0 deletions utilities/test_suite/rpp_test_suite_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ std::map<int, string> augmentationMap =
{2, "blend"},
{4, "contrast"},
{13, "exposure"},
{29, "water"},
{31, "color_cast"},
{34, "lut"},
{36, "color_twist"},
Expand Down