// Patch summary (leading text ahead of the first "diff --git" header; the patch text itself is unchanged).
//
// Files touched:
//   src/include/cpu/rpp_cpu_common.hpp
//   src/include/cpu/rpp_cpu_simd.hpp
//
// 1) rpp_cpu_common.hpp, compute_generic_bilinear_srclocs_and_interpolate (per-channel loop):
//    dst[c] is the sum of the four source values at srcLoc[0..3], each multiplied by the
//    matching bilinearCoeffs[0..3]. Before this patch the sum was cast straight to T; after
//    it, the sum goes through std::nearbyintf first and the rounded value is cast to T.
//    For an integer T the old plain cast dropped the fractional part, while nearbyintf
//    rounds to an integral value using the current floating-point rounding mode.
//    NOTE(review): T is a template parameter whose instantiations are not visible here. If
//    the function is also used with floating-point T, those outputs would now be rounded
//    to whole numbers as well - confirm against the callers.
//
// 2) rpp_cpu_simd.hpp, rpp_store8_f32pln1_to_u8pln1_avx:
//    The result of _mm256_packus_epi16(px1, avx_px0) is now assigned back to px1 and written
//    out via rpp_store8_u8pln1_to_u8pln1_avx(dstPtr, px1), replacing the direct
//    rpp_storeu_si64((__m128i *)(dstPtr), ...) call. That helper takes its vector as
//    "__m256i &p" (see the hunk header), so it needs a named variable rather than a temporary,
//    which is why the pack result is stored in px1 first.
//    NOTE(review): the body of rpp_store8_u8pln1_to_u8pln1_avx is not part of this patch;
//    presumably it stores the low 8 bytes of the vector - verify before merging.
//
// NOTE(review): the hunk headers declare multi-line hunks (-4995,10 +4995,10 and -2973,7 +2973,8),
// but the patch text below sits on one physical line; presumably the line breaks were lost in
// transit and must be restored before the patch can be applied.
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index bdf893dcf..f9d43f956 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -4995,10 +4995,10 @@ inline void compute_generic_bilinear_srclocs_and_interpolate(T *srcPtrChannel, R for (int c = 0; c < srcDescPtr->c; c++) { - dst[c] = (T)((*(srcPtrChannel + srcLoc[0]) * bilinearCoeffs[0]) + // TopRow R01 Pixel * coeff0 + dst[c] = (T)std::nearbyintf(((*(srcPtrChannel + srcLoc[0]) * bilinearCoeffs[0]) + // TopRow R01 Pixel * coeff0 (*(srcPtrChannel + srcLoc[1]) * bilinearCoeffs[1]) + // TopRow R02 Pixel * coeff1 (*(srcPtrChannel + srcLoc[2]) * bilinearCoeffs[2]) + // BottomRow R01 Pixel * coeff2 - (*(srcPtrChannel + srcLoc[3]) * bilinearCoeffs[3])); // BottomRow R02 Pixel * coeff3 + (*(srcPtrChannel + srcLoc[3]) * bilinearCoeffs[3]))); // BottomRow R02 Pixel * coeff3 srcPtrChannel += srcDescPtr->strides.cStride; } } diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 6378e0d8c..d11a893c1 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -2973,7 +2973,8 @@ inline void rpp_store8_u8pln1_to_u8pln1_avx(Rpp8u* dstPtr, __m256i &p) inline void rpp_store8_f32pln1_to_u8pln1_avx(Rpp8u* dstPtr, __m256 &p) { __m256i px1 = _mm256_permute4x64_epi64(_mm256_packus_epi32(_mm256_cvtps_epi32(p), avx_px0), _MM_SHUFFLE(3,1,2,0)); - rpp_storeu_si64((__m128i *)(dstPtr), _mm256_packus_epi16(px1, avx_px0)); + px1 = _mm256_packus_epi16(px1, avx_px0); + rpp_store8_u8pln1_to_u8pln1_avx(dstPtr, px1); } inline void rpp_store24_f32pln3_to_u8pln3_avx(Rpp8u* dstRPtr, Rpp8u* dstGPtr, Rpp8u* dstBPtr, __m256* p)