Skip to content

Commit

Permalink
sse, sse2: add range checks to several conversion functions
Browse files Browse the repository at this point in the history
For values outside of (INT32_MIN, INT32_MAX), as well as NaNs, x86
returns INT32_MIN.

I added a SIMDE_FAST_CONVERSION_RANGE macro (which is defined by
default if SIMDE_FAST_MATH is defined) to opt out of the more accurate
behavior since it's also substantially slower on many platforms.

Fixes #685
  • Loading branch information
nemequ committed Dec 31, 2020
1 parent 31983d2 commit c3d7abf
Show file tree
Hide file tree
Showing 6 changed files with 530 additions and 334 deletions.
10 changes: 10 additions & 0 deletions simde/simde-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,16 @@
#define SIMDE_FAST_ROUND_TIES
#endif

/* Some conversion functions (chiefly floating point to integer) have
 * to range-check their input and return a different result when the
 * value falls outside the expected range. Omitting that check can be
 * faster, but the result is then less faithful to the API being
 * emulated.
 *
 * SIMDE_FAST_CONVERSION_RANGE requests the unchecked behavior. It is
 * turned on implicitly by SIMDE_FAST_MATH, unless the user has already
 * defined it or has defined SIMDE_NO_FAST_CONVERSION_RANGE. */
#if defined(SIMDE_FAST_MATH)
  #if !defined(SIMDE_NO_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_CONVERSION_RANGE)
    #define SIMDE_FAST_CONVERSION_RANGE
  #endif
#endif

#if \
HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
Expand Down
32 changes: 24 additions & 8 deletions simde/x86/sse.h
Original file line number Diff line number Diff line change
Expand Up @@ -1948,12 +1948,18 @@ simde_mm_cvtps_pi32 (simde__m128 a) {
simde__m64_private r_;
simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32)));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_roundf(a_.f32[i]));
simde_float32 v = simde_math_roundf(a_.f32[i]);
#if !defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#else
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#endif
}
#endif

Expand Down Expand Up @@ -2161,14 +2167,18 @@ simde_mm_cvtt_ps2pi (simde__m128 a) {
simde__m64_private r_;
simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
#elif defined(SIMDE_CONVERT_VECTOR_)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
simde_float32 v = a_.f32[i];
#if !defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#else
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#endif
}
#endif

Expand All @@ -2189,10 +2199,16 @@ simde_mm_cvtt_ss2si (simde__m128 a) {
#else
simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
#else
return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
simde_float32 v = a_.f32[0];
#if !defined(SIMDE_FAST_CONVERSION_RANGE)
return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#else
return SIMDE_CONVERT_FTOI(int32_t, v);
#endif
#endif
#endif
}
Expand Down
138 changes: 84 additions & 54 deletions simde/x86/sse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -2515,46 +2515,48 @@ simde_mm_cvtepi32_ps (simde__m128i a) {
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvtpd_epi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
return _mm_cvtpd_epi32(a);
simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
return _mm_cvtpd_pi32(a);
#else
simde__m128i_private r_;
simde__m64_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i]));
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
simde_float64 v = simde_math_round(a_.f64[i]);
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));

return simde__m128i_from_private(r_);
return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
#define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvtpd_pi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
return _mm_cvtpd_pi32(a);
simde__m128i
simde_mm_cvtpd_epi32 (simde__m128d a) {
#if defined(SIMDE_X86_SSE2_NATIVE)
return _mm_cvtpd_epi32(a);
#else
simde__m64_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);
simde__m128i_private r_;

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyint(a_.f64[i]));
}
r_.m64[0] = simde_mm_cvtpd_pi32(a);
r_.m64[1] = simde_mm_setzero_si64();

return simde__m64_from_private(r_);
return simde__m128i_from_private(r_);
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
#define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
#define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
Expand Down Expand Up @@ -2620,11 +2622,11 @@ simde_mm_cvtps_epi32 (simde__m128 a) {
simde__m128i_private r_;
simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_ROUND_TIES)
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_ROUND_TIES)
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
Expand All @@ -2634,7 +2636,13 @@ simde_mm_cvtps_epi32 (simde__m128 a) {
a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f32[i]);
simde_float32 v = simde_math_roundf(a_.f32[i]);
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
#endif

Expand Down Expand Up @@ -2679,7 +2687,14 @@ simde_mm_cvtsd_si32 (simde__m128d a) {
return _mm_cvtsd_si32(a);
#else
simde__m128d_private a_ = simde__m128d_to_private(a);
return SIMDE_CONVERT_FTOI(int32_t, simde_math_round(a_.f64[0]));

simde_float64 v = simde_math_round(a_.f64[0]);
#if defined(SIMDE_FAST_CONVERSION_RANGE)
return SIMDE_CONVERT_FTOI(int32_t, v);
#else
return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
Expand Down Expand Up @@ -2962,27 +2977,6 @@ simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) {
#define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
#endif

/* Convert the two double-precision lanes of `a` to 32-bit signed
 * integers and store them in the low 64 bits of the result; the high
 * 64 bits are zeroed.
 *
 * Unless SIMDE_FAST_CONVERSION_RANGE is defined, a lane that is NaN or
 * lies outside (-2^31 - 1, 2^31) yields INT32_MIN instead of being
 * converted: a direct C conversion of such a value is undefined
 * behavior, and INT32_MIN is what x86 returns for these inputs. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttpd_epi32 (simde__m128d a) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    return _mm_cvttpd_epi32(a);
  #else
    simde__m128i_private r_;
    simde__m128d_private a_ = simde__m128d_to_private(a);

    for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) {
      #if defined(SIMDE_FAST_CONVERSION_RANGE)
        r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
      #else
        /* Both bounds are exactly representable as doubles.  Every value
         * strictly between them has an integer part that fits in an
         * int32_t; NaN fails both comparisons and falls through to
         * INT32_MIN. */
        r_.i32[i] = ((a_.f64[i] > -2147483649.0) && (a_.f64[i] < 2147483648.0)) ?
          SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]) : INT32_MIN;
      #endif
    }
    /* Only the two low i32 lanes were written above; clear the upper half. */
    simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64
simde_mm_cvttpd_pi32 (simde__m128d a) {
Expand All @@ -2992,11 +2986,17 @@ simde_mm_cvttpd_pi32 (simde__m128d a) {
simde__m64_private r_;
simde__m128d_private a_ = simde__m128d_to_private(a);

#if defined(SIMDE_CONVERT_VECTOR_)
#if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
#else
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
simde_float64 v = a_.f64[i];
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
#endif

Expand All @@ -3007,6 +3007,24 @@ simde_mm_cvttpd_pi32 (simde__m128d a) {
#define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
#endif

/* 128-bit form of the double -> int32 conversion provided by
 * simde_mm_cvttpd_pi32: the low 64 bits of the result are whatever
 * simde_mm_cvttpd_pi32 returns for `a`, and the high 64 bits are zero.
 * When SSE2 is available natively the call is forwarded to
 * _mm_cvttpd_epi32 instead. */
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttpd_epi32 (simde__m128d a) {
  #if !defined(SIMDE_X86_SSE2_NATIVE)
    simde__m128i_private result_;

    result_.m64[1] = simde_mm_setzero_si64();
    result_.m64[0] = simde_mm_cvttpd_pi32(a);

    return simde__m128i_from_private(result_);
  #else
    return _mm_cvttpd_epi32(a);
  #endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
  #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_cvttps_epi32 (simde__m128 a) {
Expand All @@ -3016,13 +3034,19 @@ simde_mm_cvttps_epi32 (simde__m128 a) {
simde__m128i_private r_;
simde__m128_private a_ = simde__m128_to_private(a);

#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
#elif defined(SIMDE_CONVERT_VECTOR_)
#elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
#else
for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
simde_float32 v = a_.f32[i];
#if defined(SIMDE_FAST_CONVERSION_RANGE)
r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
#else
r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
}
#endif

Expand All @@ -3040,7 +3064,13 @@ simde_mm_cvttsd_si32 (simde__m128d a) {
return _mm_cvttsd_si32(a);
#else
simde__m128d_private a_ = simde__m128d_to_private(a);
return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
simde_float64 v = a_.f64[0];
#if defined(SIMDE_FAST_CONVERSION_RANGE)
return SIMDE_CONVERT_FTOI(int32_t, v);
#else
return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ?
SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN;
#endif
#endif
}
#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
Expand Down
40 changes: 40 additions & 0 deletions test/x86/skel.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,46 @@ test_simde_mm_xxx_pd (SIMDE_MUNIT_TEST_ARGS) {
#endif
}

/* Skeleton test for a two-operand double-precision function; the name
 * simde_mm_xxx_pd is a placeholder.  The body has two mutually
 * exclusive halves selected by the `#if 0` below:
 *   - the disabled half compares simde_mm_xxx_pd(a, b) against expected
 *     results stored in test_vec;
 *   - the active half generates random inputs, computes the result and
 *     writes a, b and r out, then returns 1. */
static int
test_simde_mm_xxx_pd (SIMDE_MUNIT_TEST_ARGS) {
#if 0
/* Verification half: one entry per case, holding both inputs and the
 * expected result.  The table is empty in this skeleton. */
static const struct {
const simde_float64 a[2];
const simde_float64 b[2];
const simde_float64 r[2];
} test_vec[] = {
/* Slot for entries that are only compiled when SIMDE_FAST_NANS is not
 * defined. */
#if !defined(SIMDE_FAST_NANS)

#endif

};

for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
simde__m128d a = simde_mm_loadu_pd(test_vec[i].a);
simde__m128d b = simde_mm_loadu_pd(test_vec[i].b);
simde__m128d r = simde_mm_xxx_pd(a, b);
/* NOTE(review): the trailing 1 is presumably a precision/tolerance
 * argument -- confirm against simde_test_x86_assert_equal_f64x2. */
simde_test_x86_assert_equal_f64x2(r, simde_mm_loadu_pd(test_vec[i].r), 1);
}

return 0;
#else
/* Generation half: produce 8 random (a, b) pairs and write each pair
 * together with the computed result. */
fputc('\n', stdout);
/* NOTE(review): the element count uses sizeof(simde__m128d) (a byte
 * size); this looks larger than needed if each vector consumes two
 * doubles -- confirm against simde_test_x86_random_f64x2_full. */
simde_float64 values[8 * 2 * sizeof(simde__m128d)];
simde_test_x86_random_f64x2_full(8, 2, values, -1000.0, 1000.0, SIMDE_TEST_VEC_FLOAT_NAN);

for (size_t i = 0 ; i < 8 ; i++) {
simde__m128d a = simde_test_x86_random_extract_f64x2(i, 2, 0, values);
simde__m128d b = simde_test_x86_random_extract_f64x2(i, 2, 1, values);
simde__m128d r = simde_mm_xxx_pd(a, b);

simde_test_x86_write_f64x2(2, a, SIMDE_TEST_VEC_POS_FIRST);
simde_test_x86_write_f64x2(2, b, SIMDE_TEST_VEC_POS_MIDDLE);
simde_test_x86_write_f64x2(2, r, SIMDE_TEST_VEC_POS_LAST);
}
/* Presumably the nonzero return reports failure so that a test left in
 * generation mode is noticed -- TODO confirm. */
return 1;
#endif
}

static int
test_simde_mm_xxx_epi8 (SIMDE_MUNIT_TEST_ARGS) {
#if 0
Expand Down
Loading

0 comments on commit c3d7abf

Please sign in to comment.