Skip to content

Commit

Permalink
Use ARM Advanced SIMD (NEON) intrinsics where available
Browse files Browse the repository at this point in the history
NEON support is required on the AArch64 architecture for standard
implementations. Hardware designers for specialized markets can choose
not to support it, but that's true of floating point as well, which
we assume is supported. As with x86, some SIMD support is available
on 32-bit platforms, but those are not interesting from a performance
standpoint and would require an inconvenient runtime check.

Nathan Bossart

Reviewed by John Naylor, Andres Freund, Thomas Munro, and Tom Lane
Discussion: https://www.postgresql.org/message-id/flat/CAFBsxsEyR9JkfbPcDXBRYEfdfC__OkwVGdwEAgY4Rv0cvw35EA%40mail.gmail.com#aba7a64b11503494ffd8dd27067626a9
  • Loading branch information
j-naylor committed Aug 29, 2022
1 parent f8f19f7 commit 82739d4
Showing 1 changed file with 37 additions and 3 deletions.
40 changes: 37 additions & 3 deletions src/include/port/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,20 @@
typedef __m128i Vector8;
typedef __m128i Vector32;

#elif defined(__aarch64__) && defined(__ARM_NEON)
/*
* We use the Neon instructions if the compiler provides access to them (as
* indicated by __ARM_NEON) and we are on aarch64. While Neon support is
* technically optional for aarch64, it appears that all available 64-bit
* hardware does have it. Neon exists in some 32-bit hardware too, but we
* could not realistically use it there without a run-time check, which seems
* not worth the trouble for now.
*/
#include <arm_neon.h>
#define USE_NEON
typedef uint8x16_t Vector8;
typedef uint32x4_t Vector32;

#else
/*
* If no SIMD instructions are available, we can in some cases emulate vector
Expand Down Expand Up @@ -90,6 +104,8 @@ vector8_load(Vector8 *v, const uint8 *s)
{
#if defined(USE_SSE2)
*v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
*v = vld1q_u8(s);
#else
memcpy(v, s, sizeof(Vector8));
#endif
Expand All @@ -101,6 +117,8 @@ vector32_load(Vector32 *v, const uint32 *s)
{
#ifdef USE_SSE2
*v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
*v = vld1q_u32(s);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand All @@ -113,6 +131,8 @@ vector8_broadcast(const uint8 c)
{
#if defined(USE_SSE2)
return _mm_set1_epi8(c);
#elif defined(USE_NEON)
return vdupq_n_u8(c);
#else
return ~UINT64CONST(0) / 0xFF * c;
#endif
Expand All @@ -124,6 +144,8 @@ vector32_broadcast(const uint32 c)
{
#ifdef USE_SSE2
return _mm_set1_epi32(c);
#elif defined(USE_NEON)
return vdupq_n_u32(c);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand Down Expand Up @@ -153,7 +175,7 @@ vector8_has(const Vector8 v, const uint8 c)
#if defined(USE_NO_SIMD)
/* any bytes in v equal to c will evaluate to zero via XOR */
result = vector8_has_zero(v ^ vector8_broadcast(c));
#elif defined(USE_SSE2)
#else
result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
#endif

Expand All @@ -173,7 +195,7 @@ vector8_has_zero(const Vector8 v)
* circular definition.
*/
return vector8_has_le(v, 0);
#elif defined(USE_SSE2)
#else
return vector8_has(v, 0);
#endif
}
Expand Down Expand Up @@ -223,7 +245,7 @@ vector8_has_le(const Vector8 v, const uint8 c)
}
}
}
#elif defined(USE_SSE2)
#else

/*
* Use saturating subtraction to find bytes <= c, which will present as
Expand All @@ -245,6 +267,8 @@ vector8_is_highbit_set(const Vector8 v)
{
#ifdef USE_SSE2
return _mm_movemask_epi8(v) != 0;
#elif defined(USE_NEON)
return vmaxvq_u8(v) > 0x7F;
#else
return v & vector8_broadcast(0x80);
#endif
Expand All @@ -258,6 +282,8 @@ vector8_or(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
return vorrq_u8(v1, v2);
#else
return v1 | v2;
#endif
Expand All @@ -269,6 +295,8 @@ vector32_or(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
return vorrq_u32(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand All @@ -285,6 +313,8 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
return _mm_subs_epu8(v1, v2);
#elif defined(USE_NEON)
return vqsubq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand All @@ -299,6 +329,8 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
return _mm_cmpeq_epi8(v1, v2);
#elif defined(USE_NEON)
return vceqq_u8(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand All @@ -309,6 +341,8 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
return _mm_cmpeq_epi32(v1, v2);
#elif defined(USE_NEON)
return vceqq_u32(v1, v2);
#endif
}
#endif /* ! USE_NO_SIMD */
Expand Down

0 comments on commit 82739d4

Please sign in to comment.