[Quad] Add trigonometric functions #240

Merged
merged 4 commits on Mar 4, 2019
20 changes: 20 additions & 0 deletions src/arch/helperadvsimd.h
@@ -665,6 +665,20 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
}

static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
vld1q_u32((uint32_t *)p),
vld1q_u32((uint32_t *)((uint8_t *)p + sizeof(vmask)))
};
return vm2;
}

static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
union {
vargquad aq;
@@ -705,3 +719,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {

#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))

static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi))))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }
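
Across the helpers, the new vcast_vm_vi/vcast_vi_vm pair widens each 32-bit integer lane to a 64-bit lane with sign extension and narrows it back by keeping only the low 32 bits; the AdvSIMD variant above builds the sign extension by ORing an all-ones upper word into lanes that compare below zero. A per-lane scalar sketch of that intended behaviour (the model_* names are illustrative, not SLEEF API):

/* Scalar model of the new casts: sign-extend 32 -> 64, then truncate 64 -> 32. */
#include <assert.h>
#include <stdint.h>
static int64_t model_vcast_vm_vi(int32_t x) { return (int64_t)x; }
static int32_t model_vcast_vi_vm(int64_t m) { return (int32_t)(uint32_t)m; }
int main(void) {
  assert((uint64_t)model_vcast_vm_vi(-2) == UINT64_C(0xfffffffffffffffe));  /* upper word filled with ones */
  assert(model_vcast_vi_vm(INT64_C(0x123400000007)) == 7);                  /* upper word discarded */
  return 0;
}
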
43 changes: 43 additions & 0 deletions src/arch/helperavx.h
@@ -575,6 +575,38 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };
}

static INLINE vint vuninterleave_vi_vi(vint v) {
return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
double tmp[4];
vstoreu_v_p_vd(tmp, vd);
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
return vloadu_vd_p(tmp);
}

static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
double tmp[4];
vstoreu_v_p_vd(tmp, vd);
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
return vloadu_vd_p(tmp);
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
double tmp[4];
vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
return vreinterpret_vm_vd(vloadu_vd_p(tmp));
}

static INLINE vmask vuninterleave_vm_vm(vmask vm) {
double tmp[4];
vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));
double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;
return vreinterpret_vm_vd(vloadu_vd_p(tmp));
}

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
vcast_vm_vi2(vloadu_vi2_p((int32_t *)p)),
@@ -645,3 +677,14 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
#define vsrl64_vm_vm_i(x, c) \
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
_mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)

static INLINE vmask vcast_vm_vi(vint vi) {
vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1)))), m);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
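
The vinterleave_vd_vd/vuninterleave_vd_vd pair reorders double lanes so that lane i of the lower half ends up next to lane i of the upper half; with four lanes that is the swap of lanes 1 and 2 which the AVX version above performs through a store/swap/reload round-trip, and since the permutation is its own inverse the interleave and uninterleave bodies coincide. A 4-lane scalar sketch (the model_* name is illustrative, not SLEEF API):

/* 4-lane model: {a0, a1, a2, a3} -> {a0, a2, a1, a3}; applying it twice restores the input. */
void model_interleave4(double r[4], const double a[4]) {
  r[0] = a[0]; r[1] = a[2]; r[2] = a[1]; r[3] = a[3];
}
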
26 changes: 26 additions & 0 deletions src/arch/helperavx2.h
@@ -428,6 +428,26 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) };
}

static INLINE vint vuninterleave_vi_vi(vint v) {
return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));
}

static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));
}

static INLINE vmask vuninterleave_vm_vm(vmask vm) {
return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));
}

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
vloadu_vi2_p((int32_t *)p),
@@ -481,3 +501,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi

#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)

static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); }
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}
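
On AVX2 the same lane swap stays in registers: the immediate (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0) passed to _mm256_permute4x64_epi64 equals 0xD8 and selects source lanes 0, 2, 1, 3, so the identical constant serves both vinterleave and vuninterleave. A small standalone check of that immediate decoding (illustrative only, not part of the diff):

#include <assert.h>
int main(void) {
  const int imm = (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0);   /* 0xD8 */
  int lane[4];
  for (int i = 0; i < 4; i++) lane[i] = (imm >> (2 * i)) & 3;  /* result lane i <- source lane[i] */
  assert(lane[0] == 0 && lane[1] == 2 && lane[2] == 1 && lane[3] == 3);
  return 0;
}
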
27 changes: 27 additions & 0 deletions src/arch/helperavx512f.h
@@ -545,6 +545,26 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) };
}

static INLINE vint vuninterleave_vi_vi(vint v) {
return _mm256_permutevar8x32_epi32(v, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vreinterpret_vm_vd(vd)));
}

static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vreinterpret_vm_vd(vd)));
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vm);
}

static INLINE vmask vuninterleave_vm_vm(vmask vm) {
return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vm);
}

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
vloadu_vi2_p((int32_t *)p),
@@ -600,3 +620,10 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64

#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)

static INLINE vmask vcast_vm_vi(vint vi) {
return _mm512_cvtepi32_epi64(vi);
}
static INLINE vint vcast_vi_vm(vmask vm) {
return _mm512_cvtepi64_epi32(vm);
}
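
With eight 64-bit lanes the two AVX-512 permutation vectors above are no longer identical: read as 64-bit lanes, vinterleave maps {0..7} to {0,4,1,5,2,6,3,7} (zip the two halves) while vuninterleave maps to {0,2,4,6,1,3,5,7} (split the even and odd lanes back out), and the two are inverses of each other. An 8-lane scalar sketch (the model_* names are illustrative, not SLEEF API):

/* 8-lane models; applying model_interleave8 and then model_uninterleave8 restores the original order. */
void model_interleave8(double r[8], const double a[8]) {
  for (int i = 0; i < 4; i++) { r[2*i] = a[i]; r[2*i + 1] = a[i + 4]; }
}
void model_uninterleave8(double r[8], const double a[8]) {
  for (int i = 0; i < 4; i++) { r[i] = a[2*i]; r[i + 4] = a[2*i + 1]; }
}
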
8 changes: 8 additions & 0 deletions src/arch/helperpurec_scalar.h
@@ -379,6 +379,11 @@ typedef Sleef_quad1 vargquad;

static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return v; }
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return v; }
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
union {
@@ -408,3 +413,6 @@ static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))

static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }

static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }
12 changes: 12 additions & 0 deletions src/arch/helpersse2.h
@@ -454,6 +454,12 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) };
}

static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
vloadu_vi2_p((int32_t *)p),
@@ -513,3 +519,9 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
_mm_storeu_si128((__m128i *)ay, y);
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
}

static INLINE vmask vcast_vm_vi(vint vi) {
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
return vor_vm_vm_vm(vcastu_vi2_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
33 changes: 33 additions & 0 deletions src/arch/helpersve.h
@@ -791,6 +791,36 @@ static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
svreinterpret_s32_u64(svtrn2_u64(svreinterpret_u64_s32(v.x), svreinterpret_u64_s32(v.y))) };
}

static INLINE vint vuninterleave_vi_vi(vint v) {
return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v)),
svtrn2_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v))));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
return svtrn1_f64(svzip1_f64(vd, vd), svzip2_f64(vd, vd));
}

static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
return svuzp1_f64(svtrn1_f64(vd, vd), svtrn2_f64(vd, vd));
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
return svreinterpret_s32_u64(svtrn1_u64(svzip1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
svzip2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}
static INLINE vmask vuninterleave_vm_vm(vmask vm) {
return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
svtrn2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}

static vmask2 vloadu_vm2_p(void *p) {
vmask2 vm2 = {
svld1_s32(ptrue, (int32_t *)p),
svld1_s32(ptrue, (int32_t *)((uint8_t *)p + 8 * svcntd()))
};
return vm2;
}

static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
return vinterleave_vm2_vm2((vmask2) { svld1_s32(ptrue, (int32_t *)&aq), svld1_s32(ptrue, (int32_t *)&(aq.s[svcntd()/2])) });
}
@@ -827,3 +857,6 @@ static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {

#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))

static INLINE vmask vcast_vm_vi(vint vi) { return svreinterpret_s32_s64(svextw_s64_z(ptrue, svreinterpret_s64_s32(vi))); }
static INLINE vint vcast_vi_vm(vmask vm) { return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff)); }
2 changes: 1 addition & 1 deletion src/common/misc.h
@@ -172,7 +172,7 @@ typedef struct {
#if defined(ENABLEFLOAT128)
typedef __float128 Sleef_quad;
#else
typedef struct { uint64_t x, y; } Sleef_quad;
typedef struct { double x, y; } Sleef_quad;
#endif
#endif

78 changes: 78 additions & 0 deletions src/quad-tester/qiutsimd.c
@@ -171,6 +171,61 @@ typedef union {
} \
}

#define func_i_q_q(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
sentinel = 0; \
int lane = xrand() % VECTLENDP; \
cnv128 c0, c1; \
sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64 " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l, &c1.h, &c1.l); \
vargquad a0, a1; \
memrand(&a0, sizeof(vargquad)); \
memrand(&a1, sizeof(vargquad)); \
a0.s[lane] = c0.q; \
a1.s[lane] = c1.q; \
vint vi = funcName(a0, a1); \
int t[VECTLENDP]; \
vstoreu_v_p_vi(t, vi); \
printf("%d\n", t[lane]); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}

#define func_d_q(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
sentinel = 0; \
int lane = xrand() % VECTLENDP; \
cnv128 c0; \
sscanf(buf, funcStr " %" PRIx64 ":%" PRIx64, &c0.h, &c0.l); \
vargquad a0; \
memrand(&a0, sizeof(vargquad)); \
a0.s[lane] = c0.q; \
double d[VECTLENDP]; \
vstoreu_v_p_vd(d, funcName(a0)); \
printf("%" PRIx64 "\n", d2u(d[lane])); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}

#define func_q_d(funcStr, funcName) { \
while (startsWith(buf, funcStr " ")) { \
sentinel = 0; \
int lane = xrand() % VECTLENDP; \
uint64_t u; \
sscanf(buf, funcStr " %" PRIx64, &u); \
double s[VECTLENDP]; \
memrand(s, sizeof(s)); \
s[lane] = u2d(u); \
vargquad a0 = funcName(vloadu_vd_p(s)); \
cnv128 c0; \
c0.q = a0.s[lane]; \
printf("%" PRIx64 ":%" PRIx64 "\n", c0.h, c0.l); \
fflush(stdout); \
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
} \
}

#define func_strtoq(funcStr) { \
while (startsWith(buf, funcStr " ")) { \
sentinel = 0; \
@@ -224,7 +279,30 @@ int do_test(int argc, char **argv) {
func_q_q_q("mulq_u05", xmulq_u05);
func_q_q_q("divq_u05", xdivq_u05);
func_q_q("sqrtq_u05", xsqrtq_u05);
func_q_q("sinq_u10", xsinq_u10);
func_q_q("cosq_u10", xcosq_u10);
func_q_q("tanq_u10", xtanq_u10);
func_q_q("asinq_u10", xasinq_u10);
func_q_q("acosq_u10", xacosq_u10);
func_q_q("atanq_u10", xatanq_u10);
func_q_q("expq_u10", xexpq_u10);
func_q_q("exp2q_u10", xexp2q_u10);
func_q_q("exp10q_u10", xexp10q_u10);
func_q_q("expm1q_u10", xexpm1q_u10);
func_q_q("logq_u10", xlogq_u10);
func_q_q("log2q_u10", xlog2q_u10);
func_q_q("log10q_u10", xlog10q_u10);
func_q_q("log1pq_u10", xlog1pq_u10);
func_q_q("negq", xnegq);
func_q_d("cast_from_doubleq", xcast_from_doubleq);
func_d_q("cast_to_doubleq", xcast_to_doubleq);
func_i_q_q("cmpltq", xcmpltq);
func_i_q_q("cmpgtq", xcmpgtq);
func_i_q_q("cmpleq", xcmpleq);
func_i_q_q("cmpgeq", xcmpgeq);
func_i_q_q("cmpeqq", xcmpeqq);
func_i_q_q("cmpneqq", xcmpneqq);
func_i_q_q("unordq", xunordq);
func_strtoq("strtoq");
func_qtostr("qtostr");
sentinel++;
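
The new tester macros above speak the harness's line-oriented text protocol: each request names the function, passes each quad operand as two colon-separated 64-bit hex words, and expects one hex result line back. As a worked example for cast_to_doubleq (assuming cnv128's h and l members hold the high and low halves of the binary128 value), quad 1.0 is sent as 3fff000000000000:0000000000000000 and the expected reply is the bit pattern of double 1.0; a small standalone check of that reply:

#include <inttypes.h>
#include <stdio.h>
#include <string.h>
int main(void) {
  /* request : cast_to_doubleq 3fff000000000000:0000000000000000   (binary128 value 1.0)       */
  /* expected: 3ff0000000000000                                    (bit pattern of double 1.0) */
  double d = 1.0;
  uint64_t bits;
  memcpy(&bits, &d, sizeof bits);
  printf("%" PRIx64 "\n", bits);
  return 0;
}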