Skip to content

Commit

Permalink
target/arm: Implement SVE2 saturating multiply-add high
Browse files Browse the repository at this point in the history
SVE2 has two additional sizes of the operation and unlike NEON,
there is no saturation flag.  Create new entry points for SVE2
that do not set QC.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210525010358.152808-36-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
  • Loading branch information
rth7680 authored and pm215 committed May 25, 2021
1 parent bfc9307 commit ab3ddf3
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 6 deletions.
17 changes: 17 additions & 0 deletions target/arm/helper.h
Expand Up @@ -591,6 +591,23 @@ DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)

/*
 * SVE2 saturating rounding doubling multiply-add/subtract high helpers
 * (sqrdmlah / sqrdmlsh), one entry point per element-size suffix
 * (_b, _h, _s, _d).  Each is declared with TCG_CALL_NO_RWG, returns void
 * and takes four pointer arguments followed by an i32.
 */
DEF_HELPER_FLAGS_5(sve2_sqrdmlah_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlah_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlah_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlah_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdmlsh_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)

DEF_HELPER_FLAGS_4(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
Expand Down
5 changes: 5 additions & 0 deletions target/arm/sve.decode
Expand Up @@ -1346,3 +1346,8 @@ SQDMLSLT_zzzw 01000100 .. 0 ..... 0110 11 ..... ..... @rda_rn_rm

SQDMLALBT 01000100 .. 0 ..... 00001 0 ..... ..... @rda_rn_rm
SQDMLSLBT 01000100 .. 0 ..... 00001 1 ..... ..... @rda_rn_rm

## SVE2 saturating multiply-add high

SQRDMLAH_zzzz 01000100 .. 0 ..... 01110 0 ..... ..... @rda_rn_rm
SQRDMLSH_zzzz 01000100 .. 0 ..... 01110 1 ..... ..... @rda_rn_rm
18 changes: 18 additions & 0 deletions target/arm/translate-sve.c
Expand Up @@ -7562,3 +7562,21 @@ static bool trans_SQDMLSLBT(DisasContext *s, arg_rrrr_esz *a)
{
return do_sqdmlsl_zzzw(s, a, false, true);
}

/*
 * Translate SQRDMLAH (vectors): select the out-of-line helper matching
 * the element size field and hand it to do_sve2_zzzz_ool().
 */
static bool trans_SQRDMLAH_zzzz(DisasContext *s, arg_rrrr_esz *a)
{
    /* One helper per a->esz value, in b/h/s/d order. */
    static gen_helper_gvec_4 * const by_esz[4] = {
        [0] = gen_helper_sve2_sqrdmlah_b,
        [1] = gen_helper_sve2_sqrdmlah_h,
        [2] = gen_helper_sve2_sqrdmlah_s,
        [3] = gen_helper_sve2_sqrdmlah_d,
    };
    gen_helper_gvec_4 *helper = by_esz[a->esz];

    return do_sve2_zzzz_ool(s, a, helper, 0);
}

/*
 * Translate SQRDMLSH (vectors): select the out-of-line helper matching
 * the element size field and hand it to do_sve2_zzzz_ool().
 */
static bool trans_SQRDMLSH_zzzz(DisasContext *s, arg_rrrr_esz *a)
{
    /* One helper per a->esz value, in b/h/s/d order. */
    static gen_helper_gvec_4 * const by_esz[4] = {
        [0] = gen_helper_sve2_sqrdmlsh_b,
        [1] = gen_helper_sve2_sqrdmlsh_h,
        [2] = gen_helper_sve2_sqrdmlsh_s,
        [3] = gen_helper_sve2_sqrdmlsh_d,
    };
    gen_helper_gvec_4 *helper = by_esz[a->esz];

    return do_sve2_zzzz_ool(s, a, helper, 0);
}
161 changes: 155 additions & 6 deletions target/arm/vec_helper.c
Expand Up @@ -22,6 +22,7 @@
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "vec_internal.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
Expand All @@ -36,19 +37,59 @@
#define H4(x) (x)
#endif

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
bool neg, bool round, uint32_t *sat)
/*
 * Signed saturating rounding doubling multiply-accumulate high half, 8-bit.
 *
 * @src1, @src2: the two multiplicands.
 * @src3: the accumulator element.
 * @neg: negate the product before accumulating (the "mlsh" form).
 * @round: add the rounding constant before taking the high half.
 *
 * Returns the result saturated to the int8_t range.  No saturation
 * flag is reported.
 */
static int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                            bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;

    if (neg) {
        ret = -ret;
    }
    /*
     * Scale the accumulator by multiplication: src3 may be negative and
     * left-shifting a negative value is undefined in ISO C.  The largest
     * magnitude here is 128 * 128 + 128 * 128 + 64, far inside int32_t.
     */
    ret += (int32_t)src3 * (1 << 7) + ((int32_t)round << 6);
    /* Arithmetic (sign-propagating) right shift is required here. */
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return (int8_t)ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int8_t *d = vd, *n = vn, *m = vm, *a = va;

for (i = 0; i < opr_sz; ++i) {
d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
}
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int8_t *d = vd, *n = vn, *m = vm, *a = va;

for (i = 0; i < opr_sz; ++i) {
d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
}
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
bool neg, bool round, uint32_t *sat)
{
/* Simplify similarly to do_sqrdmlah_b above. */
int32_t ret = (int32_t)src1 * src2;
if (neg) {
ret = -ret;
}
ret += ((int32_t)src3 << 15) + (round << 14);
ret >>= 15;

Expand Down Expand Up @@ -133,11 +174,35 @@ void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int16_t *d = vd, *n = vn, *m = vm, *a = va;
uint32_t discard;

for (i = 0; i < opr_sz / 2; ++i) {
d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
}
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int16_t *d = vd, *n = vn, *m = vm, *a = va;
uint32_t discard;

for (i = 0; i < opr_sz / 2; ++i) {
d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
}
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
bool neg, bool round, uint32_t *sat)
{
/* Simplify similarly to int_qrdmlah_s16 above. */
/* Simplify similarly to do_sqrdmlah_b above. */
int64_t ret = (int64_t)src1 * src2;
if (neg) {
ret = -ret;
Expand Down Expand Up @@ -220,6 +285,90 @@ void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int32_t *d = vd, *n = vn, *m = vm, *a = va;
uint32_t discard;

for (i = 0; i < opr_sz / 4; ++i) {
d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
}
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int32_t *d = vd, *n = vn, *m = vm, *a = va;
uint32_t discard;

for (i = 0; i < opr_sz / 4; ++i) {
d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
}
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */

/*
 * Narrow a signed 128-bit value to int64_t with saturation: if the high
 * half is not the sign-extension of the low half the value does not fit,
 * so clamp to INT64_MIN or INT64_MAX according to its sign.
 */
static int64_t do_sat128_d(Int128 r)
{
    int64_t lo = int128_getlo(r);
    int64_t hi = int128_gethi(r);
    int64_t sign_fill = lo >> 63;    /* 0 or -1, copying lo's sign bit */

    if (unlikely(hi != sign_fill)) {
        if (hi < 0) {
            return INT64_MIN;
        }
        return INT64_MAX;
    }
    return lo;
}

/*
 * 64-bit element form of the saturating rounding doubling
 * multiply-accumulate high half, evaluated in 128-bit arithmetic:
 *   ((a << 63) + (+/-)(n * m) + (round << 62)) >> 63, then saturated.
 *
 * @n, @m: the two multiplicands.
 * @a: the accumulator element.
 * @neg: negate the product before accumulating.
 * @round: add the rounding constant before the final shift.
 */
static int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a,
                             bool neg, bool round)
{
    uint64_t prod_lo, prod_hi;
    Int128 sum;

    /* Full 128-bit signed product of the two multiplicands. */
    muls64(&prod_lo, &prod_hi, m, n);
    sum = int128_make128(prod_lo, prod_hi);
    if (neg) {
        sum = int128_neg(sum);
    }

    /* Accumulator, pre-scaled to line up with the undoubled product. */
    if (a != 0) {
        sum = int128_add(sum, int128_lshift(int128_exts64(a), 63));
    }

    /* Rounding constant: half of the final shift's unit. */
    if (round) {
        sum = int128_add(sum, int128_exts64(1ll << 62));
    }

    return do_sat128_d(int128_rshift(sum, 63));
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int64_t *d = vd, *n = vn, *m = vm, *a = va;

for (i = 0; i < opr_sz / 8; ++i) {
d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
}
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
int64_t *d = vd, *n = vn, *m = vm, *a = va;

for (i = 0; i < opr_sz / 8; ++i) {
d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
}
}

/* Integer 8 and 16-bit dot-product.
*
* Note that for the loops herein, host endianness does not matter
Expand Down

0 comments on commit ab3ddf3

Please sign in to comment.