target/arm: Use clmul_8* routines
Use generic routines for 8-bit carry-less multiply.
Remove our local version of pmull_h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
rth7680 committed Sep 15, 2023
1 parent 07f348d commit 8e3da4c
Showing 3 changed files with 9 additions and 57 deletions.
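For reference, the generic routines this commit switches to are declared in crypto/clmul.h. A sketch of the relevant declarations, with prototypes inferred from the call sites in this diff (the comments are my gloss, not the header's):

    /* Eight independent 8x8->8 carry-less multiplies, one per byte. */
    uint64_t clmul_8x8_low(uint64_t, uint64_t);

    /* Four 8x8->16 carry-less multiplies on the even bytes of each input. */
    uint64_t clmul_8x4_even(uint64_t, uint64_t);

    /* As above, but on the odd bytes of each input. */
    uint64_t clmul_8x4_odd(uint64_t, uint64_t);

    /* Four 8x8->16 carry-less multiplies on four packed bytes. */
    uint64_t clmul_8x4_packed(uint32_t, uint32_t);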
8 changes: 3 additions & 5 deletions target/arm/tcg/mve_helper.c
@@ -26,6 +26,7 @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "fpu/softfloat.h"
+#include "crypto/clmul.h"
 
 static uint16_t mve_eci_mask(CPUARMState *env)
 {
@@ -984,15 +985,12 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
  * Polynomial multiply. We can always do this generating 64 bits
  * of the result at a time, so we don't need to use DO_2OP_L.
  */
-#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
 #define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
-#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
 #define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
 #define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
 
-DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
-DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
+DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
+DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
 DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
 DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
 
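The new DO_2OP bindings work because clmul_8x4_even/_odd fold in the masking and shifting that VMULLPH_MASK and the DO_VMULLP[BT]H wrappers did by hand. Composing the removed pmull_h loop (see vec_helper.c below) with that masking gives a sketch of what the generic helpers must compute; the real implementations live in crypto/clmul.c and may differ in detail:

    uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
    {
        uint64_t r = 0;

        /* Discard the odd bytes, leaving an 8-bit value per 16-bit lane. */
        n &= 0x00ff00ff00ff00ffull;
        m &= 0x00ff00ff00ff00ffull;

        /* Shift-and-xor: lane-wise carry-less multiply, one bit at a time. */
        for (int i = 0; i < 8; ++i) {
            uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
            r ^= m & mask;
            n >>= 1;
            m <<= 1;
        }
        return r;
    }

    uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
    {
        /* The odd bytes are the even bytes of the byte-shifted inputs. */
        return clmul_8x4_even(n >> 8, m >> 8);
    }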
53 changes: 6 additions & 47 deletions target/arm/tcg/vec_helper.c
@@ -23,6 +23,7 @@
 #include "tcg/tcg-gvec-desc.h"
 #include "fpu/softfloat.h"
 #include "qemu/int128.h"
+#include "crypto/clmul.h"
 #include "vec_internal.h"
 
 /*
@@ -1986,21 +1987,11 @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
  */
 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
 {
-    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t i, opr_sz = simd_oprsz(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = n[i];
-        uint64_t mm = m[i];
-        uint64_t rr = 0;
-
-        for (j = 0; j < 8; ++j) {
-            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
-            rr ^= mm & mask;
-            mm = (mm << 1) & 0xfefefefefefefefeull;
-            nn >>= 1;
-        }
-        d[i] = rr;
+        d[i] = clmul_8x8_low(n[i], m[i]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
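The inner loop deleted here is itself the reference bit-serial algorithm for eight independent 8x8->8 carry-less multiplies, so the replacement can be read as moving that loop behind a generic name. A sketch of what clmul_8x8_low computes, assuming the generic code keeps the same approach:

    uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
    {
        uint64_t r = 0;

        for (int i = 0; i < 8; ++i) {
            /* Broadcast bit i of each byte of n across its whole byte. */
            uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
            r ^= m & mask;                        /* xor in m where set */
            m = (m << 1) & 0xfefefefefefefefeull; /* shift within bytes */
            n >>= 1;
        }
        return r;
    }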
@@ -2038,22 +2029,6 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }

-/*
- * 8x8->16 polynomial multiply.
- *
- * The byte inputs are expanded to (or extracted from) half-words.
- * Note that neon and sve2 get the inputs from different positions.
- * This allows 4 bytes to be processed in parallel with uint64_t.
- */
-
-static uint64_t expand_byte_to_half(uint64_t x)
-{
-    return (x & 0x000000ff)
-         | ((x & 0x0000ff00) << 8)
-         | ((x & 0x00ff0000) << 16)
-         | ((x & 0xff000000) << 24);
-}
-
 uint64_t pmull_w(uint64_t op1, uint64_t op2)
 {
     uint64_t result = 0;
@@ -2067,29 +2042,16 @@ uint64_t pmull_w(uint64_t op1, uint64_t op2)
     return result;
 }

-uint64_t pmull_h(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-    for (i = 0; i < 8; ++i) {
-        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
-        result ^= op2 & mask;
-        op1 >>= 1;
-        op2 <<= 1;
-    }
-    return result;
-}
-
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
     uint64_t nn = n[hi], mm = m[hi];
 
-    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[0] = clmul_8x4_packed(nn, mm);
     nn >>= 32;
     mm >>= 32;
-    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[1] = clmul_8x4_packed(nn, mm);
 
     clear_tail(d, 16, simd_maxsz(desc));
 }
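Here the four input bytes arrive packed in the low 32 bits of nn and mm, so clmul_8x4_packed must first widen them to one byte per 16-bit lane, which is exactly the job of the removed expand_byte_to_half(). A sketch under that assumption (the widening helper's name here is illustrative):

    static uint64_t unpack_8_to_16(uint32_t x)
    {
        /* Mirrors the removed expand_byte_to_half(). */
        return (x & 0x000000ff)
             | ((uint64_t)(x & 0x0000ff00) << 8)
             | ((uint64_t)(x & 0x00ff0000) << 16)
             | ((uint64_t)(x & 0xff000000) << 24);
    }

    uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
    {
        return clmul_8x4_even(unpack_8_to_16(n), unpack_8_to_16(m));
    }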
@@ -2102,10 +2064,7 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
-        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
-
-        d[i] = pmull_h(nn, mm);
+        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
     }
 }

5 changes: 0 additions & 5 deletions target/arm/tcg/vec_internal.h
@@ -219,11 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
 int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
 int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
 
-/*
- * 8 x 8 -> 16 vector polynomial multiply where the inputs are
- * in the low 8 bits of each 16-bit element
- */
-uint64_t pmull_h(uint64_t op1, uint64_t op2);
 /*
  * 16 x 16 -> 32 vector polynomial multiply where the inputs are
  * in the low 16 bits of each 32-bit element
