Skip to content

Commit

Permalink
target/ppc: Move V(ADD|SUB)CUW to decodetree and use gvec
Browse files Browse the repository at this point in the history
This patch moves VADDCUW and VSUBCUW to decodetree with gvec, using an
implementation based on the helper. The main difference is that the -1
(i.e. all bits set to 1) result returned by cmp when the comparison is
true is changed to +1. It also implements a .fni4 version of those
instructions and drops the helpers.

vaddcuw:
rept    loop    master             patch
8       12500   0,01008200         0,00612400 (-39.3%)
25      4000    0,01091500         0,00471600 (-56.8%)
100     1000    0,01332500         0,00593700 (-55.4%)
500     200     0,01998500         0,01275700 (-36.2%)
2500    40      0,04704300         0,04364300 (-7.2%)
8000    12      0,10748200         0,11241000 (+4.6%)

vsubcuw:
rept    loop    master             patch
8       12500   0,01226200         0,00571600 (-53.4%)
25      4000    0,01493500         0,00462100 (-69.1%)
100     1000    0,01522700         0,00455100 (-70.1%)
500     200     0,02384600         0,01133500 (-52.5%)
2500    40      0,04935200         0,03178100 (-35.6%)
8000    12      0,09039900         0,09440600 (+4.4%)

Overall there was a gain in performance, although the number of TCG ops
generated is slightly larger in the new version (it went from 4 to 5).

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20221019125040.48028-4-lucas.araujo@eldorado.org.br>
Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
  • Loading branch information
Lucas Mateus Castro (alqotel) authored and danielhb committed Oct 28, 2022
1 parent 306e475 commit 611bc69
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 26 deletions.
2 changes: 0 additions & 2 deletions target/ppc/helper.h
Expand Up @@ -193,11 +193,9 @@ DEF_HELPER_FLAGS_3(vslo, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_3(vaddcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
DEF_HELPER_FLAGS_3(vsubcuw, TCG_CALL_NO_RWG, void, avr, avr, avr)
DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
DEF_HELPER_FLAGS_5(vaddsws, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
Expand Down
2 changes: 2 additions & 0 deletions target/ppc/insn32.decode
Expand Up @@ -608,12 +608,14 @@ VRLQNM 000100 ..... ..... ..... 00101000101 @VX

## Vector Integer Arithmetic Instructions

VADDCUW 000100 ..... ..... ..... 00110000000 @VX
VADDCUQ 000100 ..... ..... ..... 00101000000 @VX
VADDUQM 000100 ..... ..... ..... 00100000000 @VX

VADDEUQM 000100 ..... ..... ..... ..... 111100 @VA
VADDECUQ 000100 ..... ..... ..... ..... 111101 @VA

VSUBCUW 000100 ..... ..... ..... 10110000000 @VX
VSUBCUQ 000100 ..... ..... ..... 10101000000 @VX
VSUBUQM 000100 ..... ..... ..... 10100000000 @VX

Expand Down
18 changes: 0 additions & 18 deletions target/ppc/int_helper.c
Expand Up @@ -492,15 +492,6 @@ static inline void set_vscr_sat(CPUPPCState *env)
env->vscr_sat.u32[0] = 1;
}

/*
 * vaddcuw helper: for every u32 element, write 1 to r when the unsigned
 * sum a + b would carry out of the element, and 0 otherwise.
 */
void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    int lane = 0;

    while (lane < ARRAY_SIZE(r->u32)) {
        /* The sum wraps exactly when b exceeds the headroom above a (~a). */
        r->u32[lane] = (b->u32[lane] > ~a->u32[lane]) ? 1 : 0;
        lane++;
    }
}

/* vprtybw */
void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
{
Expand Down Expand Up @@ -1962,15 +1953,6 @@ void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
#endif
}

/*
 * vsubcuw helper: for every u32 element, write 1 to r when a >= b
 * (unsigned compare), and 0 when a < b.
 */
void helper_vsubcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    int lane = 0;

    while (lane < ARRAY_SIZE(r->u32)) {
        r->u32[lane] = (a->u32[lane] < b->u32[lane]) ? 0 : 1;
        lane++;
    }
}

void helper_vsumsws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
int64_t t;
Expand Down
61 changes: 57 additions & 4 deletions target/ppc/translate/vmx-impl.c.inc
Expand Up @@ -803,8 +803,6 @@ GEN_VXFORM(vsrv, 2, 28);
GEN_VXFORM(vslv, 2, 29);
GEN_VXFORM(vslo, 6, 16);
GEN_VXFORM(vsro, 6, 17);
GEN_VXFORM(vaddcuw, 0, 6);
GEN_VXFORM(vsubcuw, 0, 22);

static bool do_vector_gvec3_VX(DisasContext *ctx, arg_VX *a, int vece,
void (*gen_gvec)(unsigned, uint32_t, uint32_t,
Expand Down Expand Up @@ -2847,8 +2845,6 @@ static void gen_xpnd04_2(DisasContext *ctx)
}


GEN_VXFORM_DUAL(vsubcuw, PPC_ALTIVEC, PPC_NONE, \
xpnd04_1, PPC_NONE, PPC2_ISA300)
GEN_VXFORM_DUAL(vsubsws, PPC_ALTIVEC, PPC_NONE, \
xpnd04_2, PPC_NONE, PPC2_ISA300)

Expand Down Expand Up @@ -3110,6 +3106,63 @@ TRANS_FLAGS2(ALTIVEC_207, VPMSUMD, do_vx_helper, gen_helper_VPMSUMD)
TRANS_FLAGS2(ALTIVEC_207, VSUBCUQ, do_vx_helper, gen_helper_VSUBCUQ)
TRANS_FLAGS2(ALTIVEC_207, VSUBUQM, do_vx_helper, gen_helper_VSUBUQM)

/*
 * Vector expansion for VADDCUW: t = (~a <u b), reduced to 0/1 per element.
 *
 * The compare result is ANDed with a vector of 1s so that a "true"
 * element ends up as +1 instead of all bits set.
 * Note that a is overwritten with its bitwise complement.
 */
static void gen_VADDCUW_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec one;

    tcg_gen_not_vec(vece, a, a);
    tcg_gen_cmp_vec(TCG_COND_LTU, vece, t, a, b);

    /* Keep only bit 0 of each element of the compare mask. */
    one = tcg_constant_vec_matching(t, vece, 1);
    tcg_gen_and_vec(vece, t, t, one);
}

/*
 * 32-bit scalar expansion for VADDCUW: t = (~a <u b).
 *
 * Same comparison as the vector expansion, but no mask is applied to the
 * setcond result. Note that a is overwritten with its bitwise complement.
 */
static void gen_VADDCUW_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
{
/* Complement a in place, then compare unsigned-less-than against b. */
tcg_gen_not_i32(a, a);
tcg_gen_setcond_i32(TCG_COND_LTU, t, a, b);
}

/*
 * Vector expansion for VSUBCUW: t = (a >=u b), reduced to 0/1 per element.
 *
 * The compare result is ANDed with a vector of 1s so that a "true"
 * element ends up as +1 instead of all bits set.
 */
static void gen_VSUBCUW_vec(unsigned vece, TCGv_vec t, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec one;

    tcg_gen_cmp_vec(TCG_COND_GEU, vece, t, a, b);

    /* Keep only bit 0 of each element of the compare mask. */
    one = tcg_constant_vec_matching(t, vece, 1);
    tcg_gen_and_vec(vece, t, t, one);
}

/*
 * 32-bit scalar expansion for VSUBCUW: t = (a >=u b).
 *
 * Same comparison as the vector expansion, but no mask is applied to the
 * setcond result.
 */
static void gen_VSUBCUW_i32(TCGv_i32 t, TCGv_i32 a, TCGv_i32 b)
{
tcg_gen_setcond_i32(TCG_COND_GEU, t, a, b);
}

/*
 * Shared translator for VSUBCUW and VADDCUW.
 *
 * @add selects the expander: 0 emits VSUBCUW, 1 emits VADDCUW.
 * Returns true once the gvec operation has been emitted; the REQUIRE_*
 * checks come first (presumably they return early on failure -- see the
 * macro definitions).
 */
static bool do_vx_vaddsubcuw(DisasContext *ctx, arg_VX *a, int add)
{
    /* Vector opcode list advertised by both expanders below. */
    static const TCGOpcode cmp_vecops[] = {
        INDEX_op_cmp_vec, 0
    };

    /* Indexed by @add. */
    static const GVecGen3 expanders[] = {
        [0] = {
            .fniv = gen_VSUBCUW_vec,
            .fni4 = gen_VSUBCUW_i32,
            .opt_opc = cmp_vecops,
            .vece = MO_32
        },
        [1] = {
            .fniv = gen_VADDCUW_vec,
            .fni4 = gen_VADDCUW_i32,
            .opt_opc = cmp_vecops,
            .vece = MO_32
        },
    };
    const GVecGen3 *gen;

    REQUIRE_INSNS_FLAGS(ctx, ALTIVEC);
    REQUIRE_VECTOR(ctx);

    gen = &expanders[add];

    /*
     * Operands are vrt (destination), vra and vrb. The two 16s are
     * presumably the operation and maximum sizes in bytes -- confirm
     * against tcg_gen_gvec_3.
     */
    tcg_gen_gvec_3(avr_full_offset(a->vrt), avr_full_offset(a->vra),
                   avr_full_offset(a->vrb), 16, 16, gen);

    return true;
}

/*
 * Bind both instructions to the shared translator. The last argument is
 * passed as its `add` selector: 0 picks the VSUBCUW expander, 1 the
 * VADDCUW one.
 */
TRANS(VSUBCUW, do_vx_vaddsubcuw, 0)
TRANS(VADDCUW, do_vx_vaddsubcuw, 1)

static bool do_vx_vmuleo(DisasContext *ctx, arg_VX *a, bool even,
void (*gen_mul)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
Expand Down
3 changes: 1 addition & 2 deletions target/ppc/translate/vmx-ops.c.inc
Expand Up @@ -106,12 +106,11 @@ GEN_VXFORM_300(vsrv, 2, 28),
GEN_VXFORM_300(vslv, 2, 29),
GEN_VXFORM(vslo, 6, 16),
GEN_VXFORM(vsro, 6, 17),
GEN_VXFORM(vaddcuw, 0, 6),
GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300),
GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300),
GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300),

GEN_VXFORM_DUAL(vsubcuw, xpnd04_1, 0, 22, PPC_ALTIVEC, PPC_NONE),
GEN_VXFORM(xpnd04_1, 0, 22),
GEN_VXFORM_300(bcdsr, 0, 23),
GEN_VXFORM_300(bcdsr, 0, 31),
GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),
Expand Down

0 comments on commit 611bc69

Please sign in to comment.