Commit

tcg/i386: Support 128-bit load/store with have_atomic16
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
rth7680 committed May 16, 2023
1 parent 8f04e29 commit caa2e59
Showing 2 changed files with 180 additions and 4 deletions.
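
In outline, this change lets the i386 backend emit a native 16-byte guest access when the host has been detected as providing 16-byte atomicity (have_atomic16): a single VMOVDQA when the address turns out to be 16-byte aligned, with a run-time fallback to a pair of 8-byte accesses otherwise. The following is only a schematic C model of that dispatch for the load side; the function and variable names are invented for illustration, since the patch itself emits x86 instructions directly in tcg-target.c.inc.

#include <stdint.h>
#include <string.h>
#include <immintrin.h>

/* Sketch only: models the code emitted by tcg_out_qemu_ld_direct for MO_128. */
static void ld_i128_sketch(const void *haddr, uint64_t *lo, uint64_t *hi)
{
    if (((uintptr_t)haddr & 15) == 0) {
        /* Aligned: one 16-byte access, atomic on hosts where
         * have_atomic16 is true (VMOVDQA in the generated code). */
        __m128i v = _mm_load_si128((const __m128i *)haddr);
        *lo = (uint64_t)_mm_cvtsi128_si64(v);      /* vmovq      */
        *hi = (uint64_t)_mm_extract_epi64(v, 1);   /* vpextrq $1 */
    } else {
        /* Unaligned: only per-8-byte atomicity is required here, so a
         * pair of 64-bit loads (MOVQ, or MOVBEQ for bswap) is enough. */
        memcpy(lo, haddr, 8);
        memcpy(hi, (const char *)haddr + 8, 8);
    }
}
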
181 changes: 178 additions & 3 deletions tcg/i386/tcg-target.c.inc
@@ -91,6 +91,8 @@ static const int tcg_target_reg_alloc_order[] = {
#endif
};

#define TCG_TMP_VEC TCG_REG_XMM5

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
@@ -347,6 +349,8 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
@@ -1783,7 +1787,21 @@ typedef struct {

bool tcg_target_has_memory_bswap(MemOp memop)
{
return have_movbe;
TCGAtomAlign aa;

if (!have_movbe) {
return false;
}
if ((memop & MO_SIZE) <= MO_64) {
return true;
}

/*
* Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
* but do allow a pair of 64-bit operations, i.e. MOVBEQ.
*/
aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
return aa.atom <= MO_64;
}

/*
@@ -1811,6 +1829,30 @@ static const TCGLdstHelperParam ldst_helper_param = {
static const TCGLdstHelperParam ldst_helper_param = { };
#endif

static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
TCGReg l, TCGReg h, TCGReg v)
{
int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

/* vpmov{d,q} %v, %l */
tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
/* vpextr{d,q} $1, %v, %h */
tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
tcg_out8(s, 1);
}

static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
TCGReg v, TCGReg l, TCGReg h)
{
int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;

/* vmov{d,q} %l, %v */
tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
/* vpinsr{d,q} $1, %h, %v, %v */
tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
tcg_out8(s, 1);
}

/*
* Generate code for the slow path for a load at the end of block
*/
@@ -1900,6 +1942,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp s_bits = opc & MO_SIZE;
unsigned a_mask;

#ifdef CONFIG_SOFTMMU
@@ -1910,7 +1953,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
*h = x86_guest_base;
#endif
h->base = addrlo;
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false);
h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
a_mask = (1 << h->aa.align) - 1;

#ifdef CONFIG_SOFTMMU
@@ -1920,7 +1963,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGType tlbtype = TCG_TYPE_I32;
int trexw = 0, hrexw = 0, tlbrexw = 0;
unsigned mem_index = get_mmuidx(oi);
unsigned s_bits = opc & MO_SIZE;
unsigned s_mask = (1 << s_bits) - 1;
target_ulong tlb_mask;

@@ -2115,6 +2157,69 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;

case MO_128:
{
TCGLabel *l1 = NULL, *l2 = NULL;
bool use_pair = h.aa.atom < MO_128;

tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

if (!use_pair) {
tcg_debug_assert(!use_movbe);
/*
* Atomicity requires that we use VMOVDQA.
* If we've already checked for 16-byte alignment, that's all
* we need. If we arrive here with lesser alignment, then we
* have determined that less than 16-byte alignment can be
* satisfied with two 8-byte loads.
*/
if (h.aa.align < MO_128) {
use_pair = true;
l1 = gen_new_label();
l2 = gen_new_label();

tcg_out_testi(s, h.base, 15);
tcg_out_jxx(s, JCC_JNE, l2, true);
}

tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);
tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo,
datahi, TCG_TMP_VEC);

if (use_pair) {
tcg_out_jxx(s, JCC_JMP, l1, true);
tcg_out_label(s, l2);
}
}
if (use_pair) {
if (use_movbe) {
TCGReg t = datalo;
datalo = datahi;
datahi = t;
}
if (h.base == datalo || h.index == datalo) {
tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
datalo, datahi, 0);
tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
datahi, datahi, 8);
} else {
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
h.base, h.index, 0, h.ofs + 8);
}
}
if (l1) {
tcg_out_label(s, l1);
}
}
break;

default:
g_assert_not_reached();
}
@@ -2200,6 +2305,60 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
h.base, h.index, 0, h.ofs + 4);
}
break;

case MO_128:
{
TCGLabel *l1 = NULL, *l2 = NULL;
bool use_pair = h.aa.atom < MO_128;

tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

if (!use_pair) {
tcg_debug_assert(!use_movbe);
/*
* Atomicity requires that we use VMOVDQA.
* If we've already checked for 16-byte alignment, that's all
* we need. If we arrive here with lesser alignment, then we
* have determined that less than 16-byte alignment can be
* satisfied with two 8-byte stores.
*/
if (h.aa.align < MO_128) {
use_pair = true;
l1 = gen_new_label();
l2 = gen_new_label();

tcg_out_testi(s, h.base, 15);
tcg_out_jxx(s, JCC_JNE, l2, true);
}

tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC,
datalo, datahi);
tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
TCG_TMP_VEC, 0,
h.base, h.index, 0, h.ofs);

if (use_pair) {
tcg_out_jxx(s, JCC_JMP, l1, true);
tcg_out_label(s, l2);
}
}
if (use_pair) {
if (use_movbe) {
TCGReg t = datalo;
datalo = datahi;
datahi = t;
}
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
h.base, h.index, 0, h.ofs);
tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
h.base, h.index, 0, h.ofs + 8);
}
if (l1) {
tcg_out_label(s, l1);
}
}
break;

default:
g_assert_not_reached();
}
@@ -2523,6 +2682,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_ld_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
break;
case INDEX_op_qemu_st_i32:
case INDEX_op_qemu_st8_i32:
if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
@@ -2540,6 +2703,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
}
break;
case INDEX_op_qemu_st_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
break;

OP_32_64(mulu2):
tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
@@ -3234,6 +3401,13 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
: TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
: C_O0_I4(L, L, L, L));

case INDEX_op_qemu_ld_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
return C_O2_I1(r, r, L);
case INDEX_op_qemu_st_i128:
tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
return C_O0_I3(L, L, L);

case INDEX_op_brcond2_i32:
return C_O0_I4(r, r, ri, ri);

@@ -4090,6 +4264,7 @@ static void tcg_target_init(TCGContext *s)

s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
#ifdef _WIN64
/* These are call saved, and we don't save them, so don't use them. */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
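
A detail worth calling out in both MO_128 pair paths above is the datalo/datahi swap under use_movbe: there is no byte-swapping 16-byte instruction, so tcg_target_has_memory_bswap only accepts a 16-byte memop when MO_64 atomicity suffices, and the access is then split into two MOVBEQ halves with the halves exchanged. A hedged C sketch of the load side (the helper name is invented for illustration):

#include <stdint.h>
#include <string.h>

/* Sketch only: big-endian 16-byte load as two byte-swapped 8-byte halves. */
static void ld_be_i128_pair_sketch(const void *haddr, uint64_t *lo, uint64_t *hi)
{
    uint64_t first, second;
    memcpy(&first, haddr, 8);                     /* bytes 0..7  */
    memcpy(&second, (const char *)haddr + 8, 8);  /* bytes 8..15 */
    /* In a big-endian 128-bit value the first eight bytes are the high
     * half, hence the register swap when use_movbe is set. */
    *hi = __builtin_bswap64(first);               /* movbeq (%base), %hi  */
    *lo = __builtin_bswap64(second);              /* movbeq 8(%base), %lo */
}
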
3 changes: 2 additions & 1 deletion tcg/i386/tcg-target.h
@@ -194,7 +194,8 @@ extern bool have_atomic16
#define TCG_TARGET_HAS_qemu_st8_i32 1
#endif

#define TCG_TARGET_HAS_qemu_ldst_i128 0
#define TCG_TARGET_HAS_qemu_ldst_i128 \
(TCG_TARGET_REG_BITS == 64 && have_atomic16)

/* We do not support older SSE systems, only beginning with AVX1. */
#define TCG_TARGET_HAS_v64 have_avx1
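
For completeness, the aligned store side is the mirror image of the load sketch above: tcg_out_pair_to_vec assembles the reserved XMM temporary (TCG_TMP_VEC, which tcg_target_init now removes from the allocator) from the low/high register pair before one VMOVDQA store. A rough intrinsics equivalent, with an invented function name:

#include <stdint.h>
#include <immintrin.h>

/* Sketch only: pair -> vector -> aligned 16-byte store. */
static void st_i128_aligned_sketch(void *haddr, uint64_t lo, uint64_t hi)
{
    __m128i v = _mm_cvtsi64_si128((long long)lo);   /* vmovq   %lo, %xmm     */
    v = _mm_insert_epi64(v, (long long)hi, 1);      /* vpinsrq $1, %hi, %xmm */
    _mm_store_si128((__m128i *)haddr, v);           /* vmovdqa %xmm, (haddr) */
}
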
