tcg/sparc: Support unaligned access for user-only
This is roughly the opposite of the other tcg hosts, where
we get (normal) alignment checks for free via host SIGBUS and
need to add code to support unaligned accesses.

This inline code expansion is somewhat large, but it takes quite
a few instructions to make a function call to a helper anyway.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
rth7680 committed Feb 8, 2022
1 parent e01d60f commit 321dbde
Showing 1 changed file with 211 additions and 8 deletions: tcg/sparc/tcg-target.c.inc
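
For orientation before reading the diff: the generated fast path tests alignment with andcc, takes an annulled branch whose delay slot performs the normal memory access, and falls through to a slow path only for a misaligned address. A rough C model of that runtime behavior (illustrative only, not QEMU code; assumes a big-endian host such as sparc, a 4-byte big-endian guest load, and only byte alignment guaranteed):

#include <stdint.h>
#include <string.h>

uint32_t model_be_ld4(const uint8_t *addr)
{
    uint32_t v;

    if (((uintptr_t)addr & 3) == 0) {
        /* Aligned: the single (atomic) load from the branch delay slot. */
        memcpy(&v, addr, 4);    /* host is big-endian in this model */
        return v;
    }
    /* Underaligned: load by pieces of the guaranteed alignment (bytes),
       shifting previous pieces up, as the SLLX/OR sequences below do. */
    v = 0;
    for (int i = 0; i < 4; i++) {
        v = (v << 8) | addr[i];
    }
    return v;
}
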
@@ -211,6 +211,7 @@ static const int tcg_target_call_oarg_regs[] = {
#define ARITH_ADD (INSN_OP(2) | INSN_OP3(0x00))
#define ARITH_ADDCC (INSN_OP(2) | INSN_OP3(0x10))
#define ARITH_AND (INSN_OP(2) | INSN_OP3(0x01))
#define ARITH_ANDCC (INSN_OP(2) | INSN_OP3(0x11))
#define ARITH_ANDN (INSN_OP(2) | INSN_OP3(0x05))
#define ARITH_OR (INSN_OP(2) | INSN_OP3(0x02))
#define ARITH_ORCC (INSN_OP(2) | INSN_OP3(0x12))
@@ -1025,6 +1026,38 @@ static void build_trampolines(TCGContext *s)
        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
    }
}
#else
static const tcg_insn_unit *qemu_unalign_ld_trampoline;
static const tcg_insn_unit *qemu_unalign_st_trampoline;

static void build_trampolines(TCGContext *s)
{
    for (int ld = 0; ld < 2; ++ld) {
        void *helper;

        while ((uintptr_t)s->code_ptr & 15) {
            tcg_out_nop(s);
        }

        if (ld) {
            helper = helper_unaligned_ld;
            qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr);
        } else {
            helper = helper_unaligned_st;
            qemu_unalign_st_trampoline = tcg_splitwx_to_rx(s->code_ptr);
        }

        if (!SPARC64 && TARGET_LONG_BITS == 64) {
            /* Install the high part of the address. */
            tcg_out_arithi(s, TCG_REG_O1, TCG_REG_O2, 32, SHIFT_SRLX);
        }

        /* Tail call. */
        tcg_out_jmpl_const(s, helper, true, true);
        /* delay slot -- set the env argument */
        tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
    }
}
#endif
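
These trampolines are shared slow-path stubs: each fixes up the address argument, then tail-calls the user-only SIGBUS helper with env in the first argument register. In C terms the pattern is roughly this (a sketch with hypothetical names throughout; only the shape matches QEMU):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Env Env;    /* stand-in for the real CPU state type */

static _Noreturn void unaligned_ld_helper(Env *env, uint64_t addr)
{
    (void)env;
    fprintf(stderr, "unaligned load at %#llx\n", (unsigned long long)addr);
    abort();               /* the real helper raises SIGBUS instead */
}

static _Noreturn void unalign_ld_trampoline(Env *env, uint64_t addr)
{
    unaligned_ld_helper(env, addr);    /* tail call; never returns */
}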

/* Generate global QEMU prologue and epilogue code */
@@ -1075,9 +1108,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
    /* delay slot */
    tcg_out_movi_imm13(s, TCG_REG_O0, 0);

    build_trampolines(s);
}

static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
@@ -1162,18 +1193,22 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
static const int qemu_ld_opc[(MO_SSIZE | MO_BSWAP) + 1] = {
    [MO_UB] = LDUB,
    [MO_SB] = LDSB,
    [MO_UB | MO_LE] = LDUB,
    [MO_SB | MO_LE] = LDSB,

    [MO_BEUW] = LDUH,
    [MO_BESW] = LDSH,
    [MO_BEUL] = LDUW,
    [MO_BESL] = LDSW,
    [MO_BEUQ] = LDX,
    [MO_BESQ] = LDX,

    [MO_LEUW] = LDUH_LE,
    [MO_LESW] = LDSH_LE,
    [MO_LEUL] = LDUW_LE,
    [MO_LESL] = LDSW_LE,
    [MO_LEUQ] = LDX_LE,
    [MO_LESQ] = LDX_LE,
};

static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = {
@@ -1192,11 +1227,12 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
                            MemOpIdx oi, bool is_64)
{
    MemOp memop = get_memop(oi);
    tcg_insn_unit *label_ptr;

#ifdef CONFIG_SOFTMMU
    unsigned memi = get_mmuidx(oi);
    TCGReg addrz, param;
    const tcg_insn_unit *func;

    addrz = tcg_out_tlb_load(s, addr, memi, memop,
                             offsetof(CPUTLBEntry, addr_read));
@@ -1260,25 +1296,112 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,

    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
#else
    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
    unsigned a_bits = get_alignment_bits(memop);
    unsigned s_bits = memop & MO_SIZE;
    unsigned t_bits;

    if (SPARC64 && TARGET_LONG_BITS == 32) {
        tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
        addr = TCG_REG_T1;
    }

    /*
     * Normal case: alignment equal to access size.
     */
    if (a_bits == s_bits) {
        tcg_out_ldst_rr(s, data, addr, index,
                        qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);
        return;
    }

    /*
     * Test for at least natural alignment, and assume most accesses
     * will be aligned -- perform a straight load in the delay slot.
     * This is required to preserve atomicity for aligned accesses.
     */
    t_bits = MAX(a_bits, s_bits);
    tcg_debug_assert(t_bits < 13);
    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);

    /* beq,a,pt %icc, label */
    label_ptr = s->code_ptr;
    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
    /* delay slot */
    tcg_out_ldst_rr(s, data, addr, index,
                    qemu_ld_opc[memop & (MO_BSWAP | MO_SSIZE)]);

    if (a_bits >= s_bits) {
        /*
         * Overalignment: A successful alignment test will perform the memory
         * operation in the delay slot, and failure need only invoke the
         * handler for SIGBUS.
         */
        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
        tcg_out_call_nodelay(s, qemu_unalign_ld_trampoline, false);
        /* delay slot -- move to low part of argument reg */
        tcg_out_mov_delay(s, arg_low, addr);
    } else {
        /* Underalignment: load by pieces of minimum alignment. */
        int ld_opc, a_size, s_size, i;

        /*
         * Force full address into T1 early; avoids problems with
         * overlap between @addr and @data.
         */
        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);

        a_size = 1 << a_bits;
        s_size = 1 << s_bits;
        if ((memop & MO_BSWAP) == MO_BE) {
            ld_opc = qemu_ld_opc[a_bits | MO_BE | (memop & MO_SIGN)];
            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
            ld_opc = qemu_ld_opc[a_bits | MO_BE];
            for (i = a_size; i < s_size; i += a_size) {
                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
                tcg_out_arithi(s, data, data, a_size * 8, SHIFT_SLLX);
                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
            }
        } else if (a_bits == 0) {
            ld_opc = LDUB;
            tcg_out_ldst(s, data, TCG_REG_T1, 0, ld_opc);
            for (i = a_size; i < s_size; i += a_size) {
                if ((memop & MO_SIGN) && i == s_size - a_size) {
                    ld_opc = LDSB;
                }
                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, ld_opc);
                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
            }
        } else {
            ld_opc = qemu_ld_opc[a_bits | MO_LE];
            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, ld_opc);
            for (i = a_size; i < s_size; i += a_size) {
                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
                if ((memop & MO_SIGN) && i == s_size - a_size) {
                    ld_opc = qemu_ld_opc[a_bits | MO_LE | MO_SIGN];
                }
                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, ld_opc);
                tcg_out_arithi(s, TCG_REG_T2, TCG_REG_T2, i * 8, SHIFT_SLLX);
                tcg_out_arith(s, data, data, TCG_REG_T2, ARITH_OR);
            }
        }
    }

    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
#endif /* CONFIG_SOFTMMU */
}
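
As a reading aid, here is an illustrative-only C model of the two "load by pieces" combining orders above, using 16-bit pieces (a_bits == 1) of a 64-bit access. Each piece is assumed already fetched in guest byte order, as the LDUH/LDUH_LE opcodes would do; note that the big-endian shift amount is the piece width in bits, a_size * 8:

#include <stdint.h>

uint64_t combine_be_pieces(const uint16_t p[4])
{
    uint64_t v = p[0];                      /* most significant piece first */
    for (int i = 1; i < 4; i++) {
        v = (v << 16) | p[i];               /* SLLX by a_size * 8, then OR */
    }
    return v;
}

uint64_t combine_le_pieces(const uint16_t p[4])
{
    uint64_t v = p[0];                      /* least significant piece first */
    for (int i = 1; i < 4; i++) {
        v |= (uint64_t)p[i] << (i * 16);    /* shift by byte offset * 8 */
    }
    return v;
}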

static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
                            MemOpIdx oi)
{
    MemOp memop = get_memop(oi);
    tcg_insn_unit *label_ptr;

#ifdef CONFIG_SOFTMMU
    unsigned memi = get_mmuidx(oi);
    TCGReg addrz, param;
    const tcg_insn_unit *func;

    addrz = tcg_out_tlb_load(s, addr, memi, memop,
                             offsetof(CPUTLBEntry, addr_write));
@@ -1315,13 +1438,93 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,

    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
#else
    TCGReg index = (guest_base ? TCG_GUEST_BASE_REG : TCG_REG_G0);
    unsigned a_bits = get_alignment_bits(memop);
    unsigned s_bits = memop & MO_SIZE;
    unsigned t_bits;

    if (SPARC64 && TARGET_LONG_BITS == 32) {
        tcg_out_arithi(s, TCG_REG_T1, addr, 0, SHIFT_SRL);
        addr = TCG_REG_T1;
    }

    /*
     * Normal case: alignment equal to access size.
     */
    if (a_bits == s_bits) {
        tcg_out_ldst_rr(s, data, addr, index,
                        qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);
        return;
    }

    /*
     * Test for at least natural alignment, and assume most accesses
     * will be aligned -- perform a straight store in the delay slot.
     * This is required to preserve atomicity for aligned accesses.
     */
    t_bits = MAX(a_bits, s_bits);
    tcg_debug_assert(t_bits < 13);
    tcg_out_arithi(s, TCG_REG_G0, addr, (1u << t_bits) - 1, ARITH_ANDCC);

    /* beq,a,pt %icc, label */
    label_ptr = s->code_ptr;
    tcg_out_bpcc0(s, COND_E, BPCC_A | BPCC_PT | BPCC_ICC, 0);
    /* delay slot */
    tcg_out_ldst_rr(s, data, addr, index,
                    qemu_st_opc[memop & (MO_BSWAP | MO_SIZE)]);

    if (a_bits >= s_bits) {
        /*
         * Overalignment: A successful alignment test will perform the memory
         * operation in the delay slot, and failure need only invoke the
         * handler for SIGBUS.
         */
        TCGReg arg_low = TCG_REG_O1 + (!SPARC64 && TARGET_LONG_BITS == 64);
        tcg_out_call_nodelay(s, qemu_unalign_st_trampoline, false);
        /* delay slot -- move to low part of argument reg */
        tcg_out_mov_delay(s, arg_low, addr);
    } else {
        /* Underalignment: store by pieces of minimum alignment. */
        int st_opc, a_size, s_size, i;

        /*
         * Force full address into T1 early; avoids problems with
         * overlap between @addr and @data.
         */
        tcg_out_arith(s, TCG_REG_T1, addr, index, ARITH_ADD);

        a_size = 1 << a_bits;
        s_size = 1 << s_bits;
        if ((memop & MO_BSWAP) == MO_BE) {
            st_opc = qemu_st_opc[a_bits | MO_BE];
            for (i = 0; i < s_size; i += a_size) {
                TCGReg d = data;
                int shift = (s_size - a_size - i) * 8;
                if (shift) {
                    d = TCG_REG_T2;
                    tcg_out_arithi(s, d, data, shift, SHIFT_SRLX);
                }
                tcg_out_ldst(s, d, TCG_REG_T1, i, st_opc);
            }
        } else if (a_bits == 0) {
            tcg_out_ldst(s, data, TCG_REG_T1, 0, STB);
            for (i = 1; i < s_size; i++) {
                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
                tcg_out_ldst(s, TCG_REG_T2, TCG_REG_T1, i, STB);
            }
        } else {
            /* Note that ST*A with immediate asi must use indexed address. */
            st_opc = qemu_st_opc[a_bits + MO_LE];
            tcg_out_ldst_rr(s, data, TCG_REG_T1, TCG_REG_G0, st_opc);
            for (i = a_size; i < s_size; i += a_size) {
                tcg_out_arithi(s, TCG_REG_T2, data, i * 8, SHIFT_SRLX);
                tcg_out_arithi(s, TCG_REG_T1, TCG_REG_T1, a_size, ARITH_ADD);
                tcg_out_ldst_rr(s, TCG_REG_T2, TCG_REG_T1, TCG_REG_G0, st_opc);
            }
        }
    }

    *label_ptr |= INSN_OFF19(tcg_ptr_byte_diff(s->code_ptr, label_ptr));
#endif /* CONFIG_SOFTMMU */
}
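
And a matching illustrative sketch of "store by pieces" for the big-endian case above: an 8-byte value written as four 16-bit pieces, mirroring the (s_size - a_size - i) * 8 shift amounts. This assumes a big-endian host such as sparc; i counts bytes, as in the emitted loop:

#include <stdint.h>

void store_be_pieces(uint16_t *p, uint64_t data)
{
    for (int i = 0; i < 8; i += 2) {
        int shift = (8 - 2 - i) * 8;            /* bits of data above this piece */
        p[i / 2] = (uint16_t)(data >> shift);   /* most significant piece first */
    }
}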

