From 5eb4f645eba8a79ea643b228c74a79183d436c97 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 14 Sep 2015 11:34:54 +0100 Subject: [PATCH 1/3] tcg/mips: Fix clobbering of qemu_ld inputs The MIPS TCG backend implements qemu_ld with 64-bit targets using the v0 register (base) as a temporary to load the upper half of the QEMU TLB comparator (see line 5 below), however this happens before the input address is used (line 8 to mask off the low bits for the TLB comparison, and line 12 to add the host-guest offset). If the input address (addrl) also happens to have been placed in v0 (as in the second column below), it gets clobbered before it is used. addrl in t2 addrl in v0 1 srl a0,t2,0x7 srl a0,v0,0x7 2 andi a0,a0,0x1fe0 andi a0,a0,0x1fe0 3 addu a0,a0,s0 addu a0,a0,s0 4 lw at,9136(a0) lw at,9136(a0) set TCG_TMP0 (at) 5 lw v0,9140(a0) lw v0,9140(a0) set base (v0) 6 li t9,-4093 li t9,-4093 7 lw a0,9160(a0) lw a0,9160(a0) set addend (a0) 8 and t9,t9,t2 and t9,t9,v0 use addrl 9 bne at,t9,0x836d8c8 bne at,t9,0x836d838 use TCG_TMP0 10 nop nop 11 bne v0,t8,0x836d8c8 bne v0,a1,0x836d838 use base 12 addu v0,a0,t2 addu v0,a0,v0 use addrl, addend 13 lw t0,0(v0) lw t0,0(v0) Fix by using TCG_TMP0 (at) as the temporary instead of v0 (base), pushing the load on line 5 forward into the delay slot of the low comparison (line 10). The early load of the addend on line 7 also needs pushing even further for 64-bit targets, or it will clobber a0 before we're done with it. The output for 32-bit targets is unaffected. srl a0,v0,0x7 andi a0,a0,0x1fe0 addu a0,a0,s0 lw at,9136(a0) -lw v0,9140(a0) load high comparator li t9,-4093 -lw a0,9160(a0) load addend and t9,t9,v0 bne at,t9,0x836d838 - nop + lw at,9140(a0) load high comparator +lw a0,9160(a0) load addend -bne v0,a1,0x836d838 +bne at,a1,0x836d838 addu v0,a0,v0 lw t0,0(v0) Cc: qemu-stable@nongnu.org Reviewed-by: Richard Henderson Reviewed-by: Aurelien Jarno Signed-off-by: James Hogan Signed-off-by: Aurelien Jarno --- tcg/mips/tcg-target.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c index c0ce520228a7..38c968220b9a 100644 --- a/tcg/mips/tcg-target.c +++ b/tcg/mips/tcg-target.c @@ -962,30 +962,34 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl, add_off -= 0x7ff0; } - /* Load the tlb comparator. */ - if (TARGET_LONG_BITS == 64) { - tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, cmp_off + LO_OFF); - tcg_out_opc_imm(s, OPC_LW, base, TCG_REG_A0, cmp_off + HI_OFF); - } else { - tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, cmp_off); - } + /* Load the (low half) tlb comparator. */ + tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, + cmp_off + (TARGET_LONG_BITS == 64 ? LO_OFF : 0)); /* Mask the page bits, keeping the alignment bits to compare against. - In between, load the tlb addend for the fast path. */ + In between on 32-bit targets, load the tlb addend for the fast path. */ tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, TARGET_PAGE_MASK | ((1 << s_bits) - 1)); - tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off); + if (TARGET_LONG_BITS == 32) { + tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off); + } tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl); label_ptr[0] = s->code_ptr; tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0); + /* Load and test the high half tlb comparator. */ if (TARGET_LONG_BITS == 64) { /* delay slot */ - tcg_out_nop(s); + tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0, cmp_off + HI_OFF); + + /* Load the tlb addend for the fast path. We can't do it earlier with + 64-bit targets or we'll clobber a0 before reading the high half tlb + comparator. */ + tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off); label_ptr[1] = s->code_ptr; - tcg_out_opc_br(s, OPC_BNE, addrh, base); + tcg_out_opc_br(s, OPC_BNE, addrh, TCG_TMP0); } /* delay slot */ From d9f26847f1429bdb8ccaa4e7bd5f8b57a9da0e8d Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Fri, 10 Jul 2015 22:04:48 +0200 Subject: [PATCH 2/3] tcg/mips: move tcg_out_addsub2 Somehow the tcg_out_addsub2 function ended-up in the middle of the qemu_ld/st related functions. Move it with other arithmetics related functions. Reviewed-by: Richard Henderson Signed-off-by: Aurelien Jarno --- tcg/mips/tcg-target.c | 98 +++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c index 38c968220b9a..4f1e0025dc24 100644 --- a/tcg/mips/tcg-target.c +++ b/tcg/mips/tcg-target.c @@ -567,6 +567,55 @@ static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val) } } +static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh, TCGReg al, + TCGReg ah, TCGArg bl, TCGArg bh, bool cbl, + bool cbh, bool is_sub) +{ + TCGReg th = TCG_TMP1; + + /* If we have a negative constant such that negating it would + make the high part zero, we can (usually) eliminate one insn. */ + if (cbl && cbh && bh == -1 && bl != 0) { + bl = -bl; + bh = 0; + is_sub = !is_sub; + } + + /* By operating on the high part first, we get to use the final + carry operation to move back from the temporary. */ + if (!cbh) { + tcg_out_opc_reg(s, (is_sub ? OPC_SUBU : OPC_ADDU), th, ah, bh); + } else if (bh != 0 || ah == rl) { + tcg_out_opc_imm(s, OPC_ADDIU, th, ah, (is_sub ? -bh : bh)); + } else { + th = ah; + } + + /* Note that tcg optimization should eliminate the bl == 0 case. */ + if (is_sub) { + if (cbl) { + tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, al, bl); + tcg_out_opc_imm(s, OPC_ADDIU, rl, al, -bl); + } else { + tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, al, bl); + tcg_out_opc_reg(s, OPC_SUBU, rl, al, bl); + } + tcg_out_opc_reg(s, OPC_SUBU, rh, th, TCG_TMP0); + } else { + if (cbl) { + tcg_out_opc_imm(s, OPC_ADDIU, rl, al, bl); + tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, rl, bl); + } else if (rl == al && rl == bl) { + tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, al, 31); + tcg_out_opc_reg(s, OPC_ADDU, rl, al, bl); + } else { + tcg_out_opc_reg(s, OPC_ADDU, rl, al, bl); + tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, rl, (rl == bl ? al : bl)); + } + tcg_out_opc_reg(s, OPC_ADDU, rh, th, TCG_TMP0); + } +} + /* Bit 0 set if inversion required; bit 1 set if swapping required. */ #define MIPS_CMP_INV 1 #define MIPS_CMP_SWAP 2 @@ -1237,55 +1286,6 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, } } -static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh, TCGReg al, - TCGReg ah, TCGArg bl, TCGArg bh, bool cbl, - bool cbh, bool is_sub) -{ - TCGReg th = TCG_TMP1; - - /* If we have a negative constant such that negating it would - make the high part zero, we can (usually) eliminate one insn. */ - if (cbl && cbh && bh == -1 && bl != 0) { - bl = -bl; - bh = 0; - is_sub = !is_sub; - } - - /* By operating on the high part first, we get to use the final - carry operation to move back from the temporary. */ - if (!cbh) { - tcg_out_opc_reg(s, (is_sub ? OPC_SUBU : OPC_ADDU), th, ah, bh); - } else if (bh != 0 || ah == rl) { - tcg_out_opc_imm(s, OPC_ADDIU, th, ah, (is_sub ? -bh : bh)); - } else { - th = ah; - } - - /* Note that tcg optimization should eliminate the bl == 0 case. */ - if (is_sub) { - if (cbl) { - tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, al, bl); - tcg_out_opc_imm(s, OPC_ADDIU, rl, al, -bl); - } else { - tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, al, bl); - tcg_out_opc_reg(s, OPC_SUBU, rl, al, bl); - } - tcg_out_opc_reg(s, OPC_SUBU, rh, th, TCG_TMP0); - } else { - if (cbl) { - tcg_out_opc_imm(s, OPC_ADDIU, rl, al, bl); - tcg_out_opc_imm(s, OPC_SLTIU, TCG_TMP0, rl, bl); - } else if (rl == al && rl == bl) { - tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, al, 31); - tcg_out_opc_reg(s, OPC_ADDU, rl, al, bl); - } else { - tcg_out_opc_reg(s, OPC_ADDU, rl, al, bl); - tcg_out_opc_reg(s, OPC_SLTU, TCG_TMP0, rl, (rl == bl ? al : bl)); - } - tcg_out_opc_reg(s, OPC_ADDU, rh, th, TCG_TMP0); - } -} - static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64) { TCGReg addr_regl, addr_regh __attribute__((unused)); From 81dfaf1a8f7f95259801da9732472f879023ef77 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Fri, 10 Jul 2015 22:10:02 +0200 Subject: [PATCH 3/3] tcg/mips: pass oi to tcg_out_tlb_load Instead of computing mem_index and s_bits in both tcg_out_qemu_ld and tcg_out_qemu_st function and passing them to tcg_out_tlb_load, directly pass oi to the tcg_out_tlb_load function and compute mem_index and s_bits there. Reviewed-by: Richard Henderson Signed-off-by: Aurelien Jarno --- tcg/mips/tcg-target.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c index 4f1e0025dc24..4305af967326 100644 --- a/tcg/mips/tcg-target.c +++ b/tcg/mips/tcg-target.c @@ -983,9 +983,11 @@ static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah) /* Perform the tlb comparison operation. The complete host address is placed in BASE. Clobbers AT, T0, A0. */ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl, - TCGReg addrh, int mem_index, TCGMemOp s_bits, + TCGReg addrh, TCGMemOpIdx oi, tcg_insn_unit *label_ptr[2], bool is_load) { + TCGMemOp s_bits = get_memop(oi) & MO_SIZE; + int mem_index = get_mmuidx(oi); int cmp_off = (is_load ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) @@ -1209,8 +1211,6 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64) TCGMemOp opc; #if defined(CONFIG_SOFTMMU) tcg_insn_unit *label_ptr[2]; - int mem_index; - TCGMemOp s_bits; #endif /* Note that we've eliminated V0 from the output registers, so we won't overwrite the base register during loading. */ @@ -1224,11 +1224,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64) opc = get_memop(oi); #if defined(CONFIG_SOFTMMU) - mem_index = get_mmuidx(oi); - s_bits = opc & MO_SIZE; - - tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index, - s_bits, label_ptr, 1); + tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 1); tcg_out_qemu_ld_direct(s, data_regl, data_regh, base, opc); add_qemu_ldst_label(s, 1, oi, data_regl, data_regh, addr_regl, addr_regh, s->code_ptr, label_ptr); @@ -1294,8 +1290,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64) TCGMemOp opc; #if defined(CONFIG_SOFTMMU) tcg_insn_unit *label_ptr[2]; - int mem_index; - TCGMemOp s_bits; #endif data_regl = *args++; @@ -1306,14 +1300,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64) opc = get_memop(oi); #if defined(CONFIG_SOFTMMU) - mem_index = get_mmuidx(oi); - s_bits = opc & 3; - /* Note that we eliminated the helper's address argument, so we can reuse that for the base. */ base = (TARGET_LONG_BITS == 32 ? TCG_REG_A1 : TCG_REG_A2); - tcg_out_tlb_load(s, base, addr_regl, addr_regh, mem_index, - s_bits, label_ptr, 0); + tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 0); tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc); add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh, s->code_ptr, label_ptr);