Skip to content
This repository has been archived by the owner on May 7, 2024. It is now read-only.

Commit

Permalink
RISC-V: Implement movmemsi
Browse files Browse the repository at this point in the history
Without this we aren't getting proper memcpy inlining on RISC-V systems,
which is particularly disastrous for Dhrystone performance on RV32IM
systems.

gcc/ChangeLog

2017-11-07  Andrew Waterman  <andrew@sifive.com>

        * config/riscv/riscv-protos.h (riscv_hard_regno_nregs): New
        prototype.
        (riscv_expand_block_move): Likewise.
        * config/riscv/riscv.h (MOVE_RATIO): Tune cost to movmemsi
        implementation.
        (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER): New define.
        (RISCV_MAX_MOVE_BYTES_STRAIGHT): New define.
        * config/riscv/riscv.c (riscv_block_move_straight): New
        function.
        (riscv_adjust_block_mem): Likewise.
        (riscv_block_move_loop): Likewise.
        (riscv_expand_block_move): Likewise.
        * config/riscv/riscv.md (movmemsi): New pattern.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@254501 138bc75d-0d04-0410-961f-82ee72b054a4
  • Loading branch information
palmer authored and palmer-dabbelt committed Nov 7, 2017
1 parent 605bc7b commit b731149
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 3 deletions.
16 changes: 16 additions & 0 deletions gcc/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
2017-11-07 Andrew Waterman <andrew@sifive.com>

* config/riscv/riscv-protos.h (riscv_hard_regno_nregs): New
prototype.
(riscv_expand_block_move): Likewise.
* config/riscv/riscv.h (MOVE_RATIO): Tune cost to movmemsi
implementation.
(RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER): New define.
(RISCV_MAX_MOVE_BYTES_STRAIGHT): New define.
* config/riscv/riscv.c (riscv_block_move_straight): New
function.
(riscv_adjust_block_mem): Likewise.
(riscv_block_move_loop): Likewise.
(riscv_expand_block_move): Likewise.
* config/riscv/riscv.md (movmemsi): New pattern.

2017-11-07 Michael Clark <michaeljclark@mac.com>

* config/riscv/linux.h (MUSL_ABI_SUFFIX): New define.
Expand Down
1 change: 1 addition & 0 deletions gcc/config/riscv/riscv-protos.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ extern void riscv_expand_epilogue (bool);
extern bool riscv_can_use_return_insn (void);
extern rtx riscv_function_value (const_tree, const_tree, enum machine_mode);
extern unsigned int riscv_hard_regno_nregs (int, enum machine_mode);
extern bool riscv_expand_block_move (rtx, rtx, rtx);

/* Routines implemented in riscv-c.c. */
void riscv_cpu_cpp_builtins (cpp_reader *);
Expand Down
156 changes: 156 additions & 0 deletions gcc/config/riscv/riscv.c
Original file line number Diff line number Diff line change
Expand Up @@ -2695,6 +2695,162 @@ riscv_legitimize_call_address (rtx addr)
return addr;
}

/* Copy LENGTH bytes from SRC to DEST with an unrolled sequence of
   moves.  The two areas are assumed not to overlap.

   The copy is done in chunks whose width is the smaller of the two
   memory alignments, clamped to the range [BITS_PER_UNIT,
   BITS_PER_WORD].  Every chunk is first loaded into its own fresh
   pseudo register and only then are the stores emitted; whatever does
   not fill a whole chunk is handed to move_by_pieces.  */

static void
riscv_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
{
  unsigned HOST_WIDE_INT chunk_bits
    = MAX (BITS_PER_UNIT,
           MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));
  enum machine_mode chunk_mode = mode_for_size (chunk_bits, MODE_INT, 0);
  HOST_WIDE_INT chunk_bytes = chunk_bits / BITS_PER_UNIT;

  /* One temporary register per whole chunk.  */
  rtx *temps = XALLOCAVEC (rtx, length / chunk_bytes);

  HOST_WIDE_INT pos;
  int n;

  /* First pass: emit a plain move from SRC into a new register for
     each whole chunk.  */
  pos = 0;
  n = 0;
  while (pos + chunk_bytes <= length)
    {
      temps[n] = gen_reg_rtx (chunk_mode);
      riscv_emit_move (temps[n], adjust_address (src, chunk_mode, pos));
      pos += chunk_bytes;
      n++;
    }

  /* Second pass: store the registers to DEST in the same order.  */
  pos = 0;
  n = 0;
  while (pos + chunk_bytes <= length)
    {
      riscv_emit_move (adjust_address (dest, chunk_mode, pos), temps[n]);
      pos += chunk_bytes;
      n++;
    }

  /* POS is now the number of bytes covered by whole chunks.  */
  if (pos >= length)
    return;

  /* Copy the remaining tail, which is shorter than one chunk.  */
  src = adjust_address (src, BLKmode, pos);
  dest = adjust_address (dest, BLKmode, pos);
  move_by_pieces (dest, src, length - pos,
                  MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), 0);
}

/* Set up memory reference MEM for a loop-based block operation in
   which each iteration handles LENGTH bytes.

   The address of MEM is copied into a new register, and a BLKmode
   reference addressed through that register is created.  The register
   is returned in *LOOP_REG and the new reference in *LOOP_MEM.  */

static void
riscv_adjust_block_mem (rtx mem, HOST_WIDE_INT length,
                        rtx *loop_reg, rtx *loop_mem)
{
  rtx base_reg = copy_addr_to_reg (XEXP (mem, 0));
  rtx block_ref = change_address (mem, BLKmode, base_reg);

  /* The location of the new reference is no longer known, but it still
     has the alignment of MEM, capped at LENGTH bytes.  */
  set_mem_align (block_ref, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));

  *loop_reg = base_reg;
  *loop_mem = block_ref;
}

/* Copy LENGTH bytes from SRC to DEST with a loop whose body transfers
   BYTES_PER_ITER bytes.  LENGTH must be at least BYTES_PER_ITER, and
   the two memory regions are assumed not to overlap.  */

static void
riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
                       HOST_WIDE_INT bytes_per_iter)
{
  rtx src_ptr, dest_ptr;

  /* Split LENGTH into the part covered by whole iterations and the
     tail that is copied after the loop.  */
  HOST_WIDE_INT tail_bytes = length % bytes_per_iter;
  HOST_WIDE_INT looped_bytes = length - tail_bytes;

  /* From here on SRC and DEST are addressed through pointer registers
     that the loop advances.  */
  riscv_adjust_block_mem (src, bytes_per_iter, &src_ptr, &src);
  riscv_adjust_block_mem (dest, bytes_per_iter, &dest_ptr, &dest);

  /* The source pointer value at which the loop must stop.  */
  rtx src_end = expand_simple_binop (Pmode, PLUS, src_ptr,
                                     GEN_INT (looped_bytes),
                                     0, 0, OPTAB_WIDEN);

  /* Loop head.  */
  rtx loop_top = gen_label_rtx ();
  emit_label (loop_top);

  /* Loop body: one unrolled copy of BYTES_PER_ITER bytes.  */
  riscv_block_move_straight (dest, src, bytes_per_iter);

  /* Advance both pointers past the block just copied.  */
  riscv_emit_move (src_ptr, plus_constant (Pmode, src_ptr, bytes_per_iter));
  riscv_emit_move (dest_ptr, plus_constant (Pmode, dest_ptr, bytes_per_iter));

  /* Branch back while the source pointer has not reached SRC_END,
     using the compare-and-branch pattern that matches Pmode.  */
  rtx keep_going = gen_rtx_NE (VOIDmode, src_ptr, src_end);
  if (Pmode != DImode)
    emit_jump_insn (gen_cbranchsi4 (keep_going, src_ptr, src_end, loop_top));
  else
    emit_jump_insn (gen_cbranchdi4 (keep_going, src_ptr, src_end, loop_top));

  /* Without a tail, a single nop follows the loop.
     NOTE(review): the reason for the nop is not stated in this file.  */
  if (!tail_bytes)
    {
      emit_insn (gen_nop ());
      return;
    }

  /* Copy the tail through the already-advanced pointers.  */
  riscv_block_move_straight (dest, src, tail_bytes);
}

/* Expand a movmemsi instruction, which copies LENGTH bytes from
   memory reference SRC to memory reference DEST.

   Return true if inline code has been emitted for the copy, and false
   if nothing was emitted and the caller must use another strategy.
   Only a constant, non-negative LENGTH is expanded here.  */

bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
{
  if (CONST_INT_P (length))
    {
      HOST_WIDE_INT factor, align;
      HOST_WIDE_INT bytes = INTVAL (length);

      /* INTVAL is signed, so a byte count with its top bit set shows up
         here as a negative number.  Such a value would pass the
         straight-line size test below and reach
         riscv_block_move_straight, which sizes its register buffer
         from the length and emits no moves for a negative count, while
         this function would still report success.  Decline instead.  */
      if (bytes < 0)
        return false;

      /* ALIGN is the common alignment in bits, capped at a word.
         FACTOR is the number of ALIGN-sized accesses per word.  */
      align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
      factor = BITS_PER_WORD / align;

      /* When optimizing for size, give up on copies that exceed the
         size-oriented MOVE_RATIO.  */
      if (optimize_function_for_size_p (cfun)
          && bytes * factor * UNITS_PER_WORD > MOVE_RATIO (false))
        return false;

      if (bytes <= RISCV_MAX_MOVE_BYTES_STRAIGHT / factor)
        {
          /* Small enough to copy without a loop.  */
          riscv_block_move_straight (dest, src, bytes);
          return true;
        }
      else if (optimize && align >= BITS_PER_WORD)
        {
          /* A loop is only used for word-aligned operands.  */
          unsigned min_iter_words
            = RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
          unsigned iter_words = min_iter_words;
          HOST_WIDE_INT words = bytes / UNITS_PER_WORD;

          /* Lengthen the loop body if it shortens the tail.  The cost
             of a candidate is its body size in words plus the words
             left over after the loop; on a tie the larger body wins.  */
          for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
            {
              unsigned cur_cost = iter_words + words % iter_words;
              unsigned new_cost = i + words % i;
              if (new_cost <= cur_cost)
                iter_words = i;
            }

          riscv_block_move_loop (dest, src, bytes,
                                 iter_words * UNITS_PER_WORD);
          return true;
        }
    }
  return false;
}

/* Print symbolic operand OP, which is part of a HIGH or LO_SUM
in context CONTEXT. HI_RELOC indicates a high-part reloc. */

Expand Down
21 changes: 18 additions & 3 deletions gcc/config/riscv/riscv.h
Original file line number Diff line number Diff line change
Expand Up @@ -851,10 +851,25 @@ while (0)
#undef PTRDIFF_TYPE
#define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int")

/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a movmem or libcall instead. */
/* The maximum number of bytes copied by one iteration of a movmemsi loop.
   Defined as four words.  */

#define RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)

/* The maximum number of bytes that can be copied by a straight-line
   movmemsi implementation.  RISCV_MAX_MOVE_BYTES_STRAIGHT is defined as
   three times RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER.  */

#define MOVE_RATIO(speed) (CLEAR_RATIO (speed) / 2)
#define RISCV_MAX_MOVE_BYTES_STRAIGHT (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER * 3)

/* A memory-to-memory move that would need MOVE_RATIO or more simple
   move-instruction pairs is done with movmem or a libcall instead.
   When strict alignment is not in effect but unaligned accesses are
   slow on the target, the ratio is 1 so that move_by_pieces is not
   used at all; movmem or a libcall is more efficient in that case.
   Otherwise the ratio is the number of words in one movmemsi loop
   iteration when optimizing for speed, and half of CLEAR_RATIO when
   optimizing for size.  */

#define MOVE_RATIO(speed)                                       \
  ((!STRICT_ALIGNMENT && riscv_slow_unaligned_access)           \
   ? 1                                                          \
   : ((speed)                                                   \
      ? RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD     \
      : CLEAR_RATIO (speed) / 2))

/* For CLEAR_RATIO, when optimizing for size, give a better estimate
of the length of a memset call, but use the default otherwise. */
Expand Down
13 changes: 13 additions & 0 deletions gcc/config/riscv/riscv.md
Original file line number Diff line number Diff line change
Expand Up @@ -1435,6 +1435,19 @@
DONE;
})

;; Block move: copy from BLKmode operand 1 to BLKmode operand 0, with the
;; byte count in operand 2.  All of the work is delegated to
;; riscv_expand_block_move, which receives operands 0-2.  Operand 3 must
;; be a const_int but is not used by this expander.  The expansion FAILs
;; when riscv_expand_block_move returns false.
(define_expand "movmemsi"
[(parallel [(set (match_operand:BLK 0 "general_operand")
(match_operand:BLK 1 "general_operand"))
(use (match_operand:SI 2 ""))
(use (match_operand:SI 3 "const_int_operand"))])]
""
{
if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
DONE;
else
FAIL;
})

;; Expand in-line code to clear the instruction cache between operand[0] and
;; operand[1].
(define_expand "clear_cache"
Expand Down

0 comments on commit b731149

Please sign in to comment.