musl: add the option on x86_64 to use the super-fast AMD memcpy
If selected, we only put this into the dynamic lib, not the static one;
otherwise your statically linked binaries would all be GPL2-infected,
would only work on CPUs with AVX, and would also be several hundred
bytes bigger.
rofl0r committed Apr 8, 2022
1 parent e1632a8 commit 40c9a77
Showing 2 changed files with 379 additions and 0 deletions.
364 changes: 364 additions & 0 deletions KEEP/musl-improved-memcpy.patch
@@ -0,0 +1,364 @@
From 28a355ec36542d24a4a80de748dd9e1cf8ab350d Mon Sep 17 00:00:00 2001
From: rofl0r <rofl0r@users.noreply.github.com>
Date: Mon, 6 Sep 2021 18:13:56 +0000
Subject: [PATCH] add amd improved memcpy

/*
* Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

---
src/string/x86_64/memcpy.s | 288 +++++++++++++++++++++++++++++++++---
src/string/x86_64/memmove.s | 23 ++-
2 files changed, 285 insertions(+), 26 deletions(-)

diff --git a/src/string/x86_64/memcpy.s b/src/string/x86_64/memcpy.s
index 3d960efa..f61e7aad 100644
--- a/src/string/x86_64/memcpy.s
+++ b/src/string/x86_64/memcpy.s
@@ -1,25 +1,265 @@
-.global memcpy
-.global __memcpy_fwd
-.hidden __memcpy_fwd
-.type memcpy,@function
+ .text
+ .globl memcpy
+ .type memcpy, @function
memcpy:
-__memcpy_fwd:
- mov %rdi,%rax
- cmp $8,%rdx
- jc 1f
- test $7,%edi
- jz 1f
-2: movsb
- dec %rdx
- test $7,%edi
- jnz 2b
-1: mov %rdx,%rcx
- shr $3,%rcx
- rep
- movsq
- and $7,%edx
- jz 1f
-2: movsb
- dec %edx
- jnz 2b
-1: ret
+.LFB0:
+ pushq %rbp
+ movq %rsp, %rbp
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ movq %rdi, -40(%rbp)
+ movq %rsi, -48(%rbp)
+ movq %rdx, -56(%rbp)
+ movq -48(%rbp), %rbx
+ movq -40(%rbp), %r13
+ movq -56(%rbp), %r14
+#APP
+# 27 "amd.c" 1
+ movq %rbx, %rsi
+ movq %r13, %rdi
+ movq %r14, %rdx
+ movq %rdi, %rax
+ cmp $32, %rdx
+ jb less_vec
+ cmp $(32 * 2), %rdx
+ ja more_2x_vec
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ less_vec:
+ cmpb $32, %dl
+ jae between_32_63
+ cmpb $16, %dl
+ jae between_16_31
+ cmpb $8, %dl
+ jae between_8_15
+ cmpb $4, %dl
+ jae between_4_7
+ cmpb $1, %dl
+ ja between_2_3
+ jb 1f
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+ 1:
+ jmp .L2
+ between_32_63:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ between_16_31:
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi,%rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi,%rdx)
+ jmp .L2
+ between_8_15:
+ movq -8(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rsi, (%rdi)
+ jmp .L2
+ between_4_7:
+ movl -4(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi,%rdx)
+ movl %esi, (%rdi)
+ jmp .L2
+ between_2_3:
+ movzwl -2(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %cx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ jmp .L2
+ more_2x_vec:
+ cmpq $(32 * 8), %rdx
+ ja more_8x_vec
+ cmpq $(32 * 4), %rdx
+ jb last_4x_vec
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ vmovdqu -32(%rsi,%rdx), %ymm4
+ vmovdqu -(32 * 2)(%rsi,%rdx), %ymm5
+ vmovdqu -(32 * 3)(%rsi,%rdx), %ymm6
+ vmovdqu -(32 * 4)(%rsi,%rdx), %ymm7
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+ vmovdqu %ymm2, (32 * 2)(%rdi)
+ vmovdqu %ymm3, (32 * 3)(%rdi)
+ vmovdqu %ymm4, -32(%rdi,%rdx)
+ vmovdqu %ymm5, -(32 * 2)(%rdi,%rdx)
+ vmovdqu %ymm6, -(32 * 3)(%rdi,%rdx)
+ vmovdqu %ymm7, -(32 * 4)(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ last_4x_vec:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu -32(%rsi,%rdx), %ymm2
+ vmovdqu -(32 * 2)(%rsi,%rdx), %ymm3
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+ vmovdqu %ymm2, -32(%rdi,%rdx)
+ vmovdqu %ymm3, -(32 * 2)(%rdi,%rdx)
+ vzeroupper
+ nop:
+ jmp .L2
+ more_8x_vec:
+ cmpq %rsi, %rdi
+ ja more_8x_vec_backward
+ je nop
+ vmovdqu (%rsi), %ymm4
+ vmovdqu -32(%rsi, %rdx), %ymm5
+ vmovdqu -(32 * 2)(%rsi, %rdx), %ymm6
+ vmovdqu -(32 * 3)(%rsi, %rdx), %ymm7
+ vmovdqu -(32 * 4)(%rsi, %rdx), %ymm8
+ movq %rdi, %r11
+ leaq -32(%rdi, %rdx), %rcx
+ movq %rdi, %r8
+ andq $(32 - 1), %r8
+ subq $32, %r8
+ subq %r8, %rsi
+ subq %r8, %rdi
+ addq %r8, %rdx
+ cmpq $(1024*1024), %rdx
+ ja large_forward
+ loop_4x_vec_forward:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ addq $(32 * 4), %rsi
+ subq $(32 * 4), %rdx
+ vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymm1, 32(%rdi)
+ vmovdqa %ymm2, (32 * 2)(%rdi)
+ vmovdqa %ymm3, (32 * 3)(%rdi)
+ addq $(32 * 4), %rdi
+ cmpq $(32 * 4), %rdx
+ ja loop_4x_vec_forward
+ vmovdqu %ymm5, (%rcx)
+ vmovdqu %ymm6, -32(%rcx)
+ vmovdqu %ymm7, -(32 * 2)(%rcx)
+ vmovdqu %ymm8, -(32 * 3)(%rcx)
+ vmovdqu %ymm4, (%r11)
+ vzeroupper
+ jmp .L2
+ more_8x_vec_backward:
+ vmovdqu (%rsi), %ymm4
+ vmovdqu 32(%rsi), %ymm5
+ vmovdqu (32 * 2)(%rsi), %ymm6
+ vmovdqu (32 * 3)(%rsi), %ymm7
+ vmovdqu -32(%rsi,%rdx), %ymm8
+ leaq -32(%rdi, %rdx), %r11
+ leaq -32(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(32 - 1), %r8
+ subq %r8, %rcx
+ subq %r8, %r9
+ subq %r8, %rdx
+ cmpq $(1024*1024), %rdx
+ ja large_backward
+ loop_4x_vec_backward:
+ vmovdqu (%rcx), %ymm0
+ vmovdqu -32(%rcx), %ymm1
+ vmovdqu -(32 * 2)(%rcx), %ymm2
+ vmovdqu -(32 * 3)(%rcx), %ymm3
+ subq $(32 * 4), %rcx
+ subq $(32 * 4), %rdx
+ vmovdqa %ymm0, (%r9)
+ vmovdqa %ymm1, -32(%r9)
+ vmovdqa %ymm2, -(32 * 2)(%r9)
+ vmovdqa %ymm3, -(32 * 3)(%r9)
+ subq $(32 * 4), %r9
+ cmpq $(32 * 4), %rdx
+ ja loop_4x_vec_backward
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, 32(%rdi)
+ vmovdqu %ymm6, (32 * 2)(%rdi)
+ vmovdqu %ymm7, (32 * 3)(%rdi)
+ vmovdqu %ymm8, (%r11)
+ vzeroupper
+ jmp .L2
+ large_forward:
+ leaq (%rdi, %rdx), %r10
+ cmpq %r10, %rsi
+ jb loop_4x_vec_forward
+ loop_large_forward:
+ prefetcht0 (32*4*2)(%rsi)
+ prefetcht0 (32*4*2 + 64)(%rsi)
+ prefetcht0 (32*4*3)(%rsi)
+ prefetcht0 (32*4*3 + 64)(%rsi)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ addq $(32*4), %rsi
+ subq $(32*4), %rdx
+ vmovntdq %ymm0, (%rdi)
+ vmovntdq %ymm1, 32(%rdi)
+ vmovntdq %ymm2, (32 * 2)(%rdi)
+ vmovntdq %ymm3, (32 * 3)(%rdi)
+ addq $(32*4), %rdi
+ cmpq $(32*4), %rdx
+ ja loop_large_forward
+ sfence
+ vmovdqu %ymm5, (%rcx)
+ vmovdqu %ymm6, -32(%rcx)
+ vmovdqu %ymm7, -(32 * 2)(%rcx)
+ vmovdqu %ymm8, -(32 * 3)(%rcx)
+ vmovdqu %ymm4, (%r11)
+ vzeroupper
+ jmp .L2
+ large_backward:
+ leaq (%rcx, %rdx), %r10
+ cmpq %r10, %r9
+ jb loop_4x_vec_backward
+ loop_large_backward:
+ prefetcht0 (-32 * 4 * 2)(%rcx)
+ prefetcht0 (-32 * 4 * 2 - 64)(%rcx)
+ prefetcht0 (-32 * 4 * 3)(%rcx)
+ prefetcht0 (-32 * 4 * 3 - 64)(%rcx)
+ vmovdqu (%rcx), %ymm0
+ vmovdqu -32(%rcx), %ymm1
+ vmovdqu -(32 * 2)(%rcx), %ymm2
+ vmovdqu -(32 * 3)(%rcx), %ymm3
+ subq $(32*4), %rcx
+ subq $(32*4), %rdx
+ vmovntdq %ymm0, (%r9)
+ vmovntdq %ymm1, -32(%r9)
+ vmovntdq %ymm2, -(32 * 2)(%r9)
+ vmovntdq %ymm3, -(32 * 3)(%r9)
+ subq $(32 * 4), %r9
+ cmpq $(32 * 4), %rdx
+ ja loop_large_backward
+ sfence
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, 32(%rdi)
+ vmovdqu %ymm6, (32 * 2)(%rdi)
+ vmovdqu %ymm7, (32 * 3)(%rdi)
+ vmovdqu %ymm8, (%r11)
+ vzeroupper
+ jmp .L2
+#NO_APP
+.L2:
+.L3:
+ movq -40(%rbp), %rax
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %rbp
+ ret
+.LFE0:
+ .size memcpy, .-memcpy
+ .section .note.GNU-stack,"",@progbits
diff --git a/src/string/x86_64/memmove.s b/src/string/x86_64/memmove.s
index 172c0252..24824612 100644
--- a/src/string/x86_64/memmove.s
+++ b/src/string/x86_64/memmove.s
@@ -4,8 +4,7 @@ memmove:
mov %rdi,%rax
sub %rsi,%rax
cmp %rdx,%rax
-.hidden __memcpy_fwd
- jae __memcpy_fwd
+ jae 3f
mov %rdx,%rcx
lea -1(%rdi,%rdx),%rdi
lea -1(%rsi,%rdx),%rsi
@@ -14,3 +13,23 @@ memmove:
cld
lea 1(%rdi),%rax
ret
+3: mov %rdi,%rax
+ cmp $8,%rdx
+ jc 1f
+ test $7,%edi
+ jz 1f
+2: movsb
+ dec %rdx
+ test $7,%edi
+ jnz 2b
+1: mov %rdx,%rcx
+ shr $3,%rcx
+ rep
+ movsq
+ and $7,%edx
+ jz 1f
+2: movsb
+ dec %edx
+ jnz 2b
+1: ret
+
--
2.31.1
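
For orientation: the assembly above is essentially a size-class dispatcher, and the memmove.s change simply inlines musl's original forward-copy loop, since the replacement memcpy no longer exports __memcpy_fwd. Below is a rough, hypothetical C sketch of the control flow the memcpy assembly implements, not the patched code itself; the function name, the NT_THRESHOLD constant and the simplified small-size ladder are made up for illustration, and the sketch assumes AVX (compile with e.g. -mavx).

/* hypothetical sketch of the dispatch above; dst and src must not overlap */
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

#define VEC 32                       /* one ymm register = 32 bytes */
#define NT_THRESHOLD (1024 * 1024)   /* non-temporal stores above ~1 MiB, as in the asm */

void *amd_memcpy_sketch(void *restrict dst, const void *restrict src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (n < 2 * VEC) {
		/* small copy: two overlapping unaligned vectors when possible
		   (the asm drops to 16/8/4/2/1-byte pairs; a byte loop keeps this short) */
		if (n >= VEC) {
			__m256i a = _mm256_loadu_si256((const __m256i *)s);
			__m256i b = _mm256_loadu_si256((const __m256i *)(s + n - VEC));
			_mm256_storeu_si256((__m256i *)d, a);
			_mm256_storeu_si256((__m256i *)(d + n - VEC), b);
		} else {
			for (size_t i = 0; i < n; i++) d[i] = s[i];
		}
		return dst;
	}
	if (n <= 8 * VEC) {
		/* medium copy: unaligned vectors from the front, one overlapping tail vector */
		size_t i = 0;
		for (; i + VEC < n; i += VEC)
			_mm256_storeu_si256((__m256i *)(d + i),
			                    _mm256_loadu_si256((const __m256i *)(s + i)));
		_mm256_storeu_si256((__m256i *)(d + n - VEC),
		                    _mm256_loadu_si256((const __m256i *)(s + n - VEC)));
		return dst;
	}
	/* large copy: one unaligned head vector, then align the destination and
	   copy 4 vectors per iteration; above NT_THRESHOLD use streaming stores
	   (vmovntdq) that bypass the cache, followed by sfence */
	int stream = n > NT_THRESHOLD;
	_mm256_storeu_si256((__m256i *)d, _mm256_loadu_si256((const __m256i *)s));
	size_t head = VEC - ((uintptr_t)d & (VEC - 1));
	d += head; s += head; n -= head;
	while (n > 4 * VEC) {
		__m256i v0 = _mm256_loadu_si256((const __m256i *)(s + 0 * VEC));
		__m256i v1 = _mm256_loadu_si256((const __m256i *)(s + 1 * VEC));
		__m256i v2 = _mm256_loadu_si256((const __m256i *)(s + 2 * VEC));
		__m256i v3 = _mm256_loadu_si256((const __m256i *)(s + 3 * VEC));
		if (stream) {
			_mm256_stream_si256((__m256i *)(d + 0 * VEC), v0);
			_mm256_stream_si256((__m256i *)(d + 1 * VEC), v1);
			_mm256_stream_si256((__m256i *)(d + 2 * VEC), v2);
			_mm256_stream_si256((__m256i *)(d + 3 * VEC), v3);
		} else {
			_mm256_store_si256((__m256i *)(d + 0 * VEC), v0);
			_mm256_store_si256((__m256i *)(d + 1 * VEC), v1);
			_mm256_store_si256((__m256i *)(d + 2 * VEC), v2);
			_mm256_store_si256((__m256i *)(d + 3 * VEC), v3);
		}
		s += 4 * VEC; d += 4 * VEC; n -= 4 * VEC;
	}
	if (stream) _mm_sfence();
	/* tail: back-anchored overlapping vectors cover the last bytes */
	while (n > 0) {
		_mm256_storeu_si256((__m256i *)(d + n - VEC),
		                    _mm256_loadu_si256((const __m256i *)(s + n - VEC)));
		n = n > VEC ? n - VEC : 0;
	}
	return dst;
}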

15 changes: 15 additions & 0 deletions pkg/musl
@@ -27,6 +27,21 @@ CFLAGS="$optcflags" \
[ ! -z "$DEBUGBUILD" ] && echo "CFLAGS_ALL_SHARED += -O0 -g" >> config.mak

make -j$MAKE_THREADS

if test "$A" = x86_64 && test "$option_amd_memcpy" = 1 ; then
# use AMD memcpy with AVX. this is the fastest memcpy impl I've seen for x86_64.
# use this option with care, because it has several drawbacks:
# it would make all your statically linked binaries GPL2+ instead of MIT,
# considerably bigger, and they wouldn't work on CPUs without AVX.
# therefore we only put this into libc.so, not libc.a: touching the existing
# objects and libc.a keeps the static archive unpatched, and the make below
# rebuilds only the shared-library objects from the patched sources.
# ref: https://github.com/rofl0r/memcpy-test/commit/fe1016aedcfd117aebc2193eba3184cc2023eac9
patch -p1 < "$K"/musl-improved-memcpy.patch
touch obj/src/string/x86_64/memcpy.o
touch obj/src/string/x86_64/memmove.o
touch lib/libc.a
make lib/libc.so
fi

make DESTDIR="$butch_install_dir" install

[ "$butch_prefix" = "/" ] && butch_prefix=

4 comments on commit 40c9a77

@firasuke

This is interesting, but why wasn't it proposed or accepted by upstream?

I know Rich has been trying to reduce ASM usage as much as possible, which could explain why it wasn't accepted.

@rofl0r (Member Author) commented on 40c9a77, May 5, 2022

This is GPL code, and therefore incompatible with musl's license.

@firasuke

I see. And how does this compare against cosmopolitan's memcpy/strcpy implementations?

@rofl0r (Member Author) commented on 40c9a77, May 5, 2022

It's a lot faster, but I don't know about strcpy. strcpy isn't really interesting to me anyway, since strings are usually very short, whereas memcpy gets used for all kinds of block sizes.
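
For a concrete sense of how such throughput comparisons are usually made: the commit references rofl0r's memcpy-test repository, which is the actual benchmark; the following is only a hypothetical minimal harness (cache-hot, single-threaded, no alignment sweep) that times memcpy over a range of block sizes and reports GB/s.

/* hypothetical throughput harness, not the memcpy-test code; build with e.g. gcc -O2 bench.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double now(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
	const size_t sizes[] = { 16, 64, 256, 4096, 65536, 1 << 20, 16 << 20 };
	size_t max = 16 << 20;
	char *src = malloc(max), *dst = malloc(max);
	if (!src || !dst) return 1;
	memset(src, 0x5a, max);

	for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		size_t n = sizes[i];
		size_t iters = (256u << 20) / n;   /* ~256 MiB of traffic per size */
		double t0 = now();
		for (size_t k = 0; k < iters; k++) {
			memcpy(dst, src, n);
			__asm__ volatile("" : : "r"(dst) : "memory"); /* keep the copy from being optimized away */
		}
		double dt = now() - t0;
		printf("%8zu bytes: %6.2f GB/s\n", n, (double)n * iters / dt / 1e9);
	}
	free(src);
	free(dst);
	return 0;
}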
