musl: add the option on x86_64 to use the super-fast AMD memcpy
If selected, we only put this into the dynamic lib, not the static one;
otherwise your statically linked binaries would all be GPL2-infected,
would only work on CPUs with AVX, and would also be several hundred
bytes bigger.
rofl0r committed Apr 8, 2022
1 parent e1632a8 commit 40c9a77
Showing 2 changed files with 379 additions and 0 deletions.
364 changes: 364 additions & 0 deletions KEEP/musl-improved-memcpy.patch
@@ -0,0 +1,364 @@
From 28a355ec36542d24a4a80de748dd9e1cf8ab350d Mon Sep 17 00:00:00 2001
From: rofl0r <rofl0r@users.noreply.github.com>
Date: Mon, 6 Sep 2021 18:13:56 +0000
Subject: [PATCH] add amd improved memcpy

/*
* Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

---
src/string/x86_64/memcpy.s | 288 +++++++++++++++++++++++++++++++++---
src/string/x86_64/memmove.s | 23 ++-
2 files changed, 285 insertions(+), 26 deletions(-)

diff --git a/src/string/x86_64/memcpy.s b/src/string/x86_64/memcpy.s
index 3d960efa..f61e7aad 100644
--- a/src/string/x86_64/memcpy.s
+++ b/src/string/x86_64/memcpy.s
@@ -1,25 +1,265 @@
-.global memcpy
-.global __memcpy_fwd
-.hidden __memcpy_fwd
-.type memcpy,@function
+ .text
+ .globl memcpy
+ .type memcpy, @function
memcpy:
-__memcpy_fwd:
- mov %rdi,%rax
- cmp $8,%rdx
- jc 1f
- test $7,%edi
- jz 1f
-2: movsb
- dec %rdx
- test $7,%edi
- jnz 2b
-1: mov %rdx,%rcx
- shr $3,%rcx
- rep
- movsq
- and $7,%edx
- jz 1f
-2: movsb
- dec %edx
- jnz 2b
-1: ret
+.LFB0:
+ pushq %rbp
+ movq %rsp, %rbp
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ movq %rdi, -40(%rbp)
+ movq %rsi, -48(%rbp)
+ movq %rdx, -56(%rbp)
+ movq -48(%rbp), %rbx
+ movq -40(%rbp), %r13
+ movq -56(%rbp), %r14
+#APP
+# 27 "amd.c" 1
+ movq %rbx, %rsi
+ movq %r13, %rdi
+ movq %r14, %rdx
+ movq %rdi, %rax
+ cmp $32, %rdx
+ jb less_vec
+ cmp $(32 * 2), %rdx
+ ja more_2x_vec
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ less_vec:
+ cmpb $32, %dl
+ jae between_32_63
+ cmpb $16, %dl
+ jae between_16_31
+ cmpb $8, %dl
+ jae between_8_15
+ cmpb $4, %dl
+ jae between_4_7
+ cmpb $1, %dl
+ ja between_2_3
+ jb 1f
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+ 1:
+ jmp .L2
+ between_32_63:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ between_16_31:
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi,%rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi,%rdx)
+ jmp .L2
+ between_8_15:
+ movq -8(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rsi, (%rdi)
+ jmp .L2
+ between_4_7:
+ movl -4(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi,%rdx)
+ movl %esi, (%rdi)
+ jmp .L2
+ between_2_3:
+ movzwl -2(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %cx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ jmp .L2
+ more_2x_vec:
+ cmpq $(32 * 8), %rdx
+ ja more_8x_vec
+ cmpq $(32 * 4), %rdx
+ jb last_4x_vec
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ vmovdqu -32(%rsi,%rdx), %ymm4
+ vmovdqu -(32 * 2)(%rsi,%rdx), %ymm5
+ vmovdqu -(32 * 3)(%rsi,%rdx), %ymm6
+ vmovdqu -(32 * 4)(%rsi,%rdx), %ymm7
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+ vmovdqu %ymm2, (32 * 2)(%rdi)
+ vmovdqu %ymm3, (32 * 3)(%rdi)
+ vmovdqu %ymm4, -32(%rdi,%rdx)
+ vmovdqu %ymm5, -(32 * 2)(%rdi,%rdx)
+ vmovdqu %ymm6, -(32 * 3)(%rdi,%rdx)
+ vmovdqu %ymm7, -(32 * 4)(%rdi,%rdx)
+ vzeroupper
+ jmp .L2
+ last_4x_vec:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu -32(%rsi,%rdx), %ymm2
+ vmovdqu -(32 * 2)(%rsi,%rdx), %ymm3
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 32(%rdi)
+ vmovdqu %ymm2, -32(%rdi,%rdx)
+ vmovdqu %ymm3, -(32 * 2)(%rdi,%rdx)
+ vzeroupper
+ nop:
+ jmp .L2
+ more_8x_vec:
+ cmpq %rsi, %rdi
+ ja more_8x_vec_backward
+ je nop
+ vmovdqu (%rsi), %ymm4
+ vmovdqu -32(%rsi, %rdx), %ymm5
+ vmovdqu -(32 * 2)(%rsi, %rdx), %ymm6
+ vmovdqu -(32 * 3)(%rsi, %rdx), %ymm7
+ vmovdqu -(32 * 4)(%rsi, %rdx), %ymm8
+ movq %rdi, %r11
+ leaq -32(%rdi, %rdx), %rcx
+ movq %rdi, %r8
+ andq $(32 - 1), %r8
+ subq $32, %r8
+ subq %r8, %rsi
+ subq %r8, %rdi
+ addq %r8, %rdx
+ cmpq $(1024*1024), %rdx
+ ja large_forward
+ loop_4x_vec_forward:
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ addq $(32 * 4), %rsi
+ subq $(32 * 4), %rdx
+ vmovdqa %ymm0, (%rdi)
+ vmovdqa %ymm1, 32(%rdi)
+ vmovdqa %ymm2, (32 * 2)(%rdi)
+ vmovdqa %ymm3, (32 * 3)(%rdi)
+ addq $(32 * 4), %rdi
+ cmpq $(32 * 4), %rdx
+ ja loop_4x_vec_forward
+ vmovdqu %ymm5, (%rcx)
+ vmovdqu %ymm6, -32(%rcx)
+ vmovdqu %ymm7, -(32 * 2)(%rcx)
+ vmovdqu %ymm8, -(32 * 3)(%rcx)
+ vmovdqu %ymm4, (%r11)
+ vzeroupper
+ jmp .L2
+ more_8x_vec_backward:
+ vmovdqu (%rsi), %ymm4
+ vmovdqu 32(%rsi), %ymm5
+ vmovdqu (32 * 2)(%rsi), %ymm6
+ vmovdqu (32 * 3)(%rsi), %ymm7
+ vmovdqu -32(%rsi,%rdx), %ymm8
+ leaq -32(%rdi, %rdx), %r11
+ leaq -32(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(32 - 1), %r8
+ subq %r8, %rcx
+ subq %r8, %r9
+ subq %r8, %rdx
+ cmpq $(1024*1024), %rdx
+ ja large_backward
+ loop_4x_vec_backward:
+ vmovdqu (%rcx), %ymm0
+ vmovdqu -32(%rcx), %ymm1
+ vmovdqu -(32 * 2)(%rcx), %ymm2
+ vmovdqu -(32 * 3)(%rcx), %ymm3
+ subq $(32 * 4), %rcx
+ subq $(32 * 4), %rdx
+ vmovdqa %ymm0, (%r9)
+ vmovdqa %ymm1, -32(%r9)
+ vmovdqa %ymm2, -(32 * 2)(%r9)
+ vmovdqa %ymm3, -(32 * 3)(%r9)
+ subq $(32 * 4), %r9
+ cmpq $(32 * 4), %rdx
+ ja loop_4x_vec_backward
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, 32(%rdi)
+ vmovdqu %ymm6, (32 * 2)(%rdi)
+ vmovdqu %ymm7, (32 * 3)(%rdi)
+ vmovdqu %ymm8, (%r11)
+ vzeroupper
+ jmp .L2
+ large_forward:
+ leaq (%rdi, %rdx), %r10
+ cmpq %r10, %rsi
+ jb loop_4x_vec_forward
+ loop_large_forward:
+ prefetcht0 (32*4*2)(%rsi)
+ prefetcht0 (32*4*2 + 64)(%rsi)
+ prefetcht0 (32*4*3)(%rsi)
+ prefetcht0 (32*4*3 + 64)(%rsi)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 32(%rsi), %ymm1
+ vmovdqu (32 * 2)(%rsi), %ymm2
+ vmovdqu (32 * 3)(%rsi), %ymm3
+ addq $(32*4), %rsi
+ subq $(32*4), %rdx
+ vmovntdq %ymm0, (%rdi)
+ vmovntdq %ymm1, 32(%rdi)
+ vmovntdq %ymm2, (32 * 2)(%rdi)
+ vmovntdq %ymm3, (32 * 3)(%rdi)
+ addq $(32*4), %rdi
+ cmpq $(32*4), %rdx
+ ja loop_large_forward
+ sfence
+ vmovdqu %ymm5, (%rcx)
+ vmovdqu %ymm6, -32(%rcx)
+ vmovdqu %ymm7, -(32 * 2)(%rcx)
+ vmovdqu %ymm8, -(32 * 3)(%rcx)
+ vmovdqu %ymm4, (%r11)
+ vzeroupper
+ jmp .L2
+ large_backward:
+ leaq (%rcx, %rdx), %r10
+ cmpq %r10, %r9
+ jb loop_4x_vec_backward
+ loop_large_backward:
+ prefetcht0 (-32 * 4 * 2)(%rcx)
+ prefetcht0 (-32 * 4 * 2 - 64)(%rcx)
+ prefetcht0 (-32 * 4 * 3)(%rcx)
+ prefetcht0 (-32 * 4 * 3 - 64)(%rcx)
+ vmovdqu (%rcx), %ymm0
+ vmovdqu -32(%rcx), %ymm1
+ vmovdqu -(32 * 2)(%rcx), %ymm2
+ vmovdqu -(32 * 3)(%rcx), %ymm3
+ subq $(32*4), %rcx
+ subq $(32*4), %rdx
+ vmovntdq %ymm0, (%r9)
+ vmovntdq %ymm1, -32(%r9)
+ vmovntdq %ymm2, -(32 * 2)(%r9)
+ vmovntdq %ymm3, -(32 * 3)(%r9)
+ subq $(32 * 4), %r9
+ cmpq $(32 * 4), %rdx
+ ja loop_large_backward
+ sfence
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu %ymm5, 32(%rdi)
+ vmovdqu %ymm6, (32 * 2)(%rdi)
+ vmovdqu %ymm7, (32 * 3)(%rdi)
+ vmovdqu %ymm8, (%r11)
+ vzeroupper
+ jmp .L2
+#NO_APP
+.L2:
+.L3:
+ movq -40(%rbp), %rax
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %rbp
+ ret
+.LFE0:
+ .size memcpy, .-memcpy
+ .section .note.GNU-stack,"",@progbits
diff --git a/src/string/x86_64/memmove.s b/src/string/x86_64/memmove.s
index 172c0252..24824612 100644
--- a/src/string/x86_64/memmove.s
+++ b/src/string/x86_64/memmove.s
@@ -4,8 +4,7 @@ memmove:
mov %rdi,%rax
sub %rsi,%rax
cmp %rdx,%rax
-.hidden __memcpy_fwd
- jae __memcpy_fwd
+ jae 3f
mov %rdx,%rcx
lea -1(%rdi,%rdx),%rdi
lea -1(%rsi,%rdx),%rsi
@@ -14,3 +13,23 @@ memmove:
cld
lea 1(%rdi),%rax
ret
+3: mov %rdi,%rax
+ cmp $8,%rdx
+ jc 1f
+ test $7,%edi
+ jz 1f
+2: movsb
+ dec %rdx
+ test $7,%edi
+ jnz 2b
+1: mov %rdx,%rcx
+ shr $3,%rcx
+ rep
+ movsq
+ and $7,%edx
+ jz 1f
+2: movsb
+ dec %edx
+ jnz 2b
+1: ret
+
--
2.31.1
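
For orientation: the assembly above is essentially a size-class dispatcher, and the memmove.s change simply inlines musl's original forward-copy loop, since the replacement memcpy no longer exports __memcpy_fwd. Below is a rough, hypothetical C sketch of the control flow the memcpy assembly implements, not the patched code itself; the function name, the NT_THRESHOLD constant and the simplified small-size ladder are made up for illustration, and the sketch assumes AVX (compile with e.g. -mavx).

/* hypothetical sketch of the dispatch above; dst and src must not overlap */
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

#define VEC 32                       /* one ymm register = 32 bytes */
#define NT_THRESHOLD (1024 * 1024)   /* non-temporal stores above ~1 MiB, as in the asm */

void *amd_memcpy_sketch(void *restrict dst, const void *restrict src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (n < 2 * VEC) {
		/* small copy: two overlapping unaligned vectors when possible
		   (the asm drops to 16/8/4/2/1-byte pairs; a byte loop keeps this short) */
		if (n >= VEC) {
			__m256i a = _mm256_loadu_si256((const __m256i *)s);
			__m256i b = _mm256_loadu_si256((const __m256i *)(s + n - VEC));
			_mm256_storeu_si256((__m256i *)d, a);
			_mm256_storeu_si256((__m256i *)(d + n - VEC), b);
		} else {
			for (size_t i = 0; i < n; i++) d[i] = s[i];
		}
		return dst;
	}
	if (n <= 8 * VEC) {
		/* medium copy: unaligned vectors from the front, one overlapping tail vector */
		size_t i = 0;
		for (; i + VEC < n; i += VEC)
			_mm256_storeu_si256((__m256i *)(d + i),
			                    _mm256_loadu_si256((const __m256i *)(s + i)));
		_mm256_storeu_si256((__m256i *)(d + n - VEC),
		                    _mm256_loadu_si256((const __m256i *)(s + n - VEC)));
		return dst;
	}
	/* large copy: one unaligned head vector, then align the destination and
	   copy 4 vectors per iteration; above NT_THRESHOLD use streaming stores
	   (vmovntdq) that bypass the cache, followed by sfence */
	int stream = n > NT_THRESHOLD;
	_mm256_storeu_si256((__m256i *)d, _mm256_loadu_si256((const __m256i *)s));
	size_t head = VEC - ((uintptr_t)d & (VEC - 1));
	d += head; s += head; n -= head;
	while (n > 4 * VEC) {
		__m256i v0 = _mm256_loadu_si256((const __m256i *)(s + 0 * VEC));
		__m256i v1 = _mm256_loadu_si256((const __m256i *)(s + 1 * VEC));
		__m256i v2 = _mm256_loadu_si256((const __m256i *)(s + 2 * VEC));
		__m256i v3 = _mm256_loadu_si256((const __m256i *)(s + 3 * VEC));
		if (stream) {
			_mm256_stream_si256((__m256i *)(d + 0 * VEC), v0);
			_mm256_stream_si256((__m256i *)(d + 1 * VEC), v1);
			_mm256_stream_si256((__m256i *)(d + 2 * VEC), v2);
			_mm256_stream_si256((__m256i *)(d + 3 * VEC), v3);
		} else {
			_mm256_store_si256((__m256i *)(d + 0 * VEC), v0);
			_mm256_store_si256((__m256i *)(d + 1 * VEC), v1);
			_mm256_store_si256((__m256i *)(d + 2 * VEC), v2);
			_mm256_store_si256((__m256i *)(d + 3 * VEC), v3);
		}
		s += 4 * VEC; d += 4 * VEC; n -= 4 * VEC;
	}
	if (stream) _mm_sfence();
	/* tail: back-anchored overlapping vectors cover the last bytes */
	while (n > 0) {
		_mm256_storeu_si256((__m256i *)(d + n - VEC),
		                    _mm256_loadu_si256((const __m256i *)(s + n - VEC)));
		n = n > VEC ? n - VEC : 0;
	}
	return dst;
}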

15 changes: 15 additions & 0 deletions pkg/musl
@@ -27,6 +27,21 @@ CFLAGS="$optcflags" \
[ ! -z "$DEBUGBUILD" ] && echo "CFLAGS_ALL_SHARED += -O0 -g" >> config.mak

make -j$MAKE_THREADS

if test "$A" = x86_64 && test "$option_amd_memcpy" = 1 ; then
# use AMD memcpy with AVX. this is the fastest memcpy impl I've seen for x86_64.
# use this option with care, because it has several drawbacks:
# it would make all your statically linked binaries GPL2+ instead of MIT,
# considerably bigger, and they wouldn't work on CPUs without AVX.
# therefore we only put this into libc.so, not libc.a: touching the existing
# objects and libc.a keeps the static archive unpatched, and the make below
# rebuilds only the shared-library objects from the patched sources.
# ref: https://github.com/rofl0r/memcpy-test/commit/fe1016aedcfd117aebc2193eba3184cc2023eac9
patch -p1 < "$K"/musl-improved-memcpy.patch
touch obj/src/string/x86_64/memcpy.o
touch obj/src/string/x86_64/memmove.o
touch lib/libc.a
make lib/libc.so
fi

make DESTDIR="$butch_install_dir" install

[ "$butch_prefix" = "/" ] && butch_prefix=

4 comments on commit 40c9a77

@firasuke

This is interesting, but why wasn't it proposed or accepted by upstream?

I know Rich has been trying to reduce ASM usage as much as possible, which could explain why it wasn't accepted.

@rofl0r (Member Author) commented on 40c9a77, May 5, 2022

This is GPL code, and therefore incompatible with musl's license.

@firasuke

I see. And how does this compare against cosmopolitan's memcpy/strcpy implementations?

@rofl0r (Member Author) commented on 40c9a77, May 5, 2022

It's a lot faster, but I don't know about strcpy. strcpy isn't really interesting to me anyway, since strings are usually very short, whereas memcpy gets used for all kinds of block sizes.
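
For a concrete sense of how such throughput comparisons are usually made: the commit references rofl0r's memcpy-test repository, which is the actual benchmark; the following is only a hypothetical minimal harness (cache-hot, single-threaded, no alignment sweep) that times memcpy over a range of block sizes and reports GB/s.

/* hypothetical throughput harness, not the memcpy-test code; build with e.g. gcc -O2 bench.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double now(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
	const size_t sizes[] = { 16, 64, 256, 4096, 65536, 1 << 20, 16 << 20 };
	size_t max = 16 << 20;
	char *src = malloc(max), *dst = malloc(max);
	if (!src || !dst) return 1;
	memset(src, 0x5a, max);

	for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		size_t n = sizes[i];
		size_t iters = (256u << 20) / n;   /* ~256 MiB of traffic per size */
		double t0 = now();
		for (size_t k = 0; k < iters; k++) {
			memcpy(dst, src, n);
			__asm__ volatile("" : : "r"(dst) : "memory"); /* keep the copy from being optimized away */
		}
		double dt = now() - t0;
		printf("%8zu bytes: %6.2f GB/s\n", n, (double)n * iters / dt / 1e9);
	}
	free(src);
	free(dst);
	return 0;
}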
