Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SIMD intrinsics are never inlined in debug #129953

Closed
usamoi opened this issue Sep 4, 2024 · 2 comments
Closed

SIMD intrinsics are never inlined in debug #129953

usamoi opened this issue Sep 4, 2024 · 2 comments
Labels
C-discussion Category: Discussion or questions that doesn't represent real issues.

Comments

@usamoi
Copy link
Contributor

usamoi commented Sep 4, 2024

I tried this code:

#![feature(avx512_target_feature)]
#![feature(stdarch_x86_avx512)]

use std::arch::x86_64::*;

/// Minimal reproduction: applies the `_mm512_abs_epi16` intrinsic to `us` and
/// hands the result to `std::hint::black_box` so the computed value is used.
///
/// `#[no_mangle]` keeps the symbol name `inline_failed` unmangled, which makes
/// the function easy to locate in a disassembly.
///
/// # Safety
///
/// The function is compiled with the `avx512bw` target feature enabled, so it
/// must only be called on a CPU that supports that feature.
#[no_mangle]
#[target_feature(enable = "avx512bw")]
unsafe fn inline_failed(us: __m512i) {
    // SAFETY: relies on the caller upholding this function's `avx512bw`
    // requirement; the intrinsic itself only operates on the by-value `us`.
    unsafe {
        std::hint::black_box(_mm512_abs_epi16(us));
    }
}

/// Entry point of the reproduction: calls `inline_failed` once with an
/// all-zero `__m512i` as a dummy input vector.
fn main() {
    // NOTE(review): no runtime check for `avx512bw` support is performed before
    // this call; confirm the host CPU supports it when running the binary.
    // `std::mem::zeroed()` supplies the all-zero bit pattern for the argument.
    unsafe {
        inline_failed(std::mem::zeroed());
    }
}

I expected to see this happen: the intrinsic is inlined.

Instead, this happened: SIMD intrinsics are never inlined in debug builds, and rustc generates:

0000000000013bd0 <core::core_arch::x86::avx512bw::_mm512_abs_epi16>:
   13bd0:	55                   	push   %rbp
   13bd1:	48 89 e5             	mov    %rsp,%rbp
   13bd4:	48 83 e4 c0          	and    $0xffffffffffffffc0,%rsp
   13bd8:	48 81 ec 80 02 00 00 	sub    $0x280,%rsp
   13bdf:	48 89 7c 24 30       	mov    %rdi,0x30(%rsp)
   13be4:	48 89 7c 24 38       	mov    %rdi,0x38(%rsp)
   13be9:	62 f1 fd 48 6f 06    	vmovdqa64 (%rsi),%zmm0
   13bef:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x80(%rsp)
   13bf6:	02 
   13bf7:	48 8d 7c 24 40       	lea    0x40(%rsp),%rdi
   13bfc:	48 8d b4 24 80 00 00 	lea    0x80(%rsp),%rsi
   13c03:	00 
   13c04:	c5 f8 77             	vzeroupper
   13c07:	e8 b4 03 00 00       	call   13fc0 <core::core_arch::x86::m512iExt::as_i16x32>
   13c0c:	48 8b 7c 24 30       	mov    0x30(%rsp),%rdi
   13c11:	48 8b 44 24 38       	mov    0x38(%rsp),%rax
   13c16:	62 f1 fd 48 6f 4c 24 	vmovdqa64 0x40(%rsp),%zmm1
   13c1d:	01 
   13c1e:	62 f1 fd 48 7f 4c 24 	vmovdqa64 %zmm1,0xc0(%rsp)
   13c25:	03 
   13c26:	66 c7 84 24 6e 02 00 	movw   $0x0,0x26e(%rsp)
   13c2d:	00 00 00 
   13c30:	66 c7 84 24 6c 02 00 	movw   $0x0,0x26c(%rsp)
   13c37:	00 00 00 
   13c3a:	66 8b 8c 24 6c 02 00 	mov    0x26c(%rsp),%cx
   13c41:	00 
   13c42:	66 89 8c 24 6a 02 00 	mov    %cx,0x26a(%rsp)
   13c49:	00 
   13c4a:	62 f2 7d 48 79 84 24 	vpbroadcastw 0x26a(%rsp),%zmm0
   13c51:	6a 02 00 00 
   13c55:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x100(%rsp)
   13c5c:	04 
   13c5d:	62 f1 fd 48 6f 44 24 	vmovdqa64 0x100(%rsp),%zmm0
   13c64:	04 
   13c65:	62 f1 75 48 65 c0    	vpcmpgtw %zmm0,%zmm1,%k0
   13c6b:	62 f2 fe 48 28 c0    	vpmovm2w %k0,%zmm0
   13c71:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x140(%rsp)
   13c78:	05 
   13c79:	62 f1 fd 48 6f 54 24 	vmovdqa64 0x140(%rsp),%zmm2
   13c80:	05 
   13c81:	62 f1 fd 48 7f 54 24 	vmovdqa64 %zmm2,0x180(%rsp)
   13c88:	06 
   13c89:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
   13c8d:	62 f1 7d 48 f9 c1    	vpsubw %zmm1,%zmm0,%zmm0
   13c93:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x1c0(%rsp)
   13c9a:	07 
   13c9b:	62 f1 fd 48 6f 44 24 	vmovdqa64 0x1c0(%rsp),%zmm0
   13ca2:	07 
   13ca3:	62 f1 6d 48 71 f2 0f 	vpsllw $0xf,%zmm2,%zmm2
   13caa:	62 f2 fe 48 29 ca    	vpmovw2m %zmm2,%k1
   13cb0:	62 f1 ff 49 6f c1    	vmovdqu16 %zmm1,%zmm0{%k1}
   13cb6:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x200(%rsp)
   13cbd:	08 
   13cbe:	62 f1 fd 48 6f 44 24 	vmovdqa64 0x200(%rsp),%zmm0
   13cc5:	08 
   13cc6:	62 f1 fd 48 7f 07    	vmovdqa64 %zmm0,(%rdi)
   13ccc:	48 89 ec             	mov    %rbp,%rsp
   13ccf:	5d                   	pop    %rbp
   13cd0:	c5 f8 77             	vzeroupper
   13cd3:	c3                   	ret
   13cd4:	cc                   	int3
   13cd5:	cc                   	int3
   13cd6:	cc                   	int3
   13cd7:	cc                   	int3
   13cd8:	cc                   	int3
   13cd9:	cc                   	int3
   13cda:	cc                   	int3
   13cdb:	cc                   	int3
   13cdc:	cc                   	int3
   13cdd:	cc                   	int3
   13cde:	cc                   	int3
   13cdf:	cc                   	int3

0000000000013ce0 <inline_failed>:
   13ce0:	55                   	push   %rbp
   13ce1:	48 89 e5             	mov    %rsp,%rbp
   13ce4:	48 83 e4 c0          	and    $0xffffffffffffffc0,%rsp
   13ce8:	48 81 ec 40 01 00 00 	sub    $0x140,%rsp
   13cef:	62 f1 fd 48 6f 07    	vmovdqa64 (%rdi),%zmm0
   13cf5:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0x40(%rsp)
   13cfc:	01 
   13cfd:	48 89 e7             	mov    %rsp,%rdi
   13d00:	48 8d 74 24 40       	lea    0x40(%rsp),%rsi
   13d05:	c5 f8 77             	vzeroupper
   13d08:	e8 c3 fe ff ff       	call   13bd0 <core::core_arch::x86::avx512bw::_mm512_abs_epi16>
   13d0d:	62 f1 fd 48 6f 04 24 	vmovdqa64 (%rsp),%zmm0
   13d14:	62 f1 fd 48 7f 44 24 	vmovdqa64 %zmm0,0xc0(%rsp)
   13d1b:	03 
   13d1c:	48 8d bc 24 80 00 00 	lea    0x80(%rsp),%rdi
   13d23:	00 
   13d24:	48 8d b4 24 c0 00 00 	lea    0xc0(%rsp),%rsi
   13d2b:	00 
   13d2c:	c5 f8 77             	vzeroupper
   13d2f:	e8 5c 04 00 00       	call   14190 <core::hint::black_box>
   13d34:	48 89 ec             	mov    %rbp,%rsp
   13d37:	5d                   	pop    %rbp
   13d38:	c3                   	ret
   13d39:	cc                   	int3
   13d3a:	cc                   	int3
   13d3b:	cc                   	int3
   13d3c:	cc                   	int3
   13d3d:	cc                   	int3
   13d3e:	cc                   	int3
   13d3f:	cc                   	int3

Meta

rustc --version --verbose:

rustc 1.83.0-nightly (bd53aa3bf 2024-09-02)
binary: rustc
commit-hash: bd53aa3bf7a24a70d763182303bd75e5fc51a9af
commit-date: 2024-09-02
host: x86_64-unknown-linux-gnu
release: 1.83.0-nightly
LLVM version: 19.1.0

The behavior is the same as in Rust 1.53 (https://godbolt.org/z/v3qovqG3P). Is this the expected behavior?

@usamoi usamoi added the C-bug Category: This is a bug. label Sep 4, 2024
@rustbot rustbot added the needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. label Sep 4, 2024
@saethlin
Copy link
Member

saethlin commented Sep 4, 2024

Yes, it is expected that the compiler does not do any optimizations when that is what you ask for.

The "decent codegen" setting is opt-level=1. It's a much better experience when doing SIMD.

@saethlin saethlin added C-discussion Category: Discussion or questions that doesn't represent real issues. and removed C-bug Category: This is a bug. needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. labels Sep 4, 2024
@usamoi
Copy link
Contributor Author

usamoi commented Sep 5, 2024

I added opt-level=1 to profile.dev, and the intrinsics are now inlined correctly. I'm closing this issue since it works.

@usamoi usamoi closed this as completed Sep 5, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C-discussion Category: Discussion or questions that doesn't represent real issues.
Projects
None yet
Development

No branches or pull requests

3 participants