ExactSizeIterator seems to generate worse assembly if mutated before collected into Vec
#110734
Comments
The offending non-optimization is here: rust/library/alloc/src/vec/spec_from_iter.rs, lines 37 to 64 (at 7f94b31).

If I just remove the if-condition and unconditionally reuse the source allocation, I get the assembly below (a rough paraphrase of that condition follows the listing):

rust_test::exact_size:
push r15
push r14
push rbx
sub rsp, 64
mov edi, 12
mov esi, 4
call qword ptr [rip + __rust_alloc@GOTPCREL]
test rax, rax
je .LBB0_6
movabs rcx, 8589934593
mov qword ptr [rax], rcx
mov dword ptr [rax + 8], 3
mov rcx, rax
add rcx, 12
mov qword ptr [rsp + 8], 3
mov qword ptr [rsp + 16], rax
mov qword ptr [rsp + 24], rcx
mov qword ptr [rsp + 32], rax
lea rax, [rsp + 8]
#APP
#NO_APP
mov r15, qword ptr [rsp + 8]
mov rsi, qword ptr [rsp + 16]
mov rbx, qword ptr [rsp + 24]
mov r14, qword ptr [rsp + 32]
sub rbx, rsi
cmp rsi, r14
je .LBB0_3
mov rdi, r14
mov rdx, rbx
call qword ptr [rip + memmove@GOTPCREL]
.LBB0_3:
shr rbx, 2
mov qword ptr [rsp + 40], r15
mov qword ptr [rsp + 48], r14
mov qword ptr [rsp + 56], rbx
lea rax, [rsp + 40]
#APP
#NO_APP
mov rsi, qword ptr [rsp + 40]
test rsi, rsi
je .LBB0_5
mov rdi, qword ptr [rsp + 48]
shl rsi, 2
mov edx, 4
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB0_5:
add rsp, 64
pop rbx
pop r14
pop r15
ret
.LBB0_6:
mov edi, 12
mov esi, 4
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2

rust_test::var_size:
push rbx
sub rsp, 32
mov edi, 12
mov esi, 4
call qword ptr [rip + __rust_alloc@GOTPCREL]
test rax, rax
je .LBB1_12
movabs rcx, 8589934593
mov qword ptr [rax], rcx
mov dword ptr [rax + 8], 3
mov rcx, rax
add rcx, 12
mov qword ptr [rsp], 3
mov qword ptr [rsp + 8], rax
mov qword ptr [rsp + 16], rcx
mov qword ptr [rsp + 24], rax
mov rax, rsp
#APP
#NO_APP
mov rcx, qword ptr [rsp]
mov r8, qword ptr [rsp + 8]
mov rsi, qword ptr [rsp + 16]
mov rdx, qword ptr [rsp + 24]
mov rdi, rdx
cmp r8, rsi
je .LBB1_9
mov r10, rsi
sub r10, r8
add r10, -4
cmp r10, 28
jb .LBB1_3
mov rdi, rdx
sub rdi, r8
cmp rdi, 32
jb .LBB1_3
shr r10, 2
inc r10
mov r11, r10
and r11, -8
lea rdi, [rdx + 4*r11]
lea r9, [r8 + 4*r11]
xor ebx, ebx
.LBB1_6:
movups xmm0, xmmword ptr [r8 + 4*rbx]
movups xmm1, xmmword ptr [r8 + 4*rbx + 16]
movups xmmword ptr [rdx + 4*rbx], xmm0
movups xmmword ptr [rdx + 4*rbx + 16], xmm1
add rbx, 8
cmp r11, rbx
jne .LBB1_6
cmp r10, r11
jne .LBB1_8
jmp .LBB1_9
.LBB1_3:
mov rdi, rdx
mov r9, r8
.LBB1_8:
mov r8d, dword ptr [r9]
add r9, 4
mov dword ptr [rdi], r8d
add rdi, 4
cmp r9, rsi
jne .LBB1_8
.LBB1_9:
sub rdi, rdx
shr rdi, 2
mov qword ptr [rsp], rcx
mov qword ptr [rsp + 8], rdx
mov qword ptr [rsp + 16], rdi
#APP
#NO_APP
mov rsi, qword ptr [rsp]
test rsi, rsi
je .LBB1_11
mov rdi, qword ptr [rsp + 8]
shl rsi, 2
mov edx, 4
call qword ptr [rip + __rust_dealloc@GOTPCREL]
.LBB1_11:
add rsp, 32
pop rbx
ret
.LBB1_12:
mov edi, 12
mov esi, 4
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2
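Rough paraphrase of the if-condition mentioned above (approximate and reconstructed rather than copied from the library source; the exact threshold may differ):

// Sketch of the allocation-reuse heuristic in the vec::IntoIter
// specialization: reuse the source buffer only if the iterator was never
// advanced, or if the remaining elements still fill a large-enough fraction
// of the original capacity; otherwise build a fresh, tightly sized Vec.
fn should_reuse_source_alloc(has_advanced: bool, remaining_len: usize, cap: usize) -> bool {
    !has_advanced || remaining_len >= cap / 2
}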
We have many different code-paths for collecting into a Vec; see rust/library/alloc/src/vec/spec_from_iter.rs, lines 6 to 23 (at 7f94b31).

So differences in assembly output are to be expected, in particular when collecting from a vec::IntoIter. Have you encountered a measurable performance problem due to those differences?
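As a minimal illustration of why the source iterator's type matters (an editorial sketch; the path descriptions in the comments are simplified):

// Both functions build the same Vec, but the source iterator types differ,
// so FromIterator specialization picks different internal paths and the
// generated assembly differs even though the observable result is identical.
pub fn from_vec_into_iter(v: Vec<u32>) -> Vec<u32> {
    v.into_iter().collect() // vec::IntoIter: the source allocation can be reused
}

pub fn from_slice_iter(v: Vec<u32>) -> Vec<u32> {
    v.iter().copied().collect() // slice iterator: generic path, fresh allocation
}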
Thanks for pointing the way. It helps tremendously.
Not yet. I discovered this while prototyping some iterable abstractions. I think I can now simulate an unsafe workaround in my library code if it turns out to be really necessary. You can backlog or close this issue.
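One safe alternative for this pattern, sketched here as a hypothetical approach (not necessarily the workaround alluded to above), is to mutate the Vec in place so the original allocation is kept and no re-collect is needed:

// Process the first few elements separately, keep the rest in the original
// buffer: drain the prefix in place instead of going through into_iter()
// and collect().
fn keep_rest_in_place(mut v: Vec<u32>, prefix: usize) -> Vec<u32> {
    let prefix = prefix.min(v.len());
    for x in v.drain(..prefix) {
        // ... handle the leading elements differently here ...
        let _ = x;
    }
    v // remaining elements, still in the original allocation
}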
I've gone through the specialization logic and the generated assembly. It turns out to be a minor problem. Here is a summary:

Cause

The vec::IntoIter specialization in spec_from_iter.rs reuses the source allocation only when its if-condition holds; advancing (mutating) the iterator before collecting can make that condition fail, so the less optimized path is taken.
Impact

This turns out to be a very artificial issue and has minimal performance impact.
Optional Fix

We can equalize the source-vector-reuse optimization aggressiveness between the two specializations. They have the same space-efficiency concern of producing sparsely populated Vecs. The suggested fix is to unconditionally reuse the source allocation.

Conclusion

We can either close this issue due to its minimal impact or take the optional fix. @lukas-code @the8472
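As a minimal sketch of that space-efficiency concern, assuming the source allocation were reused unconditionally (the observed capacities depend on the std version):

// Advance the iterator past most elements, then collect the remainder.
// With unconditional reuse the result would keep the original capacity
// (len = 10, cap = 100, i.e. sparsely populated); the current heuristic
// instead builds a tightly sized Vec in this case.
fn main() {
    let v: Vec<u32> = (0..100).collect();
    let mut it = v.into_iter();
    for _ in 0..90 {
        it.next();
    }
    let rest: Vec<u32> = it.collect();
    println!("len = {}, cap = {}", rest.len(), rest.capacity());
}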
@rustbot label +A-codegen +I-heavy
Godbolt link https://godbolt.org/z/cMdx6v1G9
I expected to see this happen: exact_size generates better assembly than var_size.

Instead, this happened: var_size generates way shorter assembly (~90) than exact_size (>200), with zero calls to allocate a Vec; var_size is trying to allocate the Vec on the stack. This optimization did not happen for exact_size (excuse me if I misinterpreted the assembly).

The key to triggering this deoptimization seems to be mutating the iterator before collecting. This example is a reduction of real-world code where the first few elements are processed differently and the rest of the elements are collected into a Vec and then consumed locally.

Godbolt link for a more realistic example: https://godbolt.org/z/sccdTcvh6
Edit: Include a more realistic example
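A hypothetical reduction of that pattern (the function names and the filter adapter are illustrative assumptions; the actual code is in the Godbolt links):

// Process the first element separately, then collect the remainder.
use std::hint::black_box;

pub fn exact_size(v: Vec<u32>) -> Vec<u32> {
    let mut it = v.into_iter();   // vec::IntoIter is an ExactSizeIterator
    black_box(it.next());         // mutate the iterator before collecting
    it.collect()                  // remainder collected into a Vec
}

pub fn var_size(v: Vec<u32>) -> Vec<u32> {
    // A filter adapter is not an ExactSizeIterator, so a different
    // specialization applies even though the elements are the same.
    let mut it = v.into_iter().filter(|_| true);
    black_box(it.next());
    it.collect()
}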