1.26.0 cannot optimize length of vector collected from a slice of known size #50925

Closed
tspiteri opened this issue May 20, 2018 · 7 comments
Labels
A-codegen Area: Code generation regression-from-stable-to-stable Performance or correctness regression from one stable version to another. T-libs-api Relevant to the library API team, which will review and decide on the PR/issue.

Comments

@tspiteri (Contributor)

The code:

pub fn foo() {
    let a: Vec<_> = [1, 2, 3].iter().collect();
    assert!(a.len() == 3);
}

With rustc 1.25.0:

example::foo:
        push    rbp
        mov     rbp, rsp
        pop     rbp
        ret

With rustc 1.26.0:

;...
example::foo:
        push    rbp
        mov     rbp, rsp
        push    rbx
        sub     rsp, 88
        mov     qword ptr [rbp - 56], 8
        xorps   xmm0, xmm0
        movups  xmmword ptr [rbp - 48], xmm0
        lea     rdx, [rbp - 32]
        mov     edi, 24
        mov     esi, 8
        call    __rust_alloc@PLT
        test    rax, rax
        je      .LBB6_6
        mov     qword ptr [rbp - 56], rax
        mov     qword ptr [rbp - 48], 3
        jmp     .LBB6_2
.LBB6_6:
        movups  xmm0, xmmword ptr [rbp - 24]
        movaps  xmmword ptr [rbp - 96], xmm0
        mov     qword ptr [rbp - 32], 0
        movaps  xmm0, xmmword ptr [rbp - 96]
        movups  xmmword ptr [rbp - 24], xmm0
        lea     rdi, [rbp - 80]
        lea     rsi, [rbp - 32]
        call    <core::heap::CollectionAllocErr as core::convert::From<core::heap::AllocErr>>::from@PLT
        mov     rax, qword ptr [rbp - 80]
        cmp     rax, 3
        jne     .LBB6_9
        mov     eax, 8
.LBB6_2:
        xor     ecx, ecx
        test    cl, cl
        jne     .LBB6_4
        lea     rcx, [rip + .Lbyte_str.c]
        mov     qword ptr [rax], rcx
        lea     rcx, [rip + .Lbyte_str.c+4]
        mov     qword ptr [rax + 8], rcx
        lea     rcx, [rip + .Lbyte_str.c+8]
        mov     qword ptr [rax + 16], rcx
        mov     ecx, 3
.LBB6_4:
        mov     qword ptr [rbp - 40], rcx
        movups  xmm0, xmmword ptr [rbp - 56]
        movaps  xmmword ptr [rbp - 32], xmm0
        mov     qword ptr [rbp - 16], rcx
        cmp     qword ptr [rbp - 16], 3
        jne     .LBB6_5
        mov     rsi, qword ptr [rbp - 24]
        test    rsi, rsi
        je      .LBB6_15
        shl     rsi, 3
        mov     rdi, qword ptr [rbp - 32]
        mov     edx, 8
        call    __rust_dealloc@PLT
.LBB6_15:
        add     rsp, 88
        pop     rbx
        pop     rbp
        ret
.LBB6_5:
        call    std::panicking::begin_panic
        jmp     .LBB6_11
.LBB6_9:
        cmp     rax, 2
        jne     .LBB6_12
        lea     rdi, [rip + .Lbyte_str.6]
        call    core::panicking::panic@PLT
.LBB6_11:
        ud2
.LBB6_12:
        mov     rcx, qword ptr [rbp - 72]
        mov     rdx, qword ptr [rbp - 64]
        mov     qword ptr [rbp - 32], rax
        mov     qword ptr [rbp - 24], rcx
        mov     qword ptr [rbp - 16], rdx
        lea     rdi, [rbp - 32]
        call    <alloc::heap::Heap as core::heap::Alloc>::oom
        ud2
        mov     rbx, rax
        lea     rdi, [rbp - 32]
        jmp     .LBB6_18
        mov     rbx, rax
        lea     rdi, [rbp - 56]
.LBB6_18:
        call    core::ptr::drop_in_place
        mov     rdi, rbx
        call    _Unwind_Resume@PLT
        ud2
;...
@ishitatsuyuki (Contributor)

Can you try on nightly?

@sfackler (Member)

Nightly doesn't elide the allocation either:

playground::main:
	pushq	%rbx
	subq	$64, %rsp
	movq	$8, 8(%rsp)
	pxor	%xmm0, %xmm0
	movdqu	%xmm0, 16(%rsp)
	movl	$24, %edi
	movl	$8, %esi
	callq	__rust_alloc@PLT
	testq	%rax, %rax
	je	.LBB10_5
	movq	%rax, 8(%rsp)
	movq	$3, 16(%rsp)
	xorl	%ecx, %ecx
	testb	%cl, %cl
	jne	.LBB10_3
	leaq	.Lbyte_str.a+4(%rip), %rcx
	movq	%rcx, %xmm0
	leaq	.Lbyte_str.a(%rip), %rcx
	movq	%rcx, %xmm1
	punpcklqdq	%xmm0, %xmm1
	movdqu	%xmm1, (%rax)
	leaq	.Lbyte_str.a+8(%rip), %rcx
	movq	%rcx, 16(%rax)
	movl	$3, %ecx

.LBB10_3:
	movq	%rcx, 24(%rsp)
	movdqu	8(%rsp), %xmm0
	movdqa	%xmm0, 32(%rsp)
	movq	%rcx, 48(%rsp)
	cmpq	$3, 48(%rsp)
	jne	.LBB10_4
	movq	40(%rsp), %rsi
	testq	%rsi, %rsi
	je	.LBB10_9
	movq	32(%rsp), %rdi
	shlq	$3, %rsi
	movl	$8, %edx
	callq	__rust_dealloc@PLT

.LBB10_9:
	addq	$64, %rsp
	popq	%rbx
	retq

.LBB10_5:
	callq	alloc::alloc::oom@PLT
	jmp	.LBB10_6

.LBB10_4:
	callq	std::panicking::begin_panic

.LBB10_6:
	ud2
	movq	%rax, %rbx
	leaq	32(%rsp), %rdi
	callq	core::ptr::drop_in_place
	movq	%rbx, %rdi
	callq	_Unwind_Resume@PLT
	ud2
	movq	%rax, %rbx
	leaq	8(%rsp), %rdi
	callq	core::ptr::drop_in_place
	movq	%rbx, %rdi
	callq	_Unwind_Resume@PLT
	ud2

@alexcrichton (Member)

If you compile with panic=abort, it looks like the regression goes away. Sure enough, looking at the IR I see:

; invoke alloc::alloc::oom
  invoke void @_ZN5alloc5alloc3oom17h12c4e4476d085e6dE()
    to label %.noexc10.i.i.i unwind label %bb1.i.i.i, !noalias !11

If I change that to call void @... then it optimizes the same.

I think we may need to add #[rustc_allocator_nounwind] to the oom function in libstd. After that, all we should need is a codegen test to ensure this doesn't regress again!
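A codegen test for this, in the style of rustc's FileCheck-based codegen suite, could look roughly like the sketch below. The function name and CHECK lines are illustrative only, not the exact test that eventually landed:

```rust
// compile-flags: -O

// CHECK-LABEL: @len_of_collected_slice
#[no_mangle]
pub fn len_of_collected_slice() -> usize {
    // Once oom is classified as nounwind, the Vec should fold away:
    // CHECK-NOT: __rust_alloc
    // CHECK: ret i64 3
    let a: Vec<_> = [1, 2, 3].iter().collect();
    a.len()
}
```

Independent of what the optimizer emits, the function itself is plain Rust and always returns 3, which is exactly the constant the CHECK line expects to see folded into the generated code.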

@alexcrichton alexcrichton added A-codegen Area: Code generation regression-from-stable-to-stable Performance or correctness regression from one stable version to another. labels May 21, 2018
@sfackler (Member)

Ironically, we want to move oom to be allowed to unwind... #50880 (comment)

@nikomatsakis nikomatsakis added the T-libs-api Relevant to the library API team, which will review and decide on the PR/issue. label May 24, 2018
@nikomatsakis (Contributor)

I'm marking this as T-libs since it seems like a "libs policy" decision as much as anything, no?

@sfackler (Member)

I don't really understand why oom unwinding would prevent LLVM from eliding allocations. It seems like we should be able to teach it to do the right thing here, right?

@alexcrichton (Member)

@sfackler in general, invoke instructions (i.e. unwinding) thwart a lot of LLVM optimizations: they introduce more data dependencies, which prevents optimizations from happening. For example, in the optimized IR as of today's nightly, the crucial aspect is that oom is called with the invoke instruction, providing a hook in case the oom function unwinds. This hook executes the destructor for the intermediate Vec on the stack.

I believe LLVM has pattern recognition to realize when code allocates, doesn't use the data, and then deallocates; in that situation LLVM will eliminate the allocation entirely. In the example above the allocation is never actually used, so LLVM should be able to optimize it out. With the invoke instruction, however, the destructor looks like it may touch the data (as the function call isn't inlined), so the allocation isn't optimized away.

With the invoke removed, there's no destructor, so LLVM has total knowledge that the memory is never read or used; everything is optimized away and the constant return value is put in place.

alexcrichton added a commit to alexcrichton/rust that referenced this issue May 24, 2018
OOM can't unwind today, and historically it's been optimized as if it can't
unwind. This accidentally regressed with recent changes to the OOM handler, so
this commit adds a codegen test to assert that everything gets optimized away
after the OOM function is appropriately classified as nounwind

Closes rust-lang#50925
bors added a commit that referenced this issue May 26, 2018
std: Ensure OOM is classified as `nounwind`

OOM can't unwind today, and historically it's been optimized as if it can't
unwind. This accidentally regressed with recent changes to the OOM handler, so
this commit adds a codegen test to assert that everything gets optimized away
after the OOM function is appropriately classified as nounwind

Closes #50925
5 participants