Big performance problem with closed intervals looping #45222

leonardo-m opened this Issue Oct 11, 2017 · 14 comments

leonardo-m commented Oct 11, 2017

In my code I've essentially stopped using loops over intervals closed on the right (the old ... syntax, recently rewritten as ..=) because they cause performance problems. This simple example shows the problem:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn foo1(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 .. n + 1).rev() {
            count += j;
        }
    }
    count
}

#[inline(never)]
#[no_mangle]
fn foo2(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            count += j;
        }
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();
    let what: u32 = std::env::args().nth(2).unwrap().parse().unwrap();

    match what {
        1 => println!("{}", foo1(n)),
        2 => println!("{}", foo2(n)),
        _ => panic!(),
    }
}

Compiled with the latest nightly:

rustc 1.22.0-nightly (d6d711dd8 2017-10-10)
binary: rustc
commit-hash: d6d711dd8f7ad5885294b8e1f0009a23dc1f8b1f
commit-date: 2017-10-10
host: x86_64-pc-windows-gnu
release: 1.22.0-nightly
LLVM version: 4.0

Compiled with:
rustc -O test.rs

Running it calling foo1 takes 0.02 seconds:

...>elaps test 100000 1
500005000000000

Running it calling foo2 takes about 13.65 seconds:


...>elaps test 100000 2
500005000000000

The asm I am seeing, using --emit asm:
foo1:
	testq	%rcx, %rcx
	je	.LBB5_1
	movq	%rcx, %r8
	imulq	%r8, %r8
	leaq	-1(%rcx), %rdx
	movq	%rcx, %rax
	mulq	%rdx
	shldq	$63, %rax, %rdx
	subq	%rdx, %r8
	cmpq	$3, %rcx
	jbe	.LBB5_3
	movq	%rcx, %rdx
	andq	$-4, %rdx
	je	.LBB5_3
	movd	%r8, %xmm0
	pshufd	$68, %xmm0, %xmm2
	leaq	-4(%rdx), %r9
	movl	%r9d, %eax
	shrl	$2, %eax
	incl	%eax
	andq	$3, %rax
	je	.LBB5_8
	cmpq	$-1, %rcx
	pxor	%xmm0, %xmm0
	pxor	%xmm3, %xmm3
	je	.LBB5_11
	movdqa	%xmm2, %xmm3
.LBB5_11:
	negq	%rax
	xorl	%r10d, %r10d
	pxor	%xmm1, %xmm1
	.p2align	4, 0x90
.LBB5_12:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$4, %r10
	incq	%rax
	jne	.LBB5_12
	jmp	.LBB5_13
.LBB5_3:
	xorl	%eax, %eax
	xorl	%edx, %edx
.LBB5_4:
	xorl	%r9d, %r9d
	cmpq	$-1, %rcx
	cmoveq	%r9, %r8
	.p2align	4, 0x90
.LBB5_5:
	incq	%rdx
	addq	%r8, %rax
	cmpq	%rcx, %rdx
	jb	.LBB5_5
.LBB5_19:
	retq
.LBB5_1:
	xorl	%eax, %eax
	retq
.LBB5_8:
	xorl	%r10d, %r10d
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
.LBB5_13:
	cmpq	$12, %r9
	jb	.LBB5_18
	cmpq	$-1, %rcx
	pxor	%xmm3, %xmm3
	je	.LBB5_16
	movdqa	%xmm2, %xmm3
.LBB5_16:
	movq	%rdx, %rax
	subq	%r10, %rax
	.p2align	4, 0x90
.LBB5_17:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$-16, %rax
	jne	.LBB5_17
.LBB5_18:
	paddq	%xmm1, %xmm0
	pshufd	$78, %xmm0, %xmm1
	paddq	%xmm0, %xmm1
	movd	%xmm1, %rax
	cmpq	%rcx, %rdx
	jne	.LBB5_4
	jmp	.LBB5_19



foo2:
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	testq	%rcx, %rcx
	je	.LBB6_1
	testb	$1, %cl
	jne	.LBB6_4
	xorl	%eax, %eax
	xorl	%r8d, %r8d
	cmpq	$1, %rcx
	jne	.LBB6_11
	jmp	.LBB6_23
.LBB6_1:
	xorl	%eax, %eax
	jmp	.LBB6_23
.LBB6_4:
	xorl	%r8d, %r8d
	movq	$-1, %r9
	xorl	%r10d, %r10d
	movq	%rcx, %r11
	xorl	%eax, %eax
	jmp	.LBB6_5
	.p2align	4, 0x90
.LBB6_8:
	addq	%r11, %rax
	movq	%rdi, %r10
	movq	%rdx, %r11
.LBB6_5:
	cmpq	%r11, %r10
	movl	$1, %esi
	cmovbq	%r9, %rsi
	cmoveq	%r8, %rsi
	testq	%rsi, %rsi
	movl	$1, %edi
	movl	$0, %edx
	je	.LBB6_8
	cmpq	$-1, %rsi
	jne	.LBB6_9
	leaq	-1(%r11), %rdx
	movq	%r10, %rdi
	jmp	.LBB6_8
.LBB6_9:
	movl	$1, %r8d
	cmpq	$1, %rcx
	je	.LBB6_23
.LBB6_11:
	xorl	%r9d, %r9d
	movq	$-1, %r10
	.p2align	4, 0x90
.LBB6_12:
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_13
	.p2align	4, 0x90
.LBB6_16:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_13:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_16
	cmpq	$-1, %rdi
	jne	.LBB6_17
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_16
	.p2align	4, 0x90
.LBB6_17:
	addq	$2, %r8
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_18
	.p2align	4, 0x90
.LBB6_21:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_18:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_21
	cmpq	$-1, %rdi
	jne	.LBB6_22
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_21
	.p2align	4, 0x90
.LBB6_22:
	cmpq	%rcx, %r8
	jb	.LBB6_12
.LBB6_23:
	popq	%rbx
	popq	%rdi
	popq	%rsi
	retq
ExpHP commented Oct 12, 2017

Performing the low-hanging fruit for minimization:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn triangle_exc(n: u64) -> u64 {
    let mut count = 0;
    for j in (0 .. n + 1) {
        count += j;
    }
    count
}

#[inline(never)]
#[no_mangle]
fn triangle_inc(n: u64) -> u64 {
    let mut count = 0;
    for j in 0 ..= n {
        count += j;
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();

    println!("{}", triangle_exc(n));
    println!("{}", triangle_inc(n));
}

Good:

//-----------------------------------------
       │     0000000000007300 <triangle_exc>:
       │     triangle_exc():
       │       inc    %rdi
       │     ↓ je     8c
       │       cmp    $0x3,%rdi
       │     ↓ jbe    7b
       │       mov    %rdi,%rcx
       │       and    $0xfffffffffffffffc,%rcx
       │     ↓ je     7b
       │       lea    -0x4(%rcx),%rax
       │       mov    %eax,%esi
       │       shr    $0x2,%esi
       │       inc    %esi
       │       and    $0x3,%rsi
       │     ↓ je     8f
       │       neg    %rsi
       │       mov    $0x1,%edx
       │       movq   %rdx,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       xor    %edx,%edx
       │       movdqa _fini+0x44,%xmm3
       │       movdqa _fini+0x54,%xmm4
       │       pxor   %xmm1,%xmm1
       │       data16 nopw %cs:0x0(%rax,%rax,1)
       │ 60:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
       │       paddq  %xmm3,%xmm1
       │       add    $0x4,%rdx
       │       paddq  %xmm4,%xmm0
       │       inc    %rsi
       │     ↑ jne    60
       │ 7b:   xor    %eax,%eax
       │       xor    %ecx,%ecx
       │       nop
       │ 80:   add    %rcx,%rax
       │       inc    %rcx
       │       cmp    %rdi,%rcx
       │     ↑ jb     80
       │ 8b: ← retq
       │ 8c:   xor    %eax,%eax
       │     ← retq
       │ 8f:   xor    %edx,%edx
       │       mov    $0x1,%esi
       │       movq   %rsi,%xmm0
       │       pslldq $0x8,%xmm0
       │       pxor   %xmm2,%xmm2
       │       pxor   %xmm1,%xmm1
       │ a8:   cmp    $0xc,%rax
       │       movdqa %xmm2,%xmm6
       │     ↓ jb     106
       │       mov    %rcx,%rax
       │       sub    %rdx,%rax
       │       movdqa _fini+0x64,%xmm3
       │       movdqa _fini+0x74,%xmm4
       │       movdqa _fini+0x84,%xmm5
       │ d0:   paddq  %xmm0,%xmm2
       │       paddq  %xmm0,%xmm1
 20.00 │       movdqa %xmm0,%xmm6
 10.00 │       paddq  %xmm6,%xmm6
       │       paddq  %xmm6,%xmm1
 10.00 │       paddq  %xmm2,%xmm6
 30.00 │       paddq  %xmm0,%xmm6
       │       paddq  %xmm0,%xmm1
 10.00 │       paddq  %xmm3,%xmm6
       │       paddq  %xmm4,%xmm1
       │       paddq  %xmm5,%xmm0
 20.00 │       movdqa %xmm6,%xmm2
       │     ↑ jne    d0
       │106:   paddq  %xmm1,%xmm6
       │       pshufd $0x4e,%xmm6,%xmm0
       │       paddq  %xmm6,%xmm0
       │       movq   %xmm0,%rax
       │       cmp    %rcx,%rdi
       │     ↑ jne    80
       │     ↑ jmpq   8b

Bad:

       │    0000000000007430 <triangle_inc>:
       │    triangle_inc():
       │      xor    %r8d,%r8d
       │      mov    $0xffffffffffffffff,%r9
       │      xor    %r10d,%r10d
       │      xor    %eax,%eax
       │    ↓ jmp    29
       │      data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
       │20:   add    %r10,%rax
  1.79 │      mov    %rdx,%rdi
  2.98 │      mov    %rsi,%r10
 17.86 │29:   cmp    %rdi,%r10
       │      mov    $0x1,%ecx
  7.14 │      cmovb  %r9,%rcx
 16.07 │      cmove  %r8,%rcx
  3.57 │      test   %rcx,%rcx
  1.79 │      mov    $0x0,%edx
  1.79 │      mov    $0x1,%esi
 14.29 │    ↑ je     20
       │      cmp    $0xffffffffffffffff,%rcx
       │    ↓ jne    57
  2.98 │      lea    0x1(%r10),%rsi
  3.57 │      mov    %rdi,%rdx
 26.19 │    ↑ jmp    20
       │57: ← retq

Lost some kind of fast lane?

arthurprs commented Oct 13, 2017

Apparently LLVM is way happier optimizing the open interval with SIMD.

kennytm commented Jan 28, 2018

Referring to the example in #45222 (comment), the first loop is recognized by LLVM's loop-vectorize pass, while the second isn't.

$ rustc +nightly --crate-type staticlib -C debuginfo=1 -C codegen-units=1 -C opt-level=3 -C panic=abort -C target-cpu=native -C remark=loop-vectorize 1.rs

note: optimization analysis for loop-vectorize at 1.rs:9:0: loop not vectorized: loop control flow is not understood by vectorizer

note: optimization missed for loop-vectorize at 1.rs:9:0: loop not vectorized

After vectorization the .. loop becomes just a * (a+1) / 2, which explains the huge time difference.
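
As a check of that closed form: with n = 100000 the inner sum is 100000 · 100001 / 2 = 5000050000, and the outer loop runs n times, so foo1 returns 100000 · 5000050000 = 500005000000000, exactly the output printed above.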

If we black-box the j in count += j when benchmarking, the result becomes more realistic (2.6× slowdown, not 680× slowdown):

$ time ./1 100000 1
500005000000000

real	0m5.680s
user	0m5.645s
sys	0m0.012s

$ time ./1 100000 2
500005000000000

real	0m15.088s
user	0m15.015s
sys	0m0.026s
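
kennytm doesn't show the modified source, but the black-boxed inner loop might look like this (a sketch; std::hint::black_box stands in for the nightly-only test::black_box of that era):

use std::hint::black_box;

#[inline(never)]
#[no_mangle]
fn foo2_opaque(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            // Opaque to the optimizer: LLVM can no longer collapse the
            // loop into a closed-form sum, so loop overhead dominates.
            count += black_box(j);
        }
    }
    count
}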

We may see if upgrading to LLVM 6 can help this case.

Also note that the two pieces of code are not equivalent: the n + 1 version cannot handle the case n == u64::max_value(), since n + 1 overflows (though that case is rare).
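
A minimal illustration of that edge case (my addition):

fn main() {
    let n = u64::max_value();
    // `0 .. n + 1` overflows: it panics with debug assertions, and with
    // them off it wraps to `0 .. 0`, so the loop body never runs.
    // Only `0 ..= n` can express the full u64 range.
    assert_eq!((0 ..= n).next_back(), Some(n));
}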

kennytm commented Jan 30, 2018

I've checked again using the LLVM 6 build. The performance is improved, but there is still a large gap (2.6× → 2.0×).

Vectorization is still not recognized for 0 ..= n with the naked j.

Timings
$ rustc +nightly -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real	0m5.389s
user	0m5.136s
sys	0m0.012s

$ time ./1 30000 1
13500450000000

real	0m2.195s
user	0m2.192s
sys	0m0.000s

$ rustc +38bd38147d2fa21f8a684b019fc0763adf8fd436 -C codegen-units=1 -C opt-level=3 -C target-cpu=native 1.rs

$ time ./1 30000 2
13500450000000

real	0m4.445s
user	0m4.332s
sys	0m0.000s

$ time ./1 30000 1
13500450000000

real	0m2.330s
user	0m2.184s
sys	0m0.016s
scottmcm commented Feb 5, 2018

This might just be the classic external-iteration-is-slower-sometimes problem. Note that even (0..=n).sum() is currently generating unfortunate code.

A fix for internal iteration is up at #48012.
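
For the original example, the internal-iteration form might look like this (a sketch, assuming the try_fold override from #48012 so the sum avoids the external-iteration state):

#[inline(never)]
#[no_mangle]
fn foo2_sum(n: u64) -> u64 {
    // Internal iteration: RangeInclusive::try_fold makes the start == end
    // exit explicit, which LLVM canonicalizes much more readily.
    (0 .. n).map(|_| (0 ..= n).sum::<u64>()).sum()
}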

ollie27 commented Feb 5, 2018

I believe this can be fixed by adding an extra field to RangeInclusive like this:

struct FixedRangeInclusive {
    start: u64,
    end: u64,
    done: bool,
}

fn fixed_range_inclusive(start: u64, end: u64) -> FixedRangeInclusive {
    FixedRangeInclusive {
        start,
        end,
        done: false,
    }
}

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if !self.done {
            if self.start == self.end {
                self.done = true;
            }
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else {
            None
        }
    }
}

Check out the assembly on the playground.
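
A quick sanity check of the semantics (my addition):

fn main() {
    let v: Vec<u64> = fixed_range_inclusive(0, 3).collect();
    assert_eq!(v, [0, 1, 2, 3]);
    // The endpoint u64::max_value() works too, which `0 .. n + 1` cannot express:
    let mut it = fixed_range_inclusive(u64::max_value(), u64::max_value());
    assert_eq!(it.next(), Some(u64::max_value()));
    assert_eq!(it.next(), None);
}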

kennytm commented Feb 5, 2018

The current two-field RangeInclusive follows from rust-lang/rfcs#1980. The done field was the original design in RFC 1192 but was changed due to rust-lang/rfcs#1192 (comment).

ollie27 commented Feb 5, 2018

I'm aware that RangeInclusive has gone through many different designs, but the current design was clearly not chosen with performance in mind. Of course, ideally RangeInclusive wouldn't have any public fields so these kinds of changes could be made easily.

ExpHP commented Feb 5, 2018

Of course, ideally RangeInclusive wouldn't have any public fields so these kinds of changes could be made easily.

This would be jarring, given that Range does have public data members.

It is unfortunate. Were this not the case I could almost picture something like this:

pub struct RangeInclusive<T> {
    // NOTE: not pub
    start: T, // actually, these should probably be ManuallyDrop<T> 
    end: T,   // or union MaybeUninit<T> { value: T, empty: () }
    done: bool,
}

impl<T> RangeInclusive<T> {
    // Expose an API that matches the functionality of the enum type
    #[inline] pub fn new(start: T, end: T) -> Self { ... }
    #[inline] pub fn new_done() -> Self { ... }
    #[inline] pub fn endpoints(&self) -> Option<(&T, &T)> { ... }
    #[inline] pub fn endpoints_mut(&mut self) -> Option<(&mut T, &mut T)> { ... }
    #[inline] pub fn into_endpoints(self) -> Option<(T, T)> { ... }
}

and ISTM (note: haven't tested) that this should optimize just as well as the three-field struct, since it IS the three-field struct (just with statically enforced usage patterns). But I suppose that, even then, it would seem questionable to have a standard library type that simulates an enum (rather than being one) solely for performance concerns.


Edit: I misread somewhat and thought that the enum was the current proposal.

scottmcm commented Feb 6, 2018

@leonardo-m Can you share some non-simple examples where the current form is a problem? #48012 will make it so that the example here is fine if written the easier way: count += (0 ..= n).sum().

For simple things like sums of iota, it's easy to get suboptimal codegen from all kinds of different iterators. For example, summing via for x in (0..3).chain(3..x) instead of folding the same iterator has exactly the problem raised here: https://godbolt.org/g/tYt7TX

Edit: #48057 has also improved things in recent nightlies, though it's still not perfect.
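
For the record, the two shapes being contrasted might look like this (a sketch; I use a separate bound n where the original comment reused x):

// External iteration: the loop must carry "which half of the chain am I
// in?" state, much like RangeInclusive's extra comparison per step.
fn chain_for(n: u64) -> u64 {
    let mut sum = 0;
    for x in (0 .. 3).chain(3 .. n) {
        sum += x;
    }
    sum
}

// Internal iteration: folding lets each half run as its own tight loop.
fn chain_fold(n: u64) -> u64 {
    (0 .. 3).chain(3 .. n).fold(0, |sum, x| sum + x)
}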

kennytm added a commit to kennytm/rust that referenced this issue Feb 6, 2018

Rollup merge of rust-lang#48012 - scottmcm:faster-rangeinclusive-fold…, r=alexcrichton

Override try_[r]fold for RangeInclusive

Because the last item needs special handling, it seems that LLVM has trouble canonicalizing the loops in external iteration.  With the override, it becomes obvious that the start==end case exits the loop (as opposed to the one *after* that exiting the loop in external iteration).

Demo adapted from rust-lang#45222
```rust
#[no_mangle]
pub fn foo3r(n: u64) -> u64 {
    let mut count = 0;
    (0..n).for_each(|_| {
        (0 ..= n).rev().for_each(|j| {
            count += j;
        })
    });
    count
}
```

<details>
 <summary>Current nightly ASM, 100 lines (https://play.rust-lang.org/?gist=f5674c702c6e2045c3aab5d03763e5f6&version=nightly&mode=release)</summary>

```asm
foo3r:
	pushq	%rbx
.Lcfi0:
.Lcfi1:
	testq	%rdi, %rdi
	je	.LBB0_1
	testb	$1, %dil
	jne	.LBB0_4
	xorl	%eax, %eax
	xorl	%r8d, %r8d
	cmpq	$1, %rdi
	jne	.LBB0_11
	jmp	.LBB0_23
.LBB0_1:
	xorl	%eax, %eax
	popq	%rbx
	retq
.LBB0_4:
	xorl	%r8d, %r8d
	movq	$-1, %r9
	xorl	%eax, %eax
	movq	%rdi, %r11
	xorl	%r10d, %r10d
	jmp	.LBB0_5
.LBB0_8:
	addq	%r11, %rax
	movq	%rsi, %r11
	movq	%rdx, %r10
.LBB0_5:
	cmpq	%r11, %r10
	movl	$1, %ecx
	cmovbq	%r9, %rcx
	cmoveq	%r8, %rcx
	testq	%rcx, %rcx
	movl	$0, %esi
	movl	$1, %edx
	je	.LBB0_8
	cmpq	$-1, %rcx
	jne	.LBB0_9
	leaq	-1(%r11), %rsi
	movq	%r10, %rdx
	jmp	.LBB0_8
.LBB0_9:
	movl	$1, %r8d
	cmpq	$1, %rdi
	je	.LBB0_23
.LBB0_11:
	xorl	%r9d, %r9d
	movq	$-1, %r10
.LBB0_12:
	movq	%rdi, %rsi
	xorl	%r11d, %r11d
	jmp	.LBB0_13
.LBB0_16:
	addq	%rsi, %rax
	movq	%rcx, %rsi
	movq	%rbx, %r11
.LBB0_13:
	cmpq	%rsi, %r11
	movl	$1, %edx
	cmovbq	%r10, %rdx
	cmoveq	%r9, %rdx
	testq	%rdx, %rdx
	movl	$0, %ecx
	movl	$1, %ebx
	je	.LBB0_16
	cmpq	$-1, %rdx
	jne	.LBB0_17
	leaq	-1(%rsi), %rcx
	movq	%r11, %rbx
	jmp	.LBB0_16
.LBB0_17:
	movq	%rdi, %rcx
	xorl	%r11d, %r11d
	jmp	.LBB0_18
.LBB0_21:
	addq	%rcx, %rax
	movq	%rsi, %rcx
	movq	%rbx, %r11
.LBB0_18:
	cmpq	%rcx, %r11
	movl	$1, %edx
	cmovbq	%r10, %rdx
	cmoveq	%r9, %rdx
	testq	%rdx, %rdx
	movl	$0, %esi
	movl	$1, %ebx
	je	.LBB0_21
	cmpq	$-1, %rdx
	jne	.LBB0_22
	leaq	-1(%rcx), %rsi
	movq	%r11, %rbx
	jmp	.LBB0_21
.LBB0_22:
	addq	$2, %r8
	cmpq	%rdi, %r8
	jne	.LBB0_12
.LBB0_23:
	popq	%rbx
	retq
.Lfunc_end0:
```
</details><br>

With this PR:
```asm
foo3r:
	test	rcx, rcx
	je	.LBB3_1
	lea	r8, [rcx - 1]
	lea	rdx, [rcx - 2]
	mov	rax, r8
	mul	rdx
	shld	rdx, rax, 63
	imul	r8, r8
	add	r8, rcx
	sub	r8, rdx
	imul	r8, rcx
	mov	rax, r8
	ret
.LBB3_1:
	xor	r8d, r8d
	mov	rax, r8
	ret
```

bors added a commit that referenced this issue Feb 8, 2018

Auto merge of #48057 - scottmcm:less-match-more-compare, r=dtolnay
Simplify RangeInclusive::next[_back]

`match`ing on an `Option<Ordering>` seems to cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying.

cc #45222 #28237 (comment)

Example:
```rust
#[no_mangle]
pub fn coresum(x: std::ops::RangeInclusive<u64>) -> u64 {
    let mut sum = 0;
    for i in x {
        sum += i ^ (i-1);
    }
    sum
}
```
Today:
```asm
coresum:
    xor r8d, r8d
    mov r9, -1
    xor eax, eax
    jmp .LBB0_1
.LBB0_4:
    lea rcx, [rdi - 1]
    xor rcx, rdi
    add rax, rcx
    mov rsi, rdx
    mov rdi, r10
.LBB0_1:
    cmp rdi, rsi
    mov ecx, 1
    cmovb   rcx, r9
    cmove   rcx, r8
    test    rcx, rcx
    mov edx, 0
    mov r10d, 1
    je  .LBB0_4         // 1
    cmp rcx, -1
    jne .LBB0_5         // 2
    lea r10, [rdi + 1]
    mov rdx, rsi
    jmp .LBB0_4         // 3
.LBB0_5:
    ret
```
With this PR:
```asm
coresum:
	cmp	rcx, rdx
	jbe	.LBB0_2
	xor	eax, eax
	ret
.LBB0_2:
	xor	r8d, r8d
	mov	r9d, 1
	xor	eax, eax
	.p2align	4, 0x90
.LBB0_3:
	lea	r10, [rcx + 1]
	cmp	rcx, rdx
	cmovae	rdx, r8
	cmovae	r10, r9
	lea	r11, [rcx - 1]
	xor	r11, rcx
	add	rax, r11
	mov	rcx, r10
	cmp	r10, rdx
	jbe	.LBB0_3         // Just this
	ret
```

<details><summary>Though using internal iteration (`.map(|i| i ^ (i-1)).sum()`) is still shorter to type, and lets the compiler unroll it</summary>

```asm
coresum_inner:
.Lcfi0:
.seh_proc coresum_inner
	sub	rsp, 168
.Lcfi1:
	.seh_stackalloc 168
	vmovdqa	xmmword ptr [rsp + 144], xmm15
.Lcfi2:
	.seh_savexmm 15, 144
	vmovdqa	xmmword ptr [rsp + 128], xmm14
.Lcfi3:
	.seh_savexmm 14, 128
	vmovdqa	xmmword ptr [rsp + 112], xmm13
.Lcfi4:
	.seh_savexmm 13, 112
	vmovdqa	xmmword ptr [rsp + 96], xmm12
.Lcfi5:
	.seh_savexmm 12, 96
	vmovdqa	xmmword ptr [rsp + 80], xmm11
.Lcfi6:
	.seh_savexmm 11, 80
	vmovdqa	xmmword ptr [rsp + 64], xmm10
.Lcfi7:
	.seh_savexmm 10, 64
	vmovdqa	xmmword ptr [rsp + 48], xmm9
.Lcfi8:
	.seh_savexmm 9, 48
	vmovdqa	xmmword ptr [rsp + 32], xmm8
.Lcfi9:
	.seh_savexmm 8, 32
	vmovdqa	xmmword ptr [rsp + 16], xmm7
.Lcfi10:
	.seh_savexmm 7, 16
	vmovdqa	xmmword ptr [rsp], xmm6
.Lcfi11:
	.seh_savexmm 6, 0
.Lcfi12:
	.seh_endprologue
	cmp	rdx, rcx
	jae	.LBB1_2
	xor	eax, eax
	jmp	.LBB1_13
.LBB1_2:
	mov	r8, rdx
	sub	r8, rcx
	jbe	.LBB1_3
	cmp	r8, 7
	jbe	.LBB1_5
	mov	rax, r8
	and	rax, -8
	mov	r9, r8
	and	r9, -8
	je	.LBB1_5
	add	rax, rcx
	vmovq	xmm0, rcx
	vpshufd	xmm0, xmm0, 68
	mov	ecx, 1
	vmovq	xmm1, rcx
	vpslldq	xmm1, xmm1, 8
	vpaddq	xmm1, xmm0, xmm1
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	xmm11, xmm11, xmm11
	vmovdqa	xmm12, xmmword ptr [rip + __xmm@00000000000000010000000000000001]
	vmovdqa	xmm13, xmmword ptr [rip + __xmm@00000000000000030000000000000003]
	vmovdqa	xmm14, xmmword ptr [rip + __xmm@00000000000000050000000000000005]
	vmovdqa	xmm15, xmmword ptr [rip + __xmm@00000000000000080000000000000008]
	mov	rcx, r9
	vpxor	xmm4, xmm4, xmm4
	vpxor	xmm5, xmm5, xmm5
	vpxor	xmm6, xmm6, xmm6
	.p2align	4, 0x90
.LBB1_9:
	vpaddq	xmm7, xmm1, xmmword ptr [rip + __xmm@00000000000000020000000000000002]
	vpaddq	xmm9, xmm1, xmmword ptr [rip + __xmm@00000000000000040000000000000004]
	vpaddq	xmm10, xmm1, xmmword ptr [rip + __xmm@00000000000000060000000000000006]
	vpaddq	xmm8, xmm1, xmm12
	vpxor	xmm7, xmm8, xmm7
	vpaddq	xmm2, xmm1, xmm13
	vpxor	xmm8, xmm2, xmm9
	vpaddq	xmm3, xmm1, xmm14
	vpxor	xmm3, xmm3, xmm10
	vpaddq	xmm2, xmm1, xmm11
	vpxor	xmm2, xmm2, xmm1
	vpaddq	xmm0, xmm2, xmm0
	vpaddq	xmm4, xmm7, xmm4
	vpaddq	xmm5, xmm8, xmm5
	vpaddq	xmm6, xmm3, xmm6
	vpaddq	xmm1, xmm1, xmm15
	add	rcx, -8
	jne	.LBB1_9
	vpaddq	xmm0, xmm4, xmm0
	vpaddq	xmm0, xmm5, xmm0
	vpaddq	xmm0, xmm6, xmm0
	vpshufd	xmm1, xmm0, 78
	vpaddq	xmm0, xmm0, xmm1
	vmovq	r10, xmm0
	cmp	r8, r9
	jne	.LBB1_6
	jmp	.LBB1_11
.LBB1_3:
	xor	r10d, r10d
	jmp	.LBB1_12
.LBB1_5:
	xor	r10d, r10d
	mov	rax, rcx
	.p2align	4, 0x90
.LBB1_6:
	lea	rcx, [rax - 1]
	xor	rcx, rax
	inc	rax
	add	r10, rcx
	cmp	rdx, rax
	jne	.LBB1_6
.LBB1_11:
	mov	rcx, rdx
.LBB1_12:
	lea	rax, [rcx - 1]
	xor	rax, rcx
	add	rax, r10
.LBB1_13:
	vmovaps	xmm6, xmmword ptr [rsp]
	vmovaps	xmm7, xmmword ptr [rsp + 16]
	vmovaps	xmm8, xmmword ptr [rsp + 32]
	vmovaps	xmm9, xmmword ptr [rsp + 48]
	vmovaps	xmm10, xmmword ptr [rsp + 64]
	vmovaps	xmm11, xmmword ptr [rsp + 80]
	vmovaps	xmm12, xmmword ptr [rsp + 96]
	vmovaps	xmm13, xmmword ptr [rsp + 112]
	vmovaps	xmm14, xmmword ptr [rsp + 128]
	vmovaps	xmm15, xmmword ptr [rsp + 144]
	add	rsp, 168
	ret
	.seh_handlerdata
	.section	.text,"xr",one_only,coresum_inner
.Lcfi13:
	.seh_endproc
```

</details>

bors added a commit that referenced this issue Feb 8, 2018

kennytm commented May 1, 2018

For the record: enabling Polly (#50044) doesn't fix the issue.

Stargateur commented May 11, 2018

@ollie27 Just a note: since you are using a bool, your example always performs two tests per call. Something like this might speed things up, because it only performs the second test for the last two values:

impl Iterator for FixedRangeInclusive {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if self.start < self.end {
            let new = self.start.wrapping_add(1);
            Some(std::mem::replace(&mut self.start, new))
        } else if !self.done {
            self.done = true;
            Some(self.start)
        } else {
            None
        }
    }
}
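
A quick check of the edge behavior (my addition, reusing the FixedRangeInclusive struct from above):

fn main() {
    let mut it = FixedRangeInclusive {
        start: u64::max_value() - 1,
        end: u64::max_value(),
        done: false,
    };
    assert_eq!(it.next(), Some(u64::max_value() - 1)); // start < end: single test
    assert_eq!(it.next(), Some(u64::max_value()));     // second branch sets `done`
    assert_eq!(it.next(), None);
}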

kennytm added a commit to kennytm/rust that referenced this issue Jun 18, 2018

kennytm added a commit to kennytm/rust that referenced this issue Jun 19, 2018

kennytm added a commit to kennytm/rust that referenced this issue Jun 19, 2018

kennytm added a commit to kennytm/rust that referenced this issue Jun 22, 2018

kennytm added a commit to kennytm/rust that referenced this issue Jun 30, 2018

kennytm added a commit to kennytm/rust that referenced this issue Jul 12, 2018

Rollup merge of rust-lang#51622 - kennytm:three-field-range-inclusive…, r=SimonSapin

Change RangeInclusive to a three-field struct.

Fix rust-lang#45222.

This PR also reverts rust-lang#48012 (i.e. removed the `try_fold`/`try_rfold` specialization for `RangeInclusive`) because LLVM no longer has trouble recognizing a RangeInclusive loop.
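
For context, the merged three-field struct has the same shape as the workarounds sketched earlier in this thread (a rough sketch, not the exact libcore definition):

pub struct RangeInclusive<Idx> {
    start: Idx,
    end: Idx,
    is_empty: Option<bool>, // exhaustion flag, computed lazily
}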

bors added a commit that referenced this issue Jul 12, 2018

Auto merge of #51622 - kennytm:three-field-range-inclusive, r=SimonSapin
Change RangeInclusive to a three-field struct.

Fix #45222.

This PR also reverts #48012 (i.e. removed the `try_fold`/`try_rfold` specialization for `RangeInclusive`) because LLVM no longer has trouble recognizing a RangeInclusive loop.

kennytm added a commit to kennytm/rust that referenced this issue Jul 13, 2018

bors added a commit that referenced this issue Jul 13, 2018

bors added a commit that referenced this issue Jul 13, 2018

bors closed this in #51622 on Jul 13, 2018

leonardo-m commented Dec 6, 2018

To answer issue #56516, this is a first example of the performance problem; better examples could follow. Code example from:
http://ericniebler.com/2014/04/27/range-comprehensions/

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1 .. z + 1)
                        .flat_map(move |x| (x .. z + 1u32)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result); // 10650478, about 2.8 seconds.
}

If I replace the open intervals with closed ones:

fn triples() -> impl Iterator<Item=(u32, u32, u32)> {
    (1 ..).flat_map(|z| (1u32 ..= z)
                        .flat_map(move |x| (x ..= z)
                                           .filter(move |&y| x.pow(2) + y.pow(2) == z.pow(2))
                                           .map(move |y| (x, y, z))))
}

fn main() {
    let result: u32 = triples().take(3_000).map(|(x, y, z)| x + y + z).sum();
    println!("{}", result);
}

For the second version I am seeing a run-time of about 6 seconds.
