Constructing an iterator from a slice or Vec doesn't optimise completely #11751

Closed
huonw opened this Issue Jan 23, 2014 · 11 comments

Comments

Projects
None yet
6 participants
@huonw
Member

huonw commented Jan 23, 2014

#![crate_type = "lib"]

pub fn slice(s: &[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}
pub fn vec(s: Vec<uint>) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn owned(s: ~[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

Compiled with -O --lib --emit-llvm -S gives the following. The only major difference between &[]/Vec and ~[] is the two lines marked THIS CHECK, which, we think, is because when constructing an iterator from ~[] we do a pointer offset and dereference, so LLVM knows the pointers are non-null (in the slice/Vec case, the match it.next() { None => ... } part of the for loop isn't removed).

; ModuleID = '11751.rs'
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"struct.std::vec::Vec<uint>[#1]" = type { i64, i64, i64* }

; Function Attrs: nounwind readonly uwtable
define i64 @_ZN5slice20h084d58a6edab0287daa4v0.0E({ i64*, i64 }* noalias nocapture nonnull readonly) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 0
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %match_else, %entry-block
  %6 = phi i64* [ %9, %match_else ], [ %2, %entry-block ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null     ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %return, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %return, label %loop_body

return:                                           ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  ret i64 %__make_return_pointer.0
}

; Function Attrs: uwtable
define i64 @_ZN3vec20h4963a1d1a9f58c9eUaa4v0.0E(%"struct.std::vec::Vec<uint>[#1]"* noalias nocapture nonnull readonly) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 2
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 0
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %2, %entry-block ], [ %9, %match_else ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null      ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %clean_custom_6, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %clean_custom_6, label %loop_body

clean_custom_6:                                   ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  %12 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 1
  %13 = load i64* %12, align 8
  %14 = icmp eq i64 %13, 0
  br i1 %14, label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit", label %then-block-549-.i.i

then-block-549-.i.i:                              ; preds = %clean_custom_6
  %15 = bitcast i64* %2 to i8*
  tail call void @je_dallocx(i8* %15, i32 3)
  br label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit"

"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit": ; preds = %clean_custom_6, %then-block-549-.i.i
  ret i64 %__make_return_pointer.0
}

declare void @je_dallocx(i8*, i32) unnamed_addr #2

; Function Attrs: uwtable
define i64 @_ZN5owned20h3f7b4426165c9e96Bba4v0.0E({ i64, i64, [0 x i64] }* noalias nonnull) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 0
  %2 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 0
  %3 = load i64* %2, align 8
  %4 = lshr i64 %3, 3
  %5 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %1, %entry-block ], [ %8, %match_else ]
  %7 = icmp eq i64* %6, %5
  br i1 %7, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %match_else

match_else:                                       ; preds = %loop_body
  %8 = getelementptr inbounds i64* %6, i64 1
  %9 = load i64* %6, align 8
  %10 = icmp ugt i64 %9, 10
  br i1 %10, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %loop_body

"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit": ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %9, %match_else ], [ 0, %loop_body ]
  %11 = bitcast { i64, i64, [0 x i64] }* %0 to i8*
  tail call void @je_dallocx(i8* %11, i32 3)
  ret i64 %__make_return_pointer.0
}

attributes #0 = { nounwind readonly uwtable "split-stack" }
attributes #1 = { uwtable "split-stack" }
attributes #2 = { "split-stack" }
@thestinger

This comment has been minimized.

Show comment
Hide comment
@thestinger

thestinger Jan 23, 2014

Contributor

Implementing #9546 would fix this.

Contributor

thestinger commented Jan 23, 2014

Implementing #9546 would fix this.

@thestinger thestinger referenced this issue Jan 28, 2014

Closed

port from ~[T] to Vec<T> #11875

1 of 3 tasks complete

huonw added a commit to huonw/rust that referenced this issue Feb 16, 2014

std::str: safen and optimize is_utf8.
This uses a vector iterator to avoid the necessity for unsafe indexing,
and makes this function slightly faster. Unfortunately #11751 means that
the iterator comes with repeated `null` checks which means the
pure-ASCII case still has room for significant improvement (and the
other cases too, but it's most significant for just ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)

huonw added a commit to huonw/rust that referenced this issue Feb 17, 2014

std::str: safen and optimize is_utf8.
This uses a vector iterator to avoid the necessity for unsafe indexing,
and makes this function slightly faster. Unfortunately #11751 means that
the iterator comes with repeated `null` checks which means the
pure-ASCII case still has room for significant improvement (and the
other cases too, but it's most significant for just ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)

huonw added a commit to huonw/rust that referenced this issue Feb 18, 2014

std::str: safen and optimize is_utf8.
This uses a vector iterator to avoid the necessity for unsafe indexing,
and makes this function slightly faster. Unfortunately #11751 means that
the iterator comes with repeated `null` checks which means the
pure-ASCII case still has room for significant improvement (and the
other cases too, but it's most significant for just ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)

jxs added a commit to jxs/rust that referenced this issue Apr 2, 2014

std::str: safen and optimize is_utf8.
This uses a vector iterator to avoid the necessity for unsafe indexing,
and makes this function slightly faster. Unfortunately #11751 means that
the iterator comes with repeated `null` checks which means the
pure-ASCII case still has room for significant improvement (and the
other cases too, but it's most significant for just ASCII).

Before:

    is_utf8_100_ascii             ... bench:       143 ns/iter (+/- 6)
    is_utf8_100_multibyte         ... bench:       134 ns/iter (+/- 4)

After:

    is_utf8_100_ascii             ... bench:       123 ns/iter (+/- 4)
    is_utf8_100_multibyte         ... bench:       115 ns/iter (+/- 5)

@huonw huonw changed the title from Constructing an iterator from a slice doesn't optimise completely to Constructing an iterator from a slice or Vec doesn't optimise completely Jun 4, 2014

@aturon aturon added the A-libs label Jun 4, 2014

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 11, 2014

In theory, LLVM should be able to determine that this null check is unnecessary without additional metadata. There are two separate changes to LLVM's optimizer that are required:

  1. An inbounds GEP either produces a valid pointer into an allocated object or a poison value. In address space 0, there is no allocated object at the zero address. This implies that any inbounds GEP that yields a null pointer in address space 0 is a poison value. According to the LLVM LangRef, the icmp would depend on the poison value, and any instruction that depends on poison values exhibits undefined behavior. However, Dan Gohman (sunfish) tells me that it was only intended to apply to instructions exhibiting externally visible side effects, as otherwise it would mean that any add instruction could potentially have undefined behavior. Any chain of inbounds GEPs and phis ending with a load of the inbounds value should be undefined behavior, because of how poison values behave like undef. We can't just assume that an inbounds GEP produces a nonnull value, because then that implies that inbounds GEPs themselves can have undefined behavior when the runtime value is actually null. Lots of optimizations depend on being able to hoist GEPs, e.g. out of loops, and that wouldn't be possible if they potentially had undefined behavior.
  2. The current LazyValueInfo / CorrelatedValuePropagation passes are not optimistic with respect to control flow. Since there is a phi here, the optimization opportunity would be missed even if LazyValueInfo understood that the pointers are nonnull. This would improve other optimizations as well, but it would probably hurt compile-time a bit. IIRC changes along these lines have been proposed for LLVM in the past, but they have never gone in.

Correctly implementing the rule for poison values in LazyValueInfo would be quite difficult, because it requires reasoning about control-dependence with respect to poison values. Also, the cost of making LazyValueInfo optimistic might be too high in compile time to get the patch landed.

In #9546 there is a proposal to add metadata on LLVM instructions that indicates that the instruction produces a nonnull value. There are two reasons why this proposal would be a bit more difficult than it seems at first:

  1. If an instruction is marked with the nonnull metadata, then what happens if the value actually is null at runtime? Is it undefined behavior, or is it a poison value? If it is undefined behavior, then this means that any instruction with the nonnull metadata would potentially have side effects, and code motion that modifies control dependence of this instruction would be prohibited. Dan and I realized that this is also a problem with the current range metadata in LLVM that has likely gone unnoticed. It isn't a problem with the existing nonnull attribute on function parameters and return values, because the attribute is erased upon inlining.
  2. The nonnull metadata would only tell you that the inputs to the phi are nonnull, but LazyValueInfo would still not be able to propagate that to the phi because of the inadequacy mentioned above. Another pass would have to propagate nonnull on phis.

Another option would be to write a pass that looks for chains of inbounds GEPs, nonnull parameters / return values, and phis of these that feed into loads and stores control-dependent on null checks of some intermediate value in the chain. The null checks could be replaced with false, and then hopefully other optimization passes would be able to clean everything up.

zwarich commented Jun 11, 2014

In theory, LLVM should be able to determine that this null check is unnecessary without additional metadata. There are two separate changes to LLVM's optimizer that are required:

  1. An inbounds GEP either produces a valid pointer into an allocated object or a poison value. In address space 0, there is no allocated object at the zero address. This implies that any inbounds GEP that yields a null pointer in address space 0 is a poison value. According to the LLVM LangRef, the icmp would depend on the poison value, and any instruction that depends on poison values exhibits undefined behavior. However, Dan Gohman (sunfish) tells me that it was only intended to apply to instructions exhibiting externally visible side effects, as otherwise it would mean that any add instruction could potentially have undefined behavior. Any chain of inbounds GEPs and phis ending with a load of the inbounds value should be undefined behavior, because of how poison values behave like undef. We can't just assume that an inbounds GEP produces a nonnull value, because then that implies that inbounds GEPs themselves can have undefined behavior when the runtime value is actually null. Lots of optimizations depend on being able to hoist GEPs, e.g. out of loops, and that wouldn't be possible if they potentially had undefined behavior.
  2. The current LazyValueInfo / CorrelatedValuePropagation passes are not optimistic with respect to control flow. Since there is a phi here, the optimization opportunity would be missed even if LazyValueInfo understood that the pointers are nonnull. This would improve other optimizations as well, but it would probably hurt compile-time a bit. IIRC changes along these lines have been proposed for LLVM in the past, but they have never gone in.

Correctly implementing the rule for poison values in LazyValueInfo would be quite difficult, because it requires reasoning about control-dependence with respect to poison values. Also, the cost of making LazyValueInfo optimistic might be too high in compile time to get the patch landed.

In #9546 there is a proposal to add metadata on LLVM instructions that indicates that the instruction produces a nonnull value. There are two reasons why this proposal would be a bit more difficult than it seems at first:

  1. If an instruction is marked with the nonnull metadata, then what happens if the value actually is null at runtime? Is it undefined behavior, or is it a poison value? If it is undefined behavior, then this means that any instruction with the nonnull metadata would potentially have side effects, and code motion that modifies control dependence of this instruction would be prohibited. Dan and I realized that this is also a problem with the current range metadata in LLVM that has likely gone unnoticed. It isn't a problem with the existing nonnull attribute on function parameters and return values, because the attribute is erased upon inlining.
  2. The nonnull metadata would only tell you that the inputs to the phi are nonnull, but LazyValueInfo would still not be able to propagate that to the phi because of the inadequacy mentioned above. Another pass would have to propagate nonnull on phis.

Another option would be to write a pass that looks for chains of inbounds GEPs, nonnull parameters / return values, and phis of these that feed into loads and stores control-dependent on null checks of some intermediate value in the chain. The null checks could be replaced with false, and then hopefully other optimization passes would be able to clean everything up.

@pcwalton

This comment has been minimized.

Show comment
Hide comment
@pcwalton

pcwalton Jun 12, 2014

Contributor

Sounds like the last option is the easiest. I also like the fact that it's a separate pass, meaning that if we have trouble getting it upstream we can maintain it in our branch for a while.

Contributor

pcwalton commented Jun 12, 2014

Sounds like the last option is the easiest. I also like the fact that it's a separate pass, meaning that if we have trouble getting it upstream we can maintain it in our branch for a while.

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 26, 2014

I wrote the optimization pass I described. It is able to optimize the first case (with &[uint]), but it is unable to optimize the second case (with Vec<uint>). I have an informal inductive argument for why it should still always be nonnull with LLVM IR's poison value semantics, but implementing it as code will be trickier.

zwarich commented Jun 26, 2014

I wrote the optimization pass I described. It is able to optimize the first case (with &[uint]), but it is unable to optimize the second case (with Vec<uint>). I have an informal inductive argument for why it should still always be nonnull with LLVM IR's poison value semantics, but implementing it as code will be trickier.

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 27, 2014

Unfortunately, my pass causes the compiled rustc to segfault when compiling liblibc. That might be a pain to track down. The problem could be in my code, or rustc could just be marking a GEP inbounds when it shouldn't.

zwarich commented Jun 27, 2014

Unfortunately, my pass causes the compiled rustc to segfault when compiling liblibc. That might be a pain to track down. The problem could be in my code, or rustc could just be marking a GEP inbounds when it shouldn't.

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 28, 2014

I found the issue and put a first cut of my pass up as rust-lang/llvm#13.

zwarich commented Jun 28, 2014

I found the issue and put a first cut of my pass up as rust-lang/llvm#13.

@zwarich zwarich referenced this issue in rust-lang/llvm Jun 28, 2014

Closed

Add a Rust NullCheckElimination pass #13

@brson

This comment has been minimized.

Show comment
Hide comment
@brson

brson Jun 28, 2014

Contributor

\o/
On Jun 28, 2014 12:24 AM, "Cameron Zwarich" notifications@github.com
wrote:

I found the issue and put a first cut of my pass up as rust-lang/llvm#13
rust-lang/llvm#13.


Reply to this email directly or view it on GitHub
#11751 (comment).

Contributor

brson commented Jun 28, 2014

\o/
On Jun 28, 2014 12:24 AM, "Cameron Zwarich" notifications@github.com
wrote:

I found the issue and put a first cut of my pass up as rust-lang/llvm#13
rust-lang/llvm#13.


Reply to this email directly or view it on GitHub
#11751 (comment).

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 28, 2014

The pass that was landed handles the &[T] case. There are two obvious remaining things to do:

  1. Generalize to the case of multiple null checks involved in a single branch. This shouldn't be too hard.
  2. Handle the case where the base pointer is not known to be nonnull, but everything else in the recurrence is an inbounds GEP. This corresponds to the Vec<T> case. In theory, we could add nonnull metadata to the load (since loads are allowed to have undefined behavior), but applying the LLVM rules inductively actually lets the checks be removed without that.

zwarich commented Jun 28, 2014

The pass that was landed handles the &[T] case. There are two obvious remaining things to do:

  1. Generalize to the case of multiple null checks involved in a single branch. This shouldn't be too hard.
  2. Handle the case where the base pointer is not known to be nonnull, but everything else in the recurrence is an inbounds GEP. This corresponds to the Vec<T> case. In theory, we could add nonnull metadata to the load (since loads are allowed to have undefined behavior), but applying the LLVM rules inductively actually lets the checks be removed without that.
@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 28, 2014

Actually, that second point applies to the IR generated by &[T] as well. I must've oversimplified it when I made test cases. I'll try to solve that soon then.

zwarich commented Jun 28, 2014

Actually, that second point applies to the IR generated by &[T] as well. I must've oversimplified it when I made test cases. I'll try to solve that soon then.

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 28, 2014

I have local changes that fix those two items. In order to make this apply to zip(), I also need to track dominating conditions from other blocks. This shouldn't be that much more work, so maybe I'll hold out until I have that working.

zwarich commented Jun 28, 2014

I have local changes that fix those two items. In order to make this apply to zip(), I also need to track dominating conditions from other blocks. This shouldn't be that much more work, so maybe I'll hold out until I have that working.

@zwarich zwarich self-assigned this Jun 28, 2014

@zwarich

This comment has been minimized.

Show comment
Hide comment
@zwarich

zwarich Jun 29, 2014

Those changes are up as rust-lang/llvm#14. I'll need to add less conservative control dependence checking, and then it should be able to handle arbitrary chaining of zipped iterators.

zwarich commented Jun 29, 2014

Those changes are up as rust-lang/llvm#14. I'll need to add less conservative control dependence checking, and then it should be able to handle arbitrary chaining of zipped iterators.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment