diff --git a/src/libcore/benches/ascii.rs b/src/libcore/benches/ascii.rs index 76ccd3ddb6fd8..05dd7adff1fbb 100644 --- a/src/libcore/benches/ascii.rs +++ b/src/libcore/benches/ascii.rs @@ -1,3 +1,5 @@ +mod is_ascii; + // Lower-case ASCII 'a' is the first byte that has its highest bit set // after wrap-adding 0x1F: // diff --git a/src/libcore/benches/ascii/is_ascii.rs b/src/libcore/benches/ascii/is_ascii.rs new file mode 100644 index 0000000000000..729b0a04eb6ba --- /dev/null +++ b/src/libcore/benches/ascii/is_ascii.rs @@ -0,0 +1,82 @@ +use super::{LONG, MEDIUM, SHORT}; +use test::black_box; +use test::Bencher; + +macro_rules! benches { + ($( fn $name: ident($arg: ident: &[u8]) $body: block )+) => { + benches!(mod short SHORT[..] $($name $arg $body)+); + benches!(mod medium MEDIUM[..] $($name $arg $body)+); + benches!(mod long LONG[..] $($name $arg $body)+); + // Ensure we benchmark cases where the functions are called with strings + // that are not perfectly aligned or have a length which is not a + // multiple of size_of::() (or both) + benches!(mod unaligned_head MEDIUM[1..] $($name $arg $body)+); + benches!(mod unaligned_tail MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+); + benches!(mod unaligned_both MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+); + }; + + (mod $mod_name: ident $input: ident [$range: expr] $($name: ident $arg: ident $body: block)+) => { + mod $mod_name { + use super::*; + $( + #[bench] + fn $name(bencher: &mut Bencher) { + bencher.bytes = $input[$range].len() as u64; + let mut vec = $input.as_bytes().to_vec(); + bencher.iter(|| { + let $arg: &[u8] = &black_box(&mut vec)[$range]; + black_box($body) + }) + } + )+ + } + }; +} + +benches! { + fn case00_libcore(bytes: &[u8]) { + bytes.is_ascii() + } + + fn case01_iter_all(bytes: &[u8]) { + bytes.iter().all(|b| b.is_ascii()) + } + + fn case02_align_to(bytes: &[u8]) { + is_ascii_align_to(bytes) + } + + fn case03_align_to_unrolled(bytes: &[u8]) { + is_ascii_align_to_unrolled(bytes) + } +} + +// These are separate since it's easier to debug errors if they don't go through +// macro expansion first. +fn is_ascii_align_to(bytes: &[u8]) -> bool { + if bytes.len() < core::mem::size_of::() { + return bytes.iter().all(|b| b.is_ascii()); + } + // SAFETY: transmuting a sequence of `u8` to `usize` is always fine + let (head, body, tail) = unsafe { bytes.align_to::() }; + head.iter().all(|b| b.is_ascii()) + && body.iter().all(|w| !contains_nonascii(*w)) + && tail.iter().all(|b| b.is_ascii()) +} + +fn is_ascii_align_to_unrolled(bytes: &[u8]) -> bool { + if bytes.len() < core::mem::size_of::() { + return bytes.iter().all(|b| b.is_ascii()); + } + // SAFETY: transmuting a sequence of `u8` to `[usize; 2]` is always fine + let (head, body, tail) = unsafe { bytes.align_to::<[usize; 2]>() }; + head.iter().all(|b| b.is_ascii()) + && body.iter().all(|w| !contains_nonascii(w[0] | w[1])) + && tail.iter().all(|b| b.is_ascii()) +} + +#[inline] +fn contains_nonascii(v: usize) -> bool { + const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; + (NONASCII_MASK & v) != 0 +} diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index e7a2d7adedea0..bed8495993f43 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -2795,7 +2795,7 @@ impl [u8] { #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] #[inline] pub fn is_ascii(&self) -> bool { - self.iter().all(|b| b.is_ascii()) + is_ascii(self) } /// Checks that two slices are an ASCII case-insensitive match. @@ -2843,6 +2843,106 @@ impl [u8] { } } +/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed +/// from `../str/mod.rs`, which does something similar for utf8 validation. +#[inline] +fn contains_nonascii(v: usize) -> bool { + const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; + (NONASCII_MASK & v) != 0 +} + +/// Optimized ASCII test that will use usize-at-a-time operations instead of +/// byte-at-a-time operations (when possible). +/// +/// The algorithm we use here is pretty simple. If `s` is too short, we just +/// check each byte and be done with it. Otherwise: +/// +/// - Read the first word with an unaligned load. +/// - Align the pointer, read subsequent words until end with aligned loads. +/// - If there's a tail, the last `usize` from `s` with an unaligned load. +/// +/// If any of these loads produces something for which `contains_nonascii` +/// (above) returns true, then we know the answer is false. +#[inline] +fn is_ascii(s: &[u8]) -> bool { + const USIZE_SIZE: usize = mem::size_of::(); + + let len = s.len(); + let align_offset = s.as_ptr().align_offset(USIZE_SIZE); + + // If we wouldn't gain anything from the word-at-a-time implementation, fall + // back to a scalar loop. + // + // We also do this for architectures where `size_of::()` isn't + // sufficient alignment for `usize`, because it's a weird edge case. + if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::() { + return s.iter().all(|b| b.is_ascii()); + } + + // We always read the first word unaligned, which means `align_offset` is + // 0, we'd read the same value again for the aligned read. + let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset }; + + let start = s.as_ptr(); + // SAFETY: We verify `len < USIZE_SIZE` above. + let first_word = unsafe { (start as *const usize).read_unaligned() }; + + if contains_nonascii(first_word) { + return false; + } + // We checked this above, somewhat implicitly. Note that `offset_to_aligned` + // is either `align_offset` or `USIZE_SIZE`, both of are explicitly checked + // above. + debug_assert!(offset_to_aligned <= len); + + // word_ptr is the (properly aligned) usize ptr we use to read the middle chunk of the slice. + let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize }; + + // `byte_pos` is the byte index of `word_ptr`, used for loop end checks. + let mut byte_pos = offset_to_aligned; + + // Paranoia check about alignment, since we're about to do a bunch of + // unaligned loads. In practice this should be impossible barring a bug in + // `align_offset` though. + debug_assert_eq!((word_ptr as usize) % mem::align_of::(), 0); + + while byte_pos <= len - USIZE_SIZE { + debug_assert!( + // Sanity check that the read is in bounds + (word_ptr as usize + USIZE_SIZE) <= (start.wrapping_add(len) as usize) && + // And that our assumptions about `byte_pos` hold. + (word_ptr as usize) - (start as usize) == byte_pos + ); + + // Safety: We know `word_ptr` is properly aligned (because of + // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end + let word = unsafe { word_ptr.read() }; + if contains_nonascii(word) { + return false; + } + + byte_pos += USIZE_SIZE; + // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that + // after this `add`, `word_ptr` will be at most one-past-the-end. + word_ptr = unsafe { word_ptr.add(1) }; + } + + // If we have anything left over, it should be at-most 1 usize worth of bytes, + // which we check with a read_unaligned. + if byte_pos == len { + return true; + } + + // Sanity check to ensure there really is only one `usize` left. This should + // be guaranteed by our loop condition. + debug_assert!(byte_pos < len && len - byte_pos < USIZE_SIZE); + + // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start. + let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() }; + + !contains_nonascii(last_word) +} + #[stable(feature = "rust1", since = "1.0.0")] impl ops::Index for [T] where diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 0014501d2c4d0..003ed7df36e2a 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -4348,7 +4348,7 @@ impl str { // We can treat each byte as character here: all multibyte characters // start with a byte that is not in the ascii range, so we will stop // there already. - self.bytes().all(|b| b.is_ascii()) + self.as_bytes().is_ascii() } /// Checks that two strings are an ASCII case-insensitive match. diff --git a/src/libcore/tests/ascii.rs b/src/libcore/tests/ascii.rs index 71275d40c4621..57f2de16b2b37 100644 --- a/src/libcore/tests/ascii.rs +++ b/src/libcore/tests/ascii.rs @@ -343,3 +343,59 @@ fn test_is_ascii_control() { " ", ); } + +// `is_ascii` does a good amount of pointer manipulation and has +// alignment-dependent computation. This is all sanity-checked via +// `debug_assert!`s, so we test various sizes/alignments thoroughly versus an +// "obviously correct" baseline function. +#[test] +fn test_is_ascii_align_size_thoroughly() { + // The "obviously-correct" baseline mentioned above. + fn is_ascii_baseline(s: &[u8]) -> bool { + s.iter().all(|b| b.is_ascii()) + } + + // Helper to repeat `l` copies of `b0` followed by `l` copies of `b1`. + fn repeat_concat(b0: u8, b1: u8, l: usize) -> Vec { + use core::iter::repeat; + repeat(b0).take(l).chain(repeat(b1).take(l)).collect() + } + + // Miri is too slow for much of this, and in miri `align_offset` always + // returns `usize::max_value()` anyway (at the moment), so we just test + // lightly. + let iter = if cfg!(miri) { 0..5 } else { 0..100 }; + + for i in iter { + #[cfg(not(miri))] + let cases = &[ + b"a".repeat(i), + b"\0".repeat(i), + b"\x7f".repeat(i), + b"\x80".repeat(i), + b"\xff".repeat(i), + repeat_concat(b'a', 0x80u8, i), + repeat_concat(0x80u8, b'a', i), + ]; + + #[cfg(miri)] + let cases = &[repeat_concat(b'a', 0x80u8, i)]; + + for case in cases { + for pos in 0..=case.len() { + // Potentially misaligned head + let prefix = &case[pos..]; + assert_eq!(is_ascii_baseline(prefix), prefix.is_ascii(),); + + // Potentially misaligned tail + let suffix = &case[..case.len() - pos]; + + assert_eq!(is_ascii_baseline(suffix), suffix.is_ascii(),); + + // Both head and tail are potentially misaligned + let mid = &case[(pos / 2)..(case.len() - (pos / 2))]; + assert_eq!(is_ascii_baseline(mid), mid.is_ascii(),); + } + } + } +}