From b031c516641c53577d021da1a89d0c953acdc23a Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Tue, 7 Oct 2025 05:52:37 -0400 Subject: [PATCH 1/4] slice/ascii: Optimize `eq_ignore_ascii_case` with auto-vectorization Refactor the current functionality into a helper function Use `as_chunks` to encourage auto-vectorization in the optimized chunk processing function Add a codegen test Add benches for `eq_ignore_ascii_case` The optimized function is initially only enabled for x86_64 which has `sse2` as part of its baseline, but none of the code is platform specific. Other platforms with SIMD instructions may also benefit from this implementation. Performance improvements only manifest for slices of 16 bytes or longer, so the optimized path is gated behind a length check for greater than or equal to 16. --- library/core/src/slice/ascii.rs | 43 ++++++++++++++ library/coretests/benches/ascii.rs | 1 + .../benches/ascii/eq_ignore_ascii_case.rs | 56 +++++++++++++++++++ .../lib-optimizations/eq_ignore_ascii_case.rs | 14 +++++ 4 files changed, 114 insertions(+) create mode 100644 library/coretests/benches/ascii/eq_ignore_ascii_case.rs create mode 100644 tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index e17a2e03d2dc4..1f9ca4bc66987 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -60,6 +60,18 @@ impl [u8] { return false; } + #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] + if self.len() >= 16 { + return self.eq_ignore_ascii_case_chunks(other); + } + + self.eq_ignore_ascii_case_simple(other) + } + + /// ASCII case-insensitive equality check without chunk-at-a-time + /// optimization. + #[inline] + const fn eq_ignore_ascii_case_simple(&self, other: &[u8]) -> bool { // FIXME(const-hack): This implementation can be reverted when // `core::iter::zip` is allowed in const. The original implementation: // self.len() == other.len() && iter::zip(self, other).all(|(a, b)| a.eq_ignore_ascii_case(b)) @@ -78,6 +90,37 @@ impl [u8] { true } + /// Optimized version of `eq_ignore_ascii_case` which processes chunks at a + /// time. + /// + /// Platforms that have SIMD instructions may benefit from this + /// implementation over `eq_ignore_ascii_case_simple`. + #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] + #[inline] + const fn eq_ignore_ascii_case_chunks(&self, other: &[u8]) -> bool { + const N: usize = 16; + let (a, a_rem) = self.as_chunks::(); + let (b, b_rem) = other.as_chunks::(); + + let mut i = 0; + while i < a.len() && i < b.len() { + let mut equal_ascii = true; + let mut j = 0; + while j < N { + equal_ascii &= a[i][j].eq_ignore_ascii_case(&b[i][j]); + j += 1; + } + + if !equal_ascii { + return false; + } + + i += 1; + } + + a_rem.eq_ignore_ascii_case_simple(b_rem) + } + /// Converts this slice to its ASCII upper case equivalent in-place. /// /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', diff --git a/library/coretests/benches/ascii.rs b/library/coretests/benches/ascii.rs index 64bdc7fed118f..17a520922bfa7 100644 --- a/library/coretests/benches/ascii.rs +++ b/library/coretests/benches/ascii.rs @@ -1,3 +1,4 @@ +mod eq_ignore_ascii_case; mod is_ascii; // Lower-case ASCII 'a' is the first byte that has its highest bit set diff --git a/library/coretests/benches/ascii/eq_ignore_ascii_case.rs b/library/coretests/benches/ascii/eq_ignore_ascii_case.rs new file mode 100644 index 0000000000000..a51acb1e84637 --- /dev/null +++ b/library/coretests/benches/ascii/eq_ignore_ascii_case.rs @@ -0,0 +1,56 @@ +use test::Bencher; + +#[bench] +fn bench_str_under_8_bytes_eq(b: &mut Bencher) { + let s = "foo"; + let other = "FOo"; + b.iter(|| { + assert!(s.eq_ignore_ascii_case(other)); + }) +} + +#[bench] +fn bench_str_of_8_bytes_eq(b: &mut Bencher) { + let s = "foobar78"; + let other = "FOObAr78"; + b.iter(|| { + assert!(s.eq_ignore_ascii_case(other)); + }) +} + +#[bench] +fn bench_str_17_bytes_eq(b: &mut Bencher) { + let s = "performance-criti"; + let other = "performANce-cRIti"; + b.iter(|| { + assert!(s.eq_ignore_ascii_case(other)); + }) +} + +#[bench] +fn bench_str_31_bytes_eq(b: &mut Bencher) { + let s = "foobarbazquux02foobarbazquux025"; + let other = "fooBARbazQuuX02fooBARbazQuuX025"; + b.iter(|| { + assert!(s.eq_ignore_ascii_case(other)); + }) +} + +#[bench] +fn bench_long_str_eq(b: &mut Bencher) { + let s = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor \ + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud \ + exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute \ + irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \ + pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui \ + officia deserunt mollit anim id est laborum."; + let other = "Lorem ipsum dolor sit amet, CONSECTETUR adipisicing elit, sed do eiusmod tempor \ + incididunt ut labore et dolore MAGNA aliqua. Ut enim ad MINIM veniam, quis nostrud \ + exercitation ullamco LABORIS nisi ut aliquip ex ea commodo consequat. Duis aute \ + irure dolor in reprehenderit in voluptate velit esse cillum DOLORE eu fugiat nulla \ + pariatur. Excepteur sint occaecat CUPIDATAT non proident, sunt in culpa qui \ + officia deserunt mollit anim id est laborum."; + b.iter(|| { + assert!(s.eq_ignore_ascii_case(other)); + }) +} diff --git a/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs b/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs new file mode 100644 index 0000000000000..b733f1812c92c --- /dev/null +++ b/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs @@ -0,0 +1,14 @@ +//@ compile-flags: -Copt-level=3 +//@ only-x86_64 +#![crate_type = "lib"] + +// Ensure that the optimized variant of the function gets auto-vectorized. +// CHECK-LABEL: @eq_ignore_ascii_case_autovectorized +#[no_mangle] +pub fn eq_ignore_ascii_case_autovectorized(s: &str, other: &str) -> bool { + // CHECK: load <16 x i8> + // CHECK: load <16 x i8> + // CHECK: bitcast <16 x i1> + // CHECK-NOT: panic + s.eq_ignore_ascii_case(other) +} From 7b88f48b530f6610581a0d023199c9563e635730 Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:35:02 -0400 Subject: [PATCH 2/4] Avoid scalar fallback in chunk remainder check Refactor the eq check into an inner function for reuse in tail checking Rather than fall back to the simple implementation for tail handling, load the last 16 bytes to take advantage of vectorization. This doesn't seem to negatively impact check time even when the remainder count is low. --- library/core/src/slice/ascii.rs | 35 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 1f9ca4bc66987..3f3f5a8c441dd 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -90,8 +90,8 @@ impl [u8] { true } - /// Optimized version of `eq_ignore_ascii_case` which processes chunks at a - /// time. + /// Optimized version of `eq_ignore_ascii_case` for byte lengths of at least + /// 16 bytes, which processes chunks at a time. /// /// Platforms that have SIMD instructions may benefit from this /// implementation over `eq_ignore_ascii_case_simple`. @@ -99,26 +99,41 @@ impl [u8] { #[inline] const fn eq_ignore_ascii_case_chunks(&self, other: &[u8]) -> bool { const N: usize = 16; - let (a, a_rem) = self.as_chunks::(); - let (b, b_rem) = other.as_chunks::(); + let (self_chunks, self_rem) = self.as_chunks::(); + let (other_chunks, _) = other.as_chunks::(); - let mut i = 0; - while i < a.len() && i < b.len() { + // Branchless check to encourage auto-vectorization + const fn eq_ignore_ascii_inner(lhs: &[u8; N], rhs: &[u8; N]) -> bool { let mut equal_ascii = true; let mut j = 0; while j < N { - equal_ascii &= a[i][j].eq_ignore_ascii_case(&b[i][j]); + equal_ascii &= lhs[j].eq_ignore_ascii_case(&rhs[j]); j += 1; } - if !equal_ascii { + equal_ascii + } + + // Process the chunks, returning early if an inequality is found + let mut i = 0; + while i < self_chunks.len() && i < other_chunks.len() { + if !eq_ignore_ascii_inner(&self_chunks[i], &other_chunks[i]) { return false; } - i += 1; } - a_rem.eq_ignore_ascii_case_simple(b_rem) + // If there are remaining tails, load the last N bytes in the slices to + // avoid falling back to per-byte checking. + if !self_rem.is_empty() { + if let (Some(a_rem), Some(b_rem)) = (self.last_chunk::(), other.last_chunk::()) { + if !eq_ignore_ascii_inner(a_rem, b_rem) { + return false; + } + } + } + + true } /// Converts this slice to its ASCII upper case equivalent in-place. From a5ba24843d6e4ceda580e49344bef73baec14204 Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:04:32 -0400 Subject: [PATCH 3/4] Relocate bench and use str corpora for data Add #[inline(always)] to inner function and check not for filecheck test --- library/core/src/slice/ascii.rs | 1 + library/coretests/benches/ascii.rs | 1 - .../benches/ascii/eq_ignore_ascii_case.rs | 56 ------------------- library/coretests/benches/str.rs | 1 + .../benches/str/eq_ignore_ascii_case.rs | 45 +++++++++++++++ .../lib-optimizations/eq_ignore_ascii_case.rs | 4 +- 6 files changed, 50 insertions(+), 58 deletions(-) delete mode 100644 library/coretests/benches/ascii/eq_ignore_ascii_case.rs create mode 100644 library/coretests/benches/str/eq_ignore_ascii_case.rs diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 3f3f5a8c441dd..8b713947cd9c7 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -103,6 +103,7 @@ impl [u8] { let (other_chunks, _) = other.as_chunks::(); // Branchless check to encourage auto-vectorization + #[inline(always)] const fn eq_ignore_ascii_inner(lhs: &[u8; N], rhs: &[u8; N]) -> bool { let mut equal_ascii = true; let mut j = 0; diff --git a/library/coretests/benches/ascii.rs b/library/coretests/benches/ascii.rs index 17a520922bfa7..64bdc7fed118f 100644 --- a/library/coretests/benches/ascii.rs +++ b/library/coretests/benches/ascii.rs @@ -1,4 +1,3 @@ -mod eq_ignore_ascii_case; mod is_ascii; // Lower-case ASCII 'a' is the first byte that has its highest bit set diff --git a/library/coretests/benches/ascii/eq_ignore_ascii_case.rs b/library/coretests/benches/ascii/eq_ignore_ascii_case.rs deleted file mode 100644 index a51acb1e84637..0000000000000 --- a/library/coretests/benches/ascii/eq_ignore_ascii_case.rs +++ /dev/null @@ -1,56 +0,0 @@ -use test::Bencher; - -#[bench] -fn bench_str_under_8_bytes_eq(b: &mut Bencher) { - let s = "foo"; - let other = "FOo"; - b.iter(|| { - assert!(s.eq_ignore_ascii_case(other)); - }) -} - -#[bench] -fn bench_str_of_8_bytes_eq(b: &mut Bencher) { - let s = "foobar78"; - let other = "FOObAr78"; - b.iter(|| { - assert!(s.eq_ignore_ascii_case(other)); - }) -} - -#[bench] -fn bench_str_17_bytes_eq(b: &mut Bencher) { - let s = "performance-criti"; - let other = "performANce-cRIti"; - b.iter(|| { - assert!(s.eq_ignore_ascii_case(other)); - }) -} - -#[bench] -fn bench_str_31_bytes_eq(b: &mut Bencher) { - let s = "foobarbazquux02foobarbazquux025"; - let other = "fooBARbazQuuX02fooBARbazQuuX025"; - b.iter(|| { - assert!(s.eq_ignore_ascii_case(other)); - }) -} - -#[bench] -fn bench_long_str_eq(b: &mut Bencher) { - let s = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor \ - incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud \ - exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute \ - irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \ - pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui \ - officia deserunt mollit anim id est laborum."; - let other = "Lorem ipsum dolor sit amet, CONSECTETUR adipisicing elit, sed do eiusmod tempor \ - incididunt ut labore et dolore MAGNA aliqua. Ut enim ad MINIM veniam, quis nostrud \ - exercitation ullamco LABORIS nisi ut aliquip ex ea commodo consequat. Duis aute \ - irure dolor in reprehenderit in voluptate velit esse cillum DOLORE eu fugiat nulla \ - pariatur. Excepteur sint occaecat CUPIDATAT non proident, sunt in culpa qui \ - officia deserunt mollit anim id est laborum."; - b.iter(|| { - assert!(s.eq_ignore_ascii_case(other)); - }) -} diff --git a/library/coretests/benches/str.rs b/library/coretests/benches/str.rs index 2f7d9d56a70b7..bf45a8f0a79bc 100644 --- a/library/coretests/benches/str.rs +++ b/library/coretests/benches/str.rs @@ -5,6 +5,7 @@ use test::{Bencher, black_box}; mod char_count; mod corpora; mod debug; +mod eq_ignore_ascii_case; mod iter; #[bench] diff --git a/library/coretests/benches/str/eq_ignore_ascii_case.rs b/library/coretests/benches/str/eq_ignore_ascii_case.rs new file mode 100644 index 0000000000000..29129b933bc46 --- /dev/null +++ b/library/coretests/benches/str/eq_ignore_ascii_case.rs @@ -0,0 +1,45 @@ +use test::{Bencher, black_box}; + +use super::corpora::*; + +#[bench] +fn bench_str_under_8_bytes_eq(b: &mut Bencher) { + let s = black_box("foo"); + let other = black_box("foo"); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} + +#[bench] +fn bench_str_of_8_bytes_eq(b: &mut Bencher) { + let s = black_box(en::TINY); + let other = black_box(en::TINY); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} + +#[bench] +fn bench_str_17_bytes_eq(b: &mut Bencher) { + let s = black_box(&en::SMALL[..17]); + let other = black_box(&en::SMALL[..17]); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} + +#[bench] +fn bench_str_31_bytes_eq(b: &mut Bencher) { + let s = black_box(&en::SMALL[..31]); + let other = black_box(&en::SMALL[..31]); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} + +#[bench] +fn bench_medium_str_eq(b: &mut Bencher) { + let s = black_box(en::MEDIUM); + let other = black_box(en::MEDIUM); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} + +#[bench] +fn bench_large_str_eq(b: &mut Bencher) { + let s = black_box(en::LARGE); + let other = black_box(en::LARGE); + b.iter(|| assert!(s.eq_ignore_ascii_case(other))) +} diff --git a/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs b/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs index b733f1812c92c..d4ac5d64585db 100644 --- a/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs +++ b/tests/codegen-llvm/lib-optimizations/eq_ignore_ascii_case.rs @@ -2,13 +2,15 @@ //@ only-x86_64 #![crate_type = "lib"] -// Ensure that the optimized variant of the function gets auto-vectorized. +// Ensure that the optimized variant of the function gets auto-vectorized and +// that the inner helper function is inlined. // CHECK-LABEL: @eq_ignore_ascii_case_autovectorized #[no_mangle] pub fn eq_ignore_ascii_case_autovectorized(s: &str, other: &str) -> bool { // CHECK: load <16 x i8> // CHECK: load <16 x i8> // CHECK: bitcast <16 x i1> + // CHECK-NOT: call {{.*}}eq_ignore_ascii_inner // CHECK-NOT: panic s.eq_ignore_ascii_case(other) } From 31bf836c74e5b53e654bed70158cabfdfdca8acf Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Tue, 14 Oct 2025 02:23:44 -0400 Subject: [PATCH 4/4] Use a const generic parameter for the function chunk size Add comments for the optimized function invariants to the caller Add const-hack fixme for using while-loops Document the invariant for the `_chunks` function Add a debug assert for the tail handling invariant --- library/core/src/slice/ascii.rs | 35 +++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 8b713947cd9c7..0f3cf243f1647 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -61,8 +61,15 @@ impl [u8] { } #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] - if self.len() >= 16 { - return self.eq_ignore_ascii_case_chunks(other); + { + const CHUNK_SIZE: usize = 16; + // The following function has two invariants: + // 1. The slice lengths must be equal, which we checked above. + // 2. The slice lengths must greater than or equal to N, which this + // if-statement is checking. + if self.len() >= CHUNK_SIZE { + return self.eq_ignore_ascii_case_chunks::(other); + } } self.eq_ignore_ascii_case_simple(other) @@ -90,24 +97,30 @@ impl [u8] { true } - /// Optimized version of `eq_ignore_ascii_case` for byte lengths of at least - /// 16 bytes, which processes chunks at a time. + /// Optimized version of `eq_ignore_ascii_case` to process chunks at a time. /// /// Platforms that have SIMD instructions may benefit from this /// implementation over `eq_ignore_ascii_case_simple`. + /// + /// # Invariants + /// + /// The caller must guarantee that the slices are equal in length, and the + /// slice lengths are greater than or equal to `N` bytes. #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] #[inline] - const fn eq_ignore_ascii_case_chunks(&self, other: &[u8]) -> bool { - const N: usize = 16; + const fn eq_ignore_ascii_case_chunks(&self, other: &[u8]) -> bool { + // FIXME(const-hack): The while-loops that follow should be replaced by + // for-loops when available in const. + let (self_chunks, self_rem) = self.as_chunks::(); let (other_chunks, _) = other.as_chunks::(); // Branchless check to encourage auto-vectorization #[inline(always)] - const fn eq_ignore_ascii_inner(lhs: &[u8; N], rhs: &[u8; N]) -> bool { + const fn eq_ignore_ascii_inner(lhs: &[u8; L], rhs: &[u8; L]) -> bool { let mut equal_ascii = true; let mut j = 0; - while j < N { + while j < L { equal_ascii &= lhs[j].eq_ignore_ascii_case(&rhs[j]); j += 1; } @@ -124,6 +137,12 @@ impl [u8] { i += 1; } + // Check the length invariant which is necessary for the tail-handling + // logic to be correct. This should have been upheld by the caller, + // otherwise lengths less than N will compare as true without any + // checking. + debug_assert!(self.len() >= N); + // If there are remaining tails, load the last N bytes in the slices to // avoid falling back to per-byte checking. if !self_rem.is_empty() {