diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 4efcf1e4b..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -131,7 +131,7 @@ impl<I: Interval> IntervalSet<I> { /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet<I>) { - if other.ranges.is_empty() { + if other.ranges.is_empty() || self.ranges == other.ranges { return; } // This could almost certainly be done more efficiently. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6f3b08fe3..ce0ed4f4f 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1232,23 +1232,13 @@ impl Interval for ClassUnicodeRange { &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError> { - if !unicode::contains_simple_case_mapping(self.start, self.end)? { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { return Ok(()); } let (start, end) = (u32::from(self.start), u32::from(self.end)); - let mut next_simple_cp = None; for cp in (start..=end).filter_map(char::from_u32) { - if next_simple_cp.map_or(false, |next| cp < next) { - continue; - } - let it = match unicode::simple_fold(cp)? { - Ok(it) => it, - Err(next) => { - next_simple_cp = next; - continue; - } - }; - for cp_folded in it { + for &cp_folded in folder.mapping(cp) { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 3df9d1f8d..b22861fc7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -824,8 +824,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. 
- let map = - unicode::contains_simple_case_mapping(c, c).map_err(|_| { + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index a87fa23c6..91bd4b120 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -64,75 +64,122 @@ impl core::fmt::Display for UnicodeWordError { } } -/// Return an iterator over the equivalence class of simple case mappings -/// for the given codepoint. The equivalence class does not include the -/// given codepoint. -/// -/// If the equivalence class is empty, then this returns the next scalar -/// value that has a non-empty equivalence class, if it exists. If no such -/// scalar value exists, then `None` is returned. The point of this behavior -/// is to permit callers to avoid calling `simple_fold` more than they need -/// to, since there is some cost to fetching the equivalence class. -/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn simple_fold( - c: char, -) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError> { - #[cfg(not(feature = "unicode-case"))] - fn imp( - _: char, - ) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError> - { - use core::option::IntoIter; - Err::<Result<IntoIter<char>, _>, _>(CaseFoldError(())) - } +/// A state oriented traverser of the simple case folding table. +/// +/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will +/// return an error if the underlying case folding table is unavailable. +/// +/// After construction, it is expected that callers will use +/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly +/// increasing order. For example, calling it on `b` and then on `a` is illegal +/// and will result in a panic. 
+/// +/// The main idea of this type is that it tries hard to make mapping lookups +/// fast by exploiting the structure of the underlying table, and the ordering +/// assumption enables this. +#[derive(Debug)] +pub struct SimpleCaseFolder { + /// The simple case fold table. It's a sorted association list, where the + /// keys are Unicode scalar values and the values are the corresponding + /// equivalence class (not including the key) of the "simple" case folded + /// Unicode scalar values. + table: &'static [(char, &'static [char])], + /// The last codepoint that was used for a lookup. + last: Option<char>, + /// The index to the entry in `table` corresponding to the smallest key `k` + /// such that `k > k0`, where `k0` is the most recent key lookup. Note that + /// in particular, `k0` may not be in the table! + next: usize, +} - #[cfg(feature = "unicode-case")] - fn imp( - c: char, - ) -> Result<Result<impl Iterator<Item = char>, Option<char>>, CaseFoldError> - { - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - - Ok(CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - })) +impl SimpleCaseFolder { + /// Create a new simple case folder, returning an error if the underlying + /// case folding table is unavailable. + pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> { + #[cfg(not(feature = "unicode-case"))] + { + Err(CaseFoldError(())) + } + #[cfg(feature = "unicode-case")] + { + Ok(SimpleCaseFolder { + table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, + last: None, + next: 0, + }) + } } - imp(c) -} - -/// Returns true if and only if the given (inclusive) range contains at least -/// one Unicode scalar value that has a non-empty non-trivial simple case -/// mapping. -/// -/// This function panics if `end < start`. -/// -/// This returns an error if the Unicode case folding tables are not available. 
-pub fn contains_simple_case_mapping( - start: char, - end: char, -) -> Result<bool, CaseFoldError> { - #[cfg(not(feature = "unicode-case"))] - fn imp(_: char, _: char) -> Result<bool, CaseFoldError> { - Err(CaseFoldError(())) + /// Return the equivalence class of case folded codepoints for the given + /// codepoint. The equivalence class returned never includes the codepoint + /// given. If the given codepoint has no case folded codepoints (i.e., + /// no entry in the underlying case folding table), then this returns an + /// empty slice. + /// + /// # Panics + /// + /// This panics when called with a `c` that is less than or equal to the + /// previous call. In other words, callers need to use this method with + /// strictly increasing values of `c`. + pub fn mapping(&mut self, c: char) -> &'static [char] { + if let Some(last) = self.last { + assert!( + last < c, + "got codepoint U+{:X} which occurs before \ + last codepoint U+{:X}", + u32::from(c), + u32::from(last), + ); + } + self.last = Some(c); + if self.next >= self.table.len() { + return &[]; + } + let (k, v) = self.table[self.next]; + if k == c { + self.next += 1; + return v; + } + match self.get(c) { + Err(i) => { + self.next = i; + &[] + } + Ok(i) => { + // Since we require lookups to proceed + // in order, anything we find should be + // after whatever we thought might be + // next. Otherwise, the caller is either + // going out of order or we would have + // found our next key at 'self.next'. + assert!(i > self.next); + self.next = i + 1; + self.table[i].1 + } + } } - #[cfg(feature = "unicode-case")] - fn imp(start: char, end: char) -> Result<bool, CaseFoldError> { + /// Returns true if and only if the given range overlaps with any region + /// of the underlying case folding table. That is, when true, there exists + /// at least one codepoint in the inclusive range `[start, end]` that has + /// a non-trivial equivalence class of case folded codepoints. 
Conversely, + /// when this returns false, all codepoints in the range `[start, end]` + /// correspond to the trivial equivalence class of case folded codepoints, + /// i.e., itself. + /// + /// This is useful to call before iterating over the codepoints in the + /// range and looking up the mapping for each. If you know none of the + /// mappings will return anything, then you might be able to skip doing it + /// altogether. + /// + /// # Panics + /// + /// This panics when `end < start`. + pub fn overlaps(&self, start: char, end: char) -> bool { use core::cmp::Ordering; - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - assert!(start <= end); - Ok(CASE_FOLDING_SIMPLE + self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal @@ -142,10 +189,15 @@ pub fn contains_simple_case_mapping( Ordering::Less } }) - .is_ok()) + .is_ok() } - imp(start, end) + /// Returns the index at which `c` occurs in the simple case fold table. If + /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < + /// c` and `table[i].0 > c`. + fn get(&self, c: char) -> Result<usize, usize> { + self.table.binary_search_by_key(&c, |&(c1, _)| c1) + } } /// A query for finding a character class defined by Unicode. 
This supports @@ -897,20 +949,12 @@ mod tests { #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator<Item = char> { - simple_fold(c).unwrap().unwrap() - } - - #[cfg(feature = "unicode-case")] - fn simple_fold_err(c: char) -> Option<char> { - match simple_fold(c).unwrap() { - Ok(_) => unreachable!("simple_fold returned Ok iterator"), - Err(next) => next, - } + SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { - contains_simple_case_mapping(start, end).unwrap() + SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] @@ -936,26 +980,10 @@ mod tests { assert_eq!(xs, alloc::vec!['a']); } - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold_err('?')); - assert_eq!(Some('A'), simple_fold_err('@')); - assert_eq!(Some('a'), simple_fold_err('[')); - assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); - } - - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_max() { - assert_eq!(None, simple_fold_err('\u{10FFFE}')); - assert_eq!(None, simple_fold_err('\u{10FFFF}')); - } - #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { - assert!(simple_fold('a').is_err()); + assert!(SimpleCaseFolder::new().is_err()); } #[test] @@ -974,12 +1002,6 @@ mod tests { assert!(!contains_case_map('☃', '☃')); } - #[test] - #[cfg(not(feature = "unicode-case"))] - fn range_contains_disabled() { - assert!(contains_simple_case_mapping('a', 'a').is_err()); - } - #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() {