From da5490b492ee6cc5f677dc05ca6a0a044df526a3 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 5 Mar 2023 21:05:11 -0500
Subject: [PATCH] syntax: optimize case folding

It turns out that it's not too hard to get HIR translation to run pretty
slowly with some carefully crafted regexes. For example:

    (?i:[[:^space:]------------------------------------------------------------------------])

This regex is actually a [:^space:] class that has an empty class
subtracted from it 36 times. For each subtraction, the resulting
class--despite it not having changed---goes through Unicode case folding
again. This in turn slows things way down.

We introduce a fairly basic optimization that basically keeps track of
whether an interval set has been folded or not. The idea was taken from
PR #893, but was tweaked slightly. The magic of how it works is that if
two interval sets have already been folded, then they retain that
property after any of the set operations: negation, union, difference,
intersection and symmetric difference. So case folding should generally
only need to be run once for each "base" class, but then not again as
operations are performed.

Some benchmarks were added to rebar (which isn't public yet at time of
writing).

Closes #893
---
 regex-syntax/src/hir/interval.rs  | 67 ++++++++++++++++++++++++++++++-
 regex-syntax/src/hir/translate.rs |  2 +-
 2 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs
index fbe772ea4..4efcf1e4b 100644
--- a/regex-syntax/src/hir/interval.rs
+++ b/regex-syntax/src/hir/interval.rs
@@ -30,9 +30,38 @@ use crate::unicode;
 //
 // Tests on this are relegated to the public API of HIR in src/hir.rs.
 
-#[derive(Clone, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug)]
 pub struct IntervalSet<I> {
+    /// A sorted set of non-overlapping ranges.
     ranges: Vec<I>,
+    /// While not required at all for correctness, we keep track of whether an
+    /// interval set has been case folded or not. This helps us avoid doing
+    /// redundant work if, for example, a set has already been cased folded.
+    /// And note that whether a set is folded or not is preserved through
+    /// all of the pairwise set operations. That is, if both interval sets
+    /// have been case folded, then any of difference, union, intersection or
+    /// symmetric difference all produce a case folded set.
+    ///
+    /// Note that when this is true, it *must* be the case that the set is case
+    /// folded. But when it's false, the set *may* be case folded. In other
+    /// words, we only set this to true when we know it to be case, but we're
+    /// okay with it being false if it would otherwise be costly to determine
+    /// whether it should be true. This means code cannot assume that a false
+    /// value necessarily indicates that the set is not case folded.
+    ///
+    /// Bottom line: this is a performance optimization.
+    folded: bool,
+}
+
+impl<I: Interval> Eq for IntervalSet<I> {}
+
+// We implement PartialEq manually so that we don't consider the set's internal
+// 'folded' property to be part of its identity. The 'folded' property is
+// strictly an optimization.
+impl<I: Interval> PartialEq for IntervalSet<I> {
+    fn eq(&self, other: &IntervalSet<I>) -> bool {
+        self.ranges.eq(&other.ranges)
+    }
 }
 
 impl<I: Interval> IntervalSet<I> {
@@ -42,7 +71,10 @@ impl<I: Interval> IntervalSet<I> {
     /// The given ranges do not need to be in any specific order, and ranges
     /// may overlap.
     pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
-        let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
+        let ranges: Vec<I> = intervals.into_iter().collect();
+        // An empty set is case folded.
+        let folded = ranges.is_empty();
+        let mut set = IntervalSet { ranges, folded };
         set.canonicalize();
         set
     }
@@ -53,6 +85,10 @@ impl<I: Interval> IntervalSet<I> {
         // it preserves canonicalization.
         self.ranges.push(interval);
         self.canonicalize();
+        // We don't know whether the new interval added here is considered
+        // case folded, so we conservatively assume that the entire set is
+        // no longer case folded if it was previously.
+        self.folded = false;
     }
 
     /// Return an iterator over all intervals in this set.
@@ -77,6 +113,9 @@ impl<I: Interval> IntervalSet<I> {
     /// This returns an error if the necessary case mapping data is not
     /// available.
     pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
+        if self.folded {
+            return Ok(());
+        }
         let len = self.ranges.len();
         for i in 0..len {
             let range = self.ranges[i];
@@ -86,14 +125,19 @@ impl<I: Interval> IntervalSet<I> {
             }
         }
         self.canonicalize();
+        self.folded = true;
         Ok(())
     }
 
     /// Union this set with the given set, in place.
     pub fn union(&mut self, other: &IntervalSet<I>) {
+        if other.ranges.is_empty() {
+            return;
+        }
         // This could almost certainly be done more efficiently.
         self.ranges.extend(&other.ranges);
         self.canonicalize();
+        self.folded = self.folded && other.folded;
     }
 
     /// Intersect this set with the given set, in place.
@@ -103,6 +147,8 @@ impl<I: Interval> IntervalSet<I> {
         }
         if other.ranges.is_empty() {
             self.ranges.clear();
+            // An empty set is case folded.
+            self.folded = true;
             return;
         }
 
@@ -132,6 +178,7 @@ impl<I: Interval> IntervalSet<I> {
             }
         }
         self.ranges.drain(..drain_end);
+        self.folded = self.folded && other.folded;
     }
 
     /// Subtract the given set from this set, in place.
@@ -224,6 +271,7 @@ impl<I: Interval> IntervalSet<I> {
             a += 1;
         }
         self.ranges.drain(..drain_end);
+        self.folded = self.folded && other.folded;
     }
 
     /// Compute the symmetric difference of the two sets, in place.
@@ -249,6 +297,8 @@ impl<I: Interval> IntervalSet<I> {
         if self.ranges.is_empty() {
             let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
             self.ranges.push(I::create(min, max));
+            // The set containing everything must case folded.
+            self.folded = true;
             return;
         }
 
@@ -274,6 +324,19 @@ impl<I: Interval> IntervalSet<I> {
             self.ranges.push(I::create(lower, I::Bound::max_value()));
         }
         self.ranges.drain(..drain_end);
+        // We don't need to update whether this set is folded or not, because
+        // it is conservatively preserved through negation. Namely, if a set
+        // is not folded, then it is possible that its negation is folded, for
+        // example, [^☃]. But we're fine with assuming that the set is not
+        // folded in that case. (`folded` permits false negatives but not false
+        // positives.)
+        //
+        // But what about when a set is folded, is its negation also
+        // necessarily folded? Yes. Because if a set is folded, then for every
+        // character in the set, it necessarily included its equivalence class
+        // of case folded characters. Negating it in turn means that all
+        // equivalence classes in the set are negated, and any equivalence
+        // class that was previously not in the set is now entirely in the set.
     }
 
     /// Converts this set into a canonical ordering.
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 24774ddea..204653543 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -1083,7 +1083,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         class: &mut hir::ClassUnicode,
     ) -> Result<()> {
         // Note that we must apply case folding before negation!
-        // Consider `(?i)[^x]`. If we applied negation field, then
+        // Consider `(?i)[^x]`. If we applied negation first, then
         // the result would be the character class that matched any
         // Unicode scalar value.
         if self.flags().case_insensitive() {