diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 4102bfec5..495f489f1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1833,6 +1833,7 @@ struct PropertiesI { look_set_suffix: LookSet, utf8: bool, captures_len: usize, + static_captures_len: Option<usize>, literal: bool, alternation_literal: bool, } @@ -1990,6 +1991,44 @@ impl Properties { self.0.captures_len } + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| h.properties().static_captures_len()) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len + } + /// Return true if and only if this HIR is a simple literal. This is /// only true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. 
@@ -2100,6 +2139,13 @@ impl Properties { } else { LookSet::full() }; + // And also, an empty alternation (no alternates at all) yields `None` + // here since there is nothing to peek at, but we otherwise start with + // the number corresponding to the first alternate. If any subsequent + // alternate has a different number of static capture groups, then we + // overall have a variation and not a static number of groups. + let static_captures_len = + it.peek().and_then(|p| p.borrow().static_captures_len()); // The base case is an empty alternation, which matches nothing. // Note though that empty alternations aren't possible, because the // Hir::alternation smart constructor rewrites those as empty character @@ -2112,6 +2158,7 @@ impl Properties { look_set_suffix: fix, utf8: true, captures_len: 0, + static_captures_len, literal: false, alternation_literal: true, }; @@ -2125,6 +2172,9 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + if props.static_captures_len != p.static_captures_len() { + props.static_captures_len = None; + } props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); if !min_poisoned { @@ -2180,6 +2230,7 @@ impl Properties { // since it too can match the empty string. utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2196,6 +2247,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2212,6 +2264,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: class.is_utf8(), captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2241,6 +2294,7 @@ impl Properties { // property borderline useless. 
utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2268,6 +2322,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: p.is_utf8(), captures_len: p.captures_len(), + static_captures_len: p.static_captures_len(), literal: false, alternation_literal: false, }; @@ -2278,6 +2333,23 @@ impl Properties { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); } + // If the static captures len of the sub-expression is not known or is + // zero, then it automatically propagates to the repetition, regardless + // of the repetition. Otherwise, it might change, but only when the + // repetition can match 0 times. + if rep.min == 0 + && inner.static_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match zero times + // (`rep.min == 0`), then it's impossible to know how many captures + // will be in the resulting match. 
+ if rep.max == Some(0) { + inner.static_captures_len = Some(0); + } else { + inner.static_captures_len = None; + } + } Properties(Box::new(inner)) } @@ -2286,6 +2358,9 @@ impl Properties { let p = capture.sub.properties(); Properties(Box::new(PropertiesI { captures_len: p.captures_len().saturating_add(1), + static_captures_len: p + .static_captures_len() + .map(|len| len.saturating_add(1)), literal: false, alternation_literal: false, ..*p.0.clone() @@ -2306,6 +2381,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2316,6 +2392,10 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + props.static_captures_len = p + .static_captures_len() + .and_then(|len1| Some((len1, props.static_captures_len?))) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); props.literal = props.literal && p.is_literal(); props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 81ae9b898..766e19c07 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3204,6 +3204,41 @@ mod tests { assert_eq!(1, props(r"([a&&b])").captures_len()); } + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); 
+ assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); + } + #[test] fn analysis_is_all_assertions() { // Positive examples. diff --git a/src/compile.rs b/src/compile.rs index c29196a72..c6eebcc35 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -161,6 +161,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_captures_len(); self.compile_finish() } diff --git a/src/exec.rs b/src/exec.rs index e36e367ba..778a39d4c 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1361,6 +1361,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option<usize> { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { diff --git a/src/prog.rs b/src/prog.rs index c211f71d8..100862cf1 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec<Option<String>>, /// Pointers to all named capture groups into `captures`. 
pub capture_name_idx: Arc<HashMap<String, usize>>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option<usize>, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 07e9f98ac..b8f9738e8 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -667,6 +667,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 197510ea0..0e7fc70a4 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -725,6 +725,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations {