From a5e10ba9b5860c55495a73b62b7f36ec45f7aa16 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 4 Mar 2023 13:40:51 -0500 Subject: [PATCH] api: add new 'Regex::static_captures_len' method This adds a new routine for computing the static number of capture groups that will appear in every match. If the number of groups is not invariant across all matches, then there is no static capture length. This is meant to help implement higher level convenience APIs for extracting capture groups, such as the one described in #824. We may wind up including such APIs in the regex crate itself, but this commit stops short of that. Instead, we just add this new property which should permit those APIs to exist outside of this crate for now. Closes #908 --- regex-syntax/src/hir/mod.rs | 80 +++++++++++++++++++++++++++++++ regex-syntax/src/hir/translate.rs | 35 ++++++++++++++ src/compile.rs | 2 + src/exec.rs | 6 +++ src/prog.rs | 4 ++ src/re_bytes.rs | 40 ++++++++++++++++ src/re_unicode.rs | 40 ++++++++++++++++ 7 files changed, 207 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index f9bd51345..350030992 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1833,6 +1833,7 @@ struct PropertiesI { look_set_suffix: LookSet, utf8: bool, captures_len: usize, + static_captures_len: Option, literal: bool, alternation_literal: bool, } @@ -1990,6 +1991,44 @@ impl Properties { self.0.captures_len } + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| h.properties().static_captures_len()) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len + } + /// Return true if and only if this HIR is a simple literal. This is /// only true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. @@ -2100,6 +2139,13 @@ impl Properties { } else { LookSet::full() }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_captures_len = + it.peek().and_then(|p| p.borrow().static_captures_len()); // The base case is an empty alternation, which matches nothing. // Note though that empty alternations aren't possible, because the // Hir::alternation smart constructor rewrites those as empty character @@ -2112,6 +2158,7 @@ impl Properties { look_set_suffix: fix, utf8: true, captures_len: 0, + static_captures_len, literal: false, alternation_literal: true, }; @@ -2125,6 +2172,9 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + if props.static_captures_len != p.static_captures_len() { + props.static_captures_len = None; + } props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); if !min_poisoned { @@ -2180,6 +2230,7 @@ impl Properties { // since it too can match the empty string. utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2196,6 +2247,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2212,6 +2264,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: class.is_utf8(), captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2241,6 +2294,7 @@ impl Properties { // property borderline useless. utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2268,6 +2322,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: p.is_utf8(), captures_len: p.captures_len(), + static_captures_len: p.static_captures_len(), literal: false, alternation_literal: false, }; @@ -2278,6 +2333,23 @@ impl Properties { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); } + // If the static captures len of the sub-expression is not known or is + // zero, then it automatically propagates to the repetition, regardless + // of the repetition. Otherwise, it might change, but only when the + // repetition can match 0 times. + if rep.min == 0 + && inner.static_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. + if rep.max == Some(0) { + inner.static_captures_len = Some(0); + } else { + inner.static_captures_len = None; + } + } Properties(Box::new(inner)) } @@ -2286,6 +2358,9 @@ impl Properties { let p = capture.sub.properties(); Properties(Box::new(PropertiesI { captures_len: p.captures_len().saturating_add(1), + static_captures_len: p + .static_captures_len() + .map(|len| len.saturating_add(1)), literal: false, alternation_literal: false, ..*p.0.clone() @@ -2306,6 +2381,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2316,6 +2392,10 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + props.static_captures_len = p + .static_captures_len() + .and_then(|len1| Some((len1, props.static_captures_len?))) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); props.literal = props.literal && p.is_literal(); props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 81ae9b898..766e19c07 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3204,6 +3204,41 @@ mod tests { assert_eq!(1, props(r"([a&&b])").captures_len()); } + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?Pfoo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); + } + #[test] fn analysis_is_all_assertions() { // Positive examples. diff --git a/src/compile.rs b/src/compile.rs index c29196a72..c6eebcc35 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -161,6 +161,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_captures_len(); self.compile_finish() } diff --git a/src/exec.rs b/src/exec.rs index f859fceba..a6814be58 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1358,6 +1358,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { diff --git a/src/prog.rs b/src/prog.rs index c211f71d8..100862cf1 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 07e9f98ac..b8f9738e8 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -667,6 +667,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 197510ea0..0e7fc70a4 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -725,6 +725,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations {