From 5c5017f34017edf5547ebfe5753d2178cc477e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 Feb 2023 12:59:59 +0100 Subject: [PATCH] syntax: support `(?<` syntax for named groups It turns out that both '(?P...)' and '(?...)' are rather common among regex engines. There are several that support just one or the other. Until this commit, the regex crate only supported the former, along with both RE2, RE2/J and Go's regexp package. There are also several regex engines that only supported the latter, such as Onigmo, Onuguruma, Java, Ruby, Boost, .NET and Javascript. To decrease friction, and because there is somewhat little cost to doing so, we elect to support both. It looks like perhaps RE2 and Go's regexp package will go the same route, but it isn't fully decided yet: https://github.com/golang/go/issues/58458 Closes #955, Closes #956 --- regex-syntax/src/ast/mod.rs | 13 ++-- regex-syntax/src/ast/parse.rs | 103 ++++++++++++++++++++---------- regex-syntax/src/ast/print.rs | 8 ++- regex-syntax/src/hir/translate.rs | 4 +- src/lib.rs | 1 + 5 files changed, 87 insertions(+), 42 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9e717f4a8..c75995b9c 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -1162,7 +1162,7 @@ impl Group { /// Returns true if and only if this group is capturing. pub fn is_capturing(&self) -> bool { match self.kind { - GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true, GroupKind::NonCapturing(_) => false, } } @@ -1173,7 +1173,7 @@ impl Group { pub fn capture_index(&self) -> Option { match self.kind { GroupKind::CaptureIndex(i) => Some(i), - GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::CaptureName { ref name, .. } => Some(name.index), GroupKind::NonCapturing(_) => None, } } @@ -1184,8 +1184,13 @@ impl Group { pub enum GroupKind { /// `(a)` CaptureIndex(u32), - /// `(?Pa)` - CaptureName(CaptureName), + /// `(?a)` or `(?Pa)` + CaptureName { + /// True if the `?P<` syntax is used and false if the `?<` syntax is used. + starts_with_p: bool, + /// The capture name. + name: CaptureName, + }, /// `(?:a)` and `(?i:a)` NonCapturing(Flags), } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 48a0507e2..1d6d4d046 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1202,12 +1202,16 @@ impl<'s, P: Borrow> ParserI<'s, P> { )); } let inner_span = self.span(); - if self.bump_if("?P<") { + let mut starts_with_p = true; + if self.bump_if("?P<") || { + starts_with_p = false; + self.bump_if("?<") + } { let capture_index = self.next_capture_index(open_span)?; - let cap = self.parse_capture_name(capture_index)?; + let name = self.parse_capture_name(capture_index)?; Ok(Either::Right(ast::Group { span: open_span, - kind: ast::GroupKind::CaptureName(cap), + kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::Empty(self.span())), })) } else if self.bump_if("?") { @@ -2800,11 +2804,14 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Group(ast::Group { span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span_range(pat, 9..12), - name: s("foo"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + } + }, ast: Box::new(lit_with('a', span_range(pat, 14..15))), }), ] @@ -3819,15 +3826,33 @@ bar #[test] fn parse_capture_name() { + assert_eq!( + parser("(?z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..7), + kind: ast::GroupKind::CaptureName { + starts_with_p: false, + name: ast::CaptureName { + span: span(3..4), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 5)), + })) + ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..8), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..5), - name: s("a"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + } + }, ast: Box::new(lit('z', 6)), })) ); @@ -3835,11 +3860,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("abc"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3848,11 +3876,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a_1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3861,11 +3892,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a.1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3874,11 +3908,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..11), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..8), - name: s("a[1]"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + } + }, ast: Box::new(lit('z', 9)), })) ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index e6c000d57..0922ea0e3 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -160,9 +160,10 @@ impl Writer { use crate::ast::GroupKind::*; match ast.kind { CaptureIndex(_) => self.wtr.write_str("("), - CaptureName(ref x) => { - self.wtr.write_str("(?P<")?; - self.wtr.write_str(&x.name)?; + CaptureName { ref name, starts_with_p } => { + let start = if starts_with_p { "(?P<" } else { "(?<" }; + self.wtr.write_str(start)?; + self.wtr.write_str(&name.name)?; self.wtr.write_str(">")?; Ok(()) } @@ -505,6 +506,7 @@ mod tests { fn print_group() { roundtrip("(?i:a)"); roundtrip("(?Pa)"); + roundtrip("(?a)"); roundtrip("(a)"); } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a787b79c4..b5bb41767 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -905,8 +905,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { let (index, name) = match group.kind { ast::GroupKind::CaptureIndex(index) => (index, None), - ast::GroupKind::CaptureName(ref cap) => { - (cap.index, Some(cap.name.clone().into_boxed_str())) + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) } // The HIR doesn't need to use non-capturing groups, since the way // in which the data type is defined handles this automatically. diff --git a/src/lib.rs b/src/lib.rs index 6b95739c5..1de347861 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -361,6 +361,7 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 (exp)          numbered capture group (indexed by opening parenthesis)
 (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?<name>exp)   named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)