Skip to content

Commit

Permalink
syntax: simplify hir::Repetition
Browse files Browse the repository at this point in the history
This greatly simplifies how repetitions are represented in the HIR from
a sprawling set of variants down to just a simple `(u32, Option<u32>)`.
This is much simpler and still permits us to specialize all of the cases
we did before if necessary.

This also simplifies some of the HIR printer's output. e.g., 'a{1}' is
just 'a'.
  • Loading branch information
BurntSushi committed Apr 17, 2023
1 parent 91121d7 commit bcb57c7
Show file tree
Hide file tree
Showing 5 changed files with 188 additions and 147 deletions.
48 changes: 18 additions & 30 deletions regex-syntax/src/hir/literal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -602,26 +602,19 @@ fn prefixes(expr: &Hir, lits: &mut Literals) {
HirKind::Group(hir::Group { ref hir, .. }) => {
prefixes(&**hir, lits);
}
HirKind::Repetition(ref x) => match x.kind {
hir::RepetitionKind::ZeroOrOne => {
HirKind::Repetition(ref x) => match (x.min, x.max) {
(0, Some(1)) => {
repeat_zero_or_one_literals(&x.hir, lits, prefixes);
}
hir::RepetitionKind::ZeroOrMore => {
(0, None) => {
repeat_zero_or_more_literals(&x.hir, lits, prefixes);
}
hir::RepetitionKind::OneOrMore => {
(1, None) => {
repeat_one_or_more_literals(&x.hir, lits, prefixes);
}
hir::RepetitionKind::Range(ref rng) => {
let (min, max) = match *rng {
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
hir::RepetitionRange::AtLeast(m) => (m, None),
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
};
repeat_range_literals(
&x.hir, min, max, x.greedy, lits, prefixes,
)
}
(min, max) => repeat_range_literals(
&x.hir, min, max, x.greedy, lits, prefixes,
),
},
HirKind::Concat(ref es) if es.is_empty() => {}
HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
Expand Down Expand Up @@ -678,26 +671,19 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
HirKind::Group(hir::Group { ref hir, .. }) => {
suffixes(&**hir, lits);
}
HirKind::Repetition(ref x) => match x.kind {
hir::RepetitionKind::ZeroOrOne => {
HirKind::Repetition(ref x) => match (x.min, x.max) {
(0, Some(1)) => {
repeat_zero_or_one_literals(&x.hir, lits, suffixes);
}
hir::RepetitionKind::ZeroOrMore => {
(0, None) => {
repeat_zero_or_more_literals(&x.hir, lits, suffixes);
}
hir::RepetitionKind::OneOrMore => {
(1, None) => {
repeat_one_or_more_literals(&x.hir, lits, suffixes);
}
hir::RepetitionKind::Range(ref rng) => {
let (min, max) = match *rng {
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
hir::RepetitionRange::AtLeast(m) => (m, None),
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
};
repeat_range_literals(
&x.hir, min, max, x.greedy, lits, suffixes,
)
}
(min, max) => repeat_range_literals(
&x.hir, min, max, x.greedy, lits, suffixes,
),
},
HirKind::Concat(ref es) if es.is_empty() => {}
HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
Expand Down Expand Up @@ -736,7 +722,8 @@ fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
) {
f(
&Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrMore,
min: 0,
max: None,
// FIXME: Our literal extraction doesn't care about greediness.
// Which is partially why we're treating 'e?' as 'e*'. Namely,
// 'ab??' yields [Complete(ab), Complete(a)], but it should yield
Expand Down Expand Up @@ -794,7 +781,8 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
// just treat it as `e*`.
f(
&Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrMore,
min: 0,
max: None,
greedy,
hir: Box::new(e.clone()),
}),
Expand Down
53 changes: 18 additions & 35 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1362,8 +1362,18 @@ pub enum GroupKind {
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Repetition {
/// The kind of this repetition operator.
pub kind: RepetitionKind,
/// The minimum range of the repetition.
///
/// Note that special cases like `?`, `+` and `*` all get translated into
/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
pub min: u32,
/// The maximum range of the repetition.
///
/// Note that when `max` is `None`, `min` acts as a lower bound but where
/// there is no upper bound. For something like `x{5}` where the min and
/// max are equivalent, `min` will be set to `5` and `max` will be set to
/// `Some(5)`.
pub max: Option<u32>,
/// Whether this repetition operator is greedy or not. A greedy operator
/// will match as much as it can. A non-greedy operator will match as
/// little as it can.
Expand All @@ -1385,42 +1395,14 @@ impl Repetition {
/// string and one or more occurrences of something that matches the
/// empty string will always match the empty string. In order to get the
/// inductive definition, see the corresponding method on [`Hir`].
///
/// This returns true in precisely the cases that [`Repetition::min`]
/// is equal to `0`.
pub fn is_match_empty(&self) -> bool {
match self.kind {
RepetitionKind::ZeroOrOne => true,
RepetitionKind::ZeroOrMore => true,
RepetitionKind::OneOrMore => false,
RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
}
self.min == 0
}
}

/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionKind {
/// Matches a sub-expression zero or one times.
ZeroOrOne,
/// Matches a sub-expression zero or more times.
ZeroOrMore,
/// Matches a sub-expression one or more times.
OneOrMore,
/// Matches a sub-expression within a bounded range of times.
Range(RepetitionRange),
}

/// The kind of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
/// Matches a sub-expression exactly this many times.
Exactly(u32),
/// Matches a sub-expression at least this many times.
AtLeast(u32),
/// Matches a sub-expression at least `m` times and at most `n` times.
Bounded(u32, u32),
}

/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
/// space but heap space proportional to the depth of the total `Hir`.
impl Drop for Hir {
Expand Down Expand Up @@ -2257,7 +2239,8 @@ mod tests {
hir: Box::new(expr),
});
expr = Hir::repetition(Repetition {
kind: RepetitionKind::ZeroOrOne,
min: 0,
max: Some(1),
greedy: true,
hir: Box::new(expr),
});
Expand Down
135 changes: 113 additions & 22 deletions regex-syntax/src/hir/print.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,27 +176,31 @@ impl<W: fmt::Write> Visitor for Writer<W> {
| HirKind::Concat(_)
| HirKind::Alternation(_) => {}
HirKind::Repetition(ref x) => {
match x.kind {
hir::RepetitionKind::ZeroOrOne => {
match (x.min, x.max) {
(0, Some(1)) => {
self.wtr.write_str("?")?;
}
hir::RepetitionKind::ZeroOrMore => {
(0, None) => {
self.wtr.write_str("*")?;
}
hir::RepetitionKind::OneOrMore => {
(1, None) => {
self.wtr.write_str("+")?;
}
hir::RepetitionKind::Range(ref x) => match *x {
hir::RepetitionRange::Exactly(m) => {
write!(self.wtr, "{{{}}}", m)?;
}
hir::RepetitionRange::AtLeast(m) => {
write!(self.wtr, "{{{},}}", m)?;
}
hir::RepetitionRange::Bounded(m, n) => {
write!(self.wtr, "{{{},{}}}", m, n)?;
}
},
(1, Some(1)) => {
// 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
return Ok(());
}
(m, None) => {
write!(self.wtr, "{{{},}}", m)?;
}
(m, Some(n)) if m == n => {
write!(self.wtr, "{{{}}}", m)?;
// a{m} and a{m}? are always exactly equivalent.
return Ok(());
}
(m, Some(n)) => {
write!(self.wtr, "{{{},{}}}", m, n)?;
}
}
if !x.greedy {
self.wtr.write_str("?")?;
Expand Down Expand Up @@ -241,7 +245,10 @@ impl<W: fmt::Write> Writer<W> {

#[cfg(test)]
mod tests {
use alloc::string::String;
use alloc::{
boxed::Box,
string::{String, ToString},
};

use crate::ParserBuilder;

Expand Down Expand Up @@ -338,14 +345,17 @@ mod tests {
roundtrip("a+?", "a+?");
roundtrip("(?U)a+", "a+?");

roundtrip("a{1}", "a{1}");
roundtrip("a{1,}", "a{1,}");
roundtrip("a{1}", "a");
roundtrip("a{2}", "a{2}");
roundtrip("a{1,}", "a+");
roundtrip("a{1,5}", "a{1,5}");
roundtrip("a{1}?", "a{1}?");
roundtrip("a{1,}?", "a{1,}?");
roundtrip("a{1}?", "a");
roundtrip("a{2}?", "a{2}");
roundtrip("a{1,}?", "a+?");
roundtrip("a{1,5}?", "a{1,5}?");
roundtrip("(?U)a{1}", "a{1}?");
roundtrip("(?U)a{1,}", "a{1,}?");
roundtrip("(?U)a{1}", "a");
roundtrip("(?U)a{2}", "a{2}");
roundtrip("(?U)a{1,}", "a+?");
roundtrip("(?U)a{1,5}", "a{1,5}?");
}

Expand All @@ -371,4 +381,85 @@ mod tests {
roundtrip("a|b|c", "a|b|c");
roundtrip("foo|bar|quux", "foo|bar|quux");
}

// This is a regression test that stresses a peculiarity of how the HIR
// is both constructed and printed. Namely, it is legal for a repetition
// to directly contain a concatenation. This particular construct isn't
// really possible to build from the concrete syntax directly, since you'd
// be forced to put the concatenation into (at least) a non-capturing
// group. Concurrently, the printer doesn't consider this case and just
// kind of naively prints the child expression and tacks on the repetition
// operator.
//
// As a result, if you attached '+' to a 'concat(a, b)', the printer gives
// you 'ab+', but clearly it really should be '(?:ab)+'.
//
// This bug isn't easy to surface because most ways of building an HIR
// come directly from the concrete syntax, and as mentioned above, it just
// isn't possible to build this kind of HIR from the concrete syntax.
// Nevertheless, this is definitely a bug.
//
// See: https://github.com/rust-lang/regex/issues/731
#[test]
fn regression_repetition_concat() {
let expr = Hir::concat(alloc::vec![
Hir::literal(hir::Literal::Unicode('x')),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
hir: Box::new(Hir::concat(alloc::vec![
Hir::literal(hir::Literal::Unicode('a')),
Hir::literal(hir::Literal::Unicode('b')),
])),
}),
Hir::literal(hir::Literal::Unicode('y')),
]);
assert_eq!(r"x(?:ab)+y", expr.to_string());
}

// Just like regression_repetition_concat, but with the repetition using
// an alternation as a child expression instead.
//
// See: https://github.com/rust-lang/regex/issues/731
#[test]
fn regression_repetition_alternation() {
let expr = Hir::concat(alloc::vec![
Hir::literal(hir::Literal::Unicode('x')),
Hir::repetition(hir::Repetition {
min: 1,
max: None,
greedy: true,
hir: Box::new(Hir::alternation(alloc::vec![
Hir::literal(hir::Literal::Unicode('a')),
Hir::literal(hir::Literal::Unicode('b')),
])),
}),
Hir::literal(hir::Literal::Unicode('y')),
]);
assert_eq!(r"x(?:a|b)+y", expr.to_string());
}

// This regression test is very similar in flavor to
// regression_repetition_concat in that the root of the issue lies in a
// peculiarity of how the HIR is represented and how the printer writes it
// out. Like the other regression, this one is also rooted in the fact that
// you can't produce the peculiar HIR from the concrete syntax. Namely, you
// just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
// be in (at least) a non-capturing group. Why? Because the '|' has very
// low precedence (lower that concatenation), and so something like 'ab|c'
// is actually 'alt(ab, c)'.
//
// See: https://github.com/rust-lang/regex/issues/516
#[test]
fn regression_alternation_concat() {
let expr = Hir::concat(alloc::vec![
Hir::literal(hir::Literal::Unicode('a')),
Hir::alternation(alloc::vec![
Hir::literal(hir::Literal::Unicode('b')),
Hir::literal(hir::Literal::Unicode('c')),
]),
]);
assert_eq!(r"a(?:b|c)", expr.to_string());
}
}
Loading

0 comments on commit bcb57c7

Please sign in to comment.