diff --git a/CHANGELOG.md b/CHANGELOG.md index abd906fc3b..ba2d0bd794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Bug fixes: * [BUG #523](https://github.com/rust-lang/regex/pull/523): Add note to documentation that spaces can be escaped in `x` mode. +* [BUG #524](https://github.com/rust-lang/regex/issue/524): + Add support for empty sub-expressions, including empty alternations. +* [BUG #659](https://github.com/rust-lang/regex/issue/659): + Fix match bug caused by an empty sub-expression miscompilation. 1.3.7 (2020-04-17) diff --git a/src/compile.rs b/src/compile.rs index 24184008c5..ad54040a76 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -15,6 +15,7 @@ use prog::{ use Error; type Result = result::Result; +type ResultOrEmpty = result::Result, Error>; #[derive(Debug)] struct Patch { @@ -132,7 +133,7 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; - let patch = self.c_capture(0, expr)?; + let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst()); if self.compiled.needs_dotstar() { self.fill(dotstar_patch.hole, patch.entry); } else { @@ -167,14 +168,16 @@ impl Compiler { for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { self.fill_to_next(prev_hole); let split = self.push_split_hole(); - let Patch { hole, entry } = self.c_capture(0, expr)?; + let Patch { hole, entry } = + self.c_capture(0, expr)?.unwrap_or(self.next_inst()); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); self.push_compiled(Inst::Match(i)); prev_hole = self.fill_split(split, Some(entry), None); } let i = exprs.len() - 1; - let Patch { hole, entry } = self.c_capture(0, &exprs[i])?; + let Patch { hole, entry } = + self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst()); self.fill(prev_hole, entry); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); @@ -242,13 +245,16 @@ impl Compiler { /// method you will see that it does exactly this, though it handles /// a list of expressions rather than just the two that we use for /// an example. - fn c(&mut self, expr: &Hir) -> Result { + /// + /// Ok(None) is returned when an expression is compiled to no + /// instruction, and so no patch.entry value makes sense. + fn c(&mut self, expr: &Hir) -> ResultOrEmpty { use prog; use syntax::hir::HirKind::*; self.check_size()?; match *expr.kind() { - Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }), + Empty => Ok(None), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -357,7 +363,7 @@ impl Compiler { } } - fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result { + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because // they are never used. They are also never used in DFA programs @@ -366,11 +372,11 @@ impl Compiler { } else { let entry = self.insts.len(); let hole = self.push_hole(InstHole::Save { slot: first_slot }); - let patch = self.c(expr)?; + let patch = self.c(expr)?.unwrap_or(self.next_inst()); self.fill(hole, patch.entry); self.fill_to_next(patch.hole); let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); - Ok(Patch { hole: hole, entry: entry }) + Ok(Some(Patch { hole: hole, entry: entry })) } } @@ -381,36 +387,38 @@ impl Compiler { greedy: false, hir: Box::new(Hir::any(true)), }))? + .unwrap() } else { self.c(&Hir::repetition(hir::Repetition { kind: hir::RepetitionKind::ZeroOrMore, greedy: false, hir: Box::new(Hir::any(false)), }))? + .unwrap() }) } - fn c_char(&mut self, c: char) -> Result { + fn c_char(&mut self, c: char) -> ResultOrEmpty { if self.compiled.uses_bytes() { if c.is_ascii() { let b = c as u8; let hole = self.push_hole(InstHole::Bytes { start: b, end: b }); self.byte_classes.set_range(b, b); - Ok(Patch { hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } else { self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) } } else { let hole = self.push_hole(InstHole::Char { c: c }); - Ok(Patch { hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } } - fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result { + fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { assert!(!ranges.is_empty()); if self.compiled.uses_bytes() { - CompileClass { c: self, ranges: ranges }.compile() + Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?)) } else { let ranges: Vec<(char, char)> = ranges.iter().map(|r| (r.start(), r.end())).collect(); @@ -419,15 +427,18 @@ impl Compiler { } else { self.push_hole(InstHole::Ranges { ranges: ranges }) }; - Ok(Patch { hole: hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) } } - fn c_byte(&mut self, b: u8) -> Result { + fn c_byte(&mut self, b: u8) -> ResultOrEmpty { self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) } - fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result { + fn c_class_bytes( + &mut self, + ranges: &[hir::ClassBytesRange], + ) -> ResultOrEmpty { debug_assert!(!ranges.is_empty()); let first_split_entry = self.insts.len(); @@ -451,35 +462,39 @@ impl Compiler { self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), ); self.fill(prev_hole, next); - Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) } - fn c_empty_look(&mut self, look: EmptyLook) -> Result { + fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { let hole = self.push_hole(InstHole::EmptyLook { look: look }); - Ok(Patch { hole: hole, entry: self.insts.len() - 1 }) + Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) } - fn c_concat<'a, I>(&mut self, exprs: I) -> Result + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator, { let mut exprs = exprs.into_iter(); - let first = match exprs.next() { - Some(expr) => expr, - None => { - return Ok(Patch { hole: Hole::None, entry: self.insts.len() }) + let Patch { mut hole, entry } = loop { + match exprs.next() { + None => return Ok(None), + Some(e) => { + if let Some(p) = self.c(e)? { + break p; + } + } } }; - let Patch { mut hole, entry } = self.c(first)?; for e in exprs { - let p = self.c(e)?; - self.fill(hole, p.entry); - hole = p.hole; + if let Some(p) = self.c(e)? { + self.fill(hole, p.entry); + hole = p.hole; + } } - Ok(Patch { hole: hole, entry: entry }) + Ok(Some(Patch { hole: hole, entry: entry })) } - fn c_alternate(&mut self, exprs: &[Hir]) -> Result { + fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { debug_assert!( exprs.len() >= 2, "alternates must have at least 2 exprs" @@ -492,43 +507,43 @@ impl Compiler { // patched to point to the same location. let mut holes = vec![]; - let mut prev_hole = Hole::None; + // true indicates that the hole is a split where we want to fill + // the second branch. + let mut prev_hole = (Hole::None, false); for e in &exprs[0..exprs.len() - 1] { - self.fill_to_next(prev_hole); + if prev_hole.1 { + let next = self.insts.len(); + self.fill_split(prev_hole.0, None, Some(next)); + } else { + self.fill_to_next(prev_hole.0); + } let split = self.push_split_hole(); - let prev_entry = self.insts.len(); - let Patch { hole, entry } = self.c(e)?; - if prev_entry == self.insts.len() { - // TODO(burntsushi): It is kind of silly that we don't support - // empty-subexpressions in alternates, but it is supremely - // awkward to support them in the existing compiler - // infrastructure. This entire compiler needs to be thrown out - // anyway, so don't feel too bad. - return Err(Error::Syntax( - "alternations cannot currently contain \ - empty sub-expressions" - .to_string(), - )); + if let Some(Patch { hole, entry }) = self.c(e)? { + holes.push(hole); + prev_hole = (self.fill_split(split, Some(entry), None), false); + } else { + let (split1, split2) = split.dup_one(); + holes.push(split1); + prev_hole = (split2, true); } - holes.push(hole); - prev_hole = self.fill_split(split, Some(entry), None); } - let prev_entry = self.insts.len(); - let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?; - if prev_entry == self.insts.len() { - // TODO(burntsushi): See TODO above. - return Err(Error::Syntax( - "alternations cannot currently contain \ - empty sub-expressions" - .to_string(), - )); + if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { + holes.push(hole); + if prev_hole.1 { + self.fill_split(prev_hole.0, None, Some(entry)); + } else { + self.fill(prev_hole.0, entry); + } + } else { + // We ignore prev_hole.1. When it's true, it means we have two + // empty branches both pushing prev_hole.0 into holes, so both + // branches will go to the same place anyway. + holes.push(prev_hole.0); } - holes.push(hole); - self.fill(prev_hole, entry); - Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) } - fn c_repeat(&mut self, rep: &hir::Repetition) -> Result { + fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { use syntax::hir::RepetitionKind::*; match rep.kind { ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), @@ -546,24 +561,37 @@ impl Compiler { } } - fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result { + fn c_repeat_zero_or_one( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { let split_entry = self.insts.len(); let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; - + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; let split_hole = if greedy { self.fill_split(split, Some(entry_rep), None) } else { self.fill_split(split, None, Some(entry_rep)) }; let holes = vec![hole_rep, split_hole]; - Ok(Patch { hole: Hole::Many(holes), entry: split_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) } - fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result { + fn c_repeat_zero_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { let split_entry = self.insts.len(); let split = self.push_split_hole(); - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; self.fill(hole_rep, split_entry); let split_hole = if greedy { @@ -571,11 +599,18 @@ impl Compiler { } else { self.fill_split(split, None, Some(entry_rep)) }; - Ok(Patch { hole: split_hole, entry: split_entry }) + Ok(Some(Patch { hole: split_hole, entry: split_entry })) } - fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result { - let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?; + fn c_repeat_one_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return Ok(None), + }; self.fill_to_next(hole_rep); let split = self.push_split_hole(); @@ -584,7 +619,7 @@ impl Compiler { } else { self.fill_split(split, None, Some(entry_rep)) }; - Ok(Patch { hole: split_hole, entry: entry_rep }) + Ok(Some(Patch { hole: split_hole, entry: entry_rep })) } fn c_repeat_range_min_or_more( @@ -592,12 +627,20 @@ impl Compiler { expr: &Hir, greedy: bool, min: u32, - ) -> Result { + ) -> ResultOrEmpty { let min = u32_to_usize(min); - let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; - let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?; - self.fill(patch_concat.hole, patch_rep.entry); - Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry }) + // Using next_inst() is ok, because we can't return it (concat would + // have to return Some(_) while c_repeat_range_min_or_more returns + // None). + let patch_concat = self + .c_concat(iter::repeat(expr).take(min))? + .unwrap_or(self.next_inst()); + if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { + self.fill(patch_concat.hole, patch_rep.entry); + Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) + } else { + Ok(None) + } } fn c_repeat_range( @@ -606,13 +649,17 @@ impl Compiler { greedy: bool, min: u32, max: u32, - ) -> Result { + ) -> ResultOrEmpty { let (min, max) = (u32_to_usize(min), u32_to_usize(max)); + debug_assert!(min <= max); let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; - let initial_entry = patch_concat.entry; if min == max { return Ok(patch_concat); } + // Same reasoning as in c_repeat_range_min_or_more (we know that min < + // max at this point). + let patch_concat = patch_concat.unwrap_or(self.next_inst()); + let initial_entry = patch_concat.entry; // It is much simpler to compile, e.g., `a{2,5}` as: // // aaa?a?a? @@ -637,7 +684,10 @@ impl Compiler { for _ in min..max { self.fill_to_next(prev_hole); let split = self.push_split_hole(); - let Patch { hole, entry } = self.c(expr)?; + let Patch { hole, entry } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; prev_hole = hole; if greedy { holes.push(self.fill_split(split, Some(entry), None)); @@ -646,7 +696,14 @@ impl Compiler { } } holes.push(prev_hole); - Ok(Patch { hole: Hole::Many(holes), entry: initial_entry }) + Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) + } + + /// Can be used as a default value for the c_* functions when the call to + /// c_function is followed by inserting at least one instruction that is + /// always executed after the ones written by the c* function. + fn next_inst(&self) -> Patch { + Patch { hole: Hole::None, entry: self.insts.len() } } fn fill(&mut self, hole: Hole, goto: InstPtr) { @@ -726,6 +783,11 @@ impl Compiler { Hole::One(hole) } + fn pop_split_hole(&mut self) -> ResultOrEmpty { + self.insts.pop(); + Ok(None) + } + fn check_size(&self) -> result::Result<(), Error> { use std::mem::size_of; @@ -744,6 +806,17 @@ enum Hole { Many(Vec), } +impl Hole { + fn dup_one(self) -> (Self, Self) { + match self { + Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), + Hole::None | Hole::Many(_) => { + unreachable!("must be called on single hole") + } + } + } +} + #[derive(Clone, Debug)] enum MaybeInst { Compiled(Inst), @@ -755,13 +828,22 @@ enum MaybeInst { impl MaybeInst { fn fill(&mut self, goto: InstPtr) { - let filled = match *self { - MaybeInst::Uncompiled(ref inst) => inst.fill(goto), + let maybeinst = match *self { + MaybeInst::Split => MaybeInst::Split1(goto), + MaybeInst::Uncompiled(ref inst) => { + MaybeInst::Compiled(inst.fill(goto)) + } MaybeInst::Split1(goto1) => { - Inst::Split(InstSplit { goto1: goto1, goto2: goto }) + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto1, + goto2: goto, + })) } MaybeInst::Split2(goto2) => { - Inst::Split(InstSplit { goto1: goto, goto2: goto2 }) + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto, + goto2: goto2, + })) } _ => unreachable!( "not all instructions were compiled! \ @@ -769,7 +851,7 @@ impl MaybeInst { self ), }; - *self = MaybeInst::Compiled(filled); + *self = maybeinst; } fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {