diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index c75995b9c..a79133272 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -1314,6 +1314,8 @@ pub enum Flag { SwapGreed, /// `u` Unicode, + /// `R` + CRLF, /// `x` IgnoreWhitespace, } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 1d6d4d046..93452cb18 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1381,6 +1381,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { 's' => Ok(ast::Flag::DotMatchesNewLine), 'U' => Ok(ast::Flag::SwapGreed), 'u' => Ok(ast::Flag::Unicode), + 'R' => Ok(ast::Flag::CRLF), 'x' => Ok(ast::Flag::IgnoreWhitespace), _ => { Err(self @@ -4084,6 +4085,34 @@ bar ], }) ); + assert_eq!( + parser("i-sR:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), + }, + ], + }) + ); assert_eq!( parser("isU").parse_flags().unwrap_err(), @@ -4145,6 +4174,7 @@ bar assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); assert_eq!( diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 0922ea0e3..40f967cfa 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -289,6 +289,7 @@ impl Writer { Flag::DotMatchesNewLine => self.wtr.write_str("s"), Flag::SwapGreed => 
self.wtr.write_str("U"), Flag::Unicode => self.wtr.write_str("u"), + Flag::CRLF => self.wtr.write_str("R"), Flag::IgnoreWhitespace => self.wtr.write_str("x"), }, }?; diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index dfa8e47f3..9193366a9 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -471,10 +471,12 @@ impl Hir { /// Returns an HIR expression for `.`. /// - /// * [`Dot::AnyChar`] maps to `(?su:.)`. - /// * [`Dot::AnyByte`] maps to `(?s-u:.)`. - /// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`. - /// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`. + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. + /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// /// Note that this is a convenience routine for constructing the correct /// character class based on the value of `Dot`. 
There is no explicit "dot" @@ -492,18 +494,32 @@ impl Hir { cls.push(ClassBytesRange::new(b'\0', b'\xFF')); Hir::class(Class::Bytes(cls)) } - Dot::AnyCharExceptNL => { + Dot::AnyCharExceptLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } - Dot::AnyByteExceptNL => { + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExceptLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); Hir::class(Class::Bytes(cls)) } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } } } } @@ -1365,6 +1381,16 @@ pub enum Look { /// at the end position of the input, or at the position immediately /// preceding a `\n` character. EndLF, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF, /// Match an ASCII-only word boundary. 
That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. @@ -1380,30 +1406,34 @@ pub enum Look { } impl Look { - fn from_repr(repr: u8) -> Option { + fn from_repr(repr: u16) -> Option { match repr { 0 => Some(Look::Start), 1 => Some(Look::End), 2 => Some(Look::StartLF), 3 => Some(Look::EndLF), - 4 => Some(Look::WordAscii), - 5 => Some(Look::WordAsciiNegate), - 6 => Some(Look::WordUnicode), - 7 => Some(Look::WordUnicodeNegate), + 4 => Some(Look::StartCRLF), + 5 => Some(Look::EndCRLF), + 6 => Some(Look::WordAscii), + 7 => Some(Look::WordAsciiNegate), + 8 => Some(Look::WordUnicode), + 9 => Some(Look::WordUnicodeNegate), _ => None, } } - fn as_repr(&self) -> u8 { + fn as_repr(&self) -> u16 { match *self { Look::Start => 0, Look::End => 1, Look::StartLF => 2, Look::EndLF => 3, - Look::WordAscii => 4, - Look::WordAsciiNegate => 5, - Look::WordUnicode => 6, - Look::WordUnicodeNegate => 7, + Look::StartCRLF => 4, + Look::EndCRLF => 5, + Look::WordAscii => 6, + Look::WordAsciiNegate => 7, + Look::WordUnicode => 8, + Look::WordUnicodeNegate => 9, } } @@ -1413,6 +1443,8 @@ impl Look { Look::End => 'z', Look::StartLF => '^', Look::EndLF => '$', + Look::StartCRLF => '^', + Look::EndCRLF => '$', Look::WordAscii => 'b', Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', @@ -1505,11 +1537,20 @@ pub enum Dot { /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. /// /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. - AnyCharExceptNL, + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`. + /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, /// Matches any byte value except for `\n`. /// /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. 
- AnyByteExceptNL, + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, } /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack @@ -2038,7 +2079,7 @@ impl Properties { /// example, an [`Hir`] provides properties that return `LookSet`s. #[derive(Clone, Copy, Default, Eq, PartialEq)] pub struct LookSet { - bits: u8, + bits: u16, } impl LookSet { @@ -2170,8 +2211,8 @@ impl Iterator for LookSetIter { #[inline] fn next(&mut self) -> Option { // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a usize. - let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap(); + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); let look = Look::from_repr(repr)?; self.set.remove(look); Some(look) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 40f8905b7..fcb7cd252 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -177,6 +177,12 @@ impl Visitor for Writer { hir::Look::EndLF => { self.wtr.write_str("(?m:$)")?; } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; + } + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; + } hir::Look::WordAscii => { self.wtr.write_str(r"(?-u:\b)")?; } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index b1ebf7b17..c1ebf85c2 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -85,6 +85,12 @@ impl TranslatorBuilder { self } + /// Enable or disable the CRLF mode flag (`R`) by default. + pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + /// Enable or disable the "swap greed" flag (`U`) by default. 
pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.swap_greed = if yes { Some(true) } else { None }; @@ -866,14 +872,23 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_assertion(&self, asst: &ast::Assertion) -> Result { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); Ok(match asst.kind { ast::AssertionKind::StartLine => Hir::look(if multi_line { - hir::Look::StartLF + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } } else { hir::Look::Start }), ast::AssertionKind::EndLine => Hir::look(if multi_line { - hir::Look::EndLF + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } } else { hir::Look::End }), @@ -1146,6 +1161,7 @@ struct Flags { dot_matches_new_line: Option, swap_greed: Option, unicode: Option, + crlf: Option, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. } @@ -1174,6 +1190,9 @@ impl Flags { ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } @@ -1196,6 +1215,9 @@ impl Flags { if self.unicode.is_none() { self.unicode = previous.unicode; } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } } fn dot(&self) -> hir::Dot { @@ -1207,9 +1229,17 @@ impl Flags { } } else { if self.unicode() { - hir::Dot::AnyCharExceptNL + if self.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + hir::Dot::AnyCharExceptLF + } } else { - hir::Dot::AnyByteExceptNL + if self.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExceptLF + } } } } @@ -1233,6 +1263,10 @@ impl Flags { fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { @@ -1678,14 +1712,32 @@ mod tests { fn 
dot() { assert_eq!( t("."), - hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) ); - assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), - hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) ); assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. assert_eq!( @@ -1698,6 +1750,16 @@ mod tests { ), } ); + assert_eq!( + t_err("(?R-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); assert_eq!( t_err("(?s-u)."), TestError { @@ -1708,6 +1770,16 @@ mod tests { ), } ); + assert_eq!( + t_err("(?Rs-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(8, 1, 9) + ), + } + ); } #[test] @@ -1795,6 +1867,29 @@ mod tests { ); } + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), 
hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); + } + #[test] fn flags() { #[cfg(feature = "unicode-case")] diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 8eb88e042..2851cda33 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -134,6 +134,23 @@ impl ParserBuilder { self } + /// Enable or disable the CRLF mode flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.crlf(yes); + self + } + /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively diff --git a/src/compile.rs b/src/compile.rs index 9ee52354d..20eebf0ed 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -326,6 +326,12 @@ impl Compiler { self.byte_classes.set_range(b'\n', b'\n'); self.c_empty_look(prog::EmptyLook::EndLine) } + hir::Look::StartCRLF | hir::Look::EndCRLF => { + return Err(Error::Syntax( + "CRLF-aware line anchors are not supported yet" + .to_string(), + )); + } hir::Look::WordAscii => { self.byte_classes.set_word_boundary(); self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)