From ef4f094287e0ceb0178e8c6591ad18e016df54bf Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 19:41:37 -0400 Subject: [PATCH] syntax: reject '(?-u)\W' when UTF-8 mode is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, and this means that they may actually match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected. Previously, it was not being rejected, which could actually lead to matches that produced offsets that split codepoints, and thus lead to panics when match offsets are used to slice a string. For example, this code fn main() { let re = regex::Regex::new(r"(?-u)\W").unwrap(); let haystack = "☃"; if let Some(m) = re.find(haystack) { println!("{:?}", &haystack[m.range()]); } } panics with byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃` That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled. Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.) Fixes #895, Partially addresses #738 --- regex-syntax/src/hir/translate.rs | 95 +++++++++++++++++++++++++++---- tests/replace.rs | 37 +++--------- 2 files changed, 92 insertions(+), 40 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index d7686988a..988384ede 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { - let cls = self.hir_perl_byte_class(x); + let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } @@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { - let xcls = self.hir_perl_byte_class(x); + let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); @@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassBytes { + ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); @@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if ast_class.negated { class.negate(); } - class + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. @@ -1971,7 +1977,7 @@ mod tests { #[test] #[cfg(feature = "unicode-perl")] - fn class_perl() { + fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); @@ -2011,7 +2017,10 @@ mod tests { ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + #[test] + fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), @@ -2040,29 +2049,93 @@ mod tests { // ASCII only, negated assert_eq!( - t(r"(?-u)\D"), + t_bytes(r"(?-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?-u)\S"), + t_bytes(r"(?-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?-u)\W"), + t_bytes(r"(?-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( - t(r"(?i-u)\D"), + t_bytes(r"(?i-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?i-u)\S"), + t_bytes(r"(?i-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?i-u)\W"), + t_bytes(r"(?i-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. + assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); } #[test] diff --git a/tests/replace.rs b/tests/replace.rs index d65be072f..ec2ef9eff 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -12,18 +12,11 @@ macro_rules! replace( replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6"); replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z"); replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); -replace!( - groups, - replace, - r"(?-u)(\S+)\s+(\S+)", - "w1 w2", - t!("$2 $1"), - "w2 w1" -); +replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1"); replace!( double_dollar, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -33,7 +26,7 @@ replace!( replace!( named, replace_all, - r"(?-u)(?P\S+)\s+(?P\S+)(?P\s*)", + r"(?P\S+)\s+(?P\S+)(?P\s*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -48,26 +41,12 @@ replace!( ); replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); // replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); -replace!( - simple_expand, - replace_all, - r"(?-u)(\w) (\w)", - "a b", - t!("$2 $1"), - "b a" -); -replace!( - literal_dollar1, - replace_all, - r"(?-u)(\w+) (\w+)", - "a b", - t!("$$1"), - "$1" -); +replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a"); +replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1"); replace!( literal_dollar2, replace_all, - r"(?-u)(\w+) (\w+)", + r"(\w+) (\w+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -75,7 +54,7 @@ replace!( replace!( no_expand1, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -83,7 +62,7 @@ replace!( replace!( no_expand2, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$$1"), "$$1"