diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index d7686988a..988384ede 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { - let cls = self.hir_perl_byte_class(x); + let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } @@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { - let xcls = self.hir_perl_byte_class(x); + let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); @@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassBytes { + ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); @@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if ast_class.negated { class.negate(); } - class + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. @@ -1971,7 +1977,7 @@ mod tests { #[test] #[cfg(feature = "unicode-perl")] - fn class_perl() { + fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); @@ -2011,7 +2017,10 @@ mod tests { ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + #[test] + fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), @@ -2040,29 +2049,93 @@ mod tests { // ASCII only, negated assert_eq!( - t(r"(?-u)\D"), + t_bytes(r"(?-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?-u)\S"), + t_bytes(r"(?-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?-u)\W"), + t_bytes(r"(?-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( - t(r"(?i-u)\D"), + t_bytes(r"(?i-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?i-u)\S"), + t_bytes(r"(?i-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?i-u)\W"), + t_bytes(r"(?i-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. + assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); } #[test] diff --git a/tests/replace.rs b/tests/replace.rs index d65be072f..ec2ef9eff 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -12,18 +12,11 @@ macro_rules! replace( replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6"); replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z"); replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); -replace!( - groups, - replace, - r"(?-u)(\S+)\s+(\S+)", - "w1 w2", - t!("$2 $1"), - "w2 w1" -); +replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1"); replace!( double_dollar, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -33,7 +26,7 @@ replace!( replace!( named, replace_all, - r"(?-u)(?P\S+)\s+(?P\S+)(?P\s*)", + r"(?P\S+)\s+(?P\S+)(?P\s*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -48,26 +41,12 @@ replace!( ); replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); // replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); -replace!( - simple_expand, - replace_all, - r"(?-u)(\w) (\w)", - "a b", - t!("$2 $1"), - "b a" -); -replace!( - literal_dollar1, - replace_all, - r"(?-u)(\w+) (\w+)", - "a b", - t!("$$1"), - "$1" -); +replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a"); +replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1"); replace!( literal_dollar2, replace_all, - r"(?-u)(\w+) (\w+)", + r"(\w+) (\w+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -75,7 +54,7 @@ replace!( replace!( no_expand1, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -83,7 +62,7 @@ replace!( replace!( no_expand2, replace, - r"(?-u)(\S+)\s+(\S+)", + r"(\S+)\s+(\S+)", "w1 w2", no_expand!("$$1"), "$$1"