diff --git a/proptest/src/string.rs b/proptest/src/string.rs
index 8777388f..b4e1fac2 100644
--- a/proptest/src/string.rs
+++ b/proptest/src/string.rs
@@ -23,7 +23,7 @@ use regex_syntax::hir::{
     RepetitionKind::{self, *},
     RepetitionRange::*,
 };
-use regex_syntax::{Error as ParseError, Parser};
+use regex_syntax::{Error as ParseError, ParserBuilder};
 
 use crate::bool;
 use crate::char;
@@ -150,7 +150,8 @@ impl StrategyFromRegex for Vec<u8> {
 /// If you don't need error handling and aren't limited by setup time, it is
 /// also possible to directly use a `&str` as a strategy with the same effect.
 pub fn string_regex(regex: &str) -> ParseResult<String> {
-    string_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new().build().parse(regex)?;
+    string_regex_parsed(&hir)
 }
 
 /// Like `string_regex()`, but allows providing a pre-parsed expression.
@@ -167,8 +168,20 @@ pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
 
 /// Creates a strategy which generates byte strings matching the given regular
 /// expression.
+///
+/// By default, the byte strings generated by this strategy _will_ be valid
+/// UTF-8. If you wish to generate byte strings that aren't (necessarily)
+/// valid UTF-8, wrap your regex (or some subsection of it) in `(?-u: ... )`.
+/// You may want to turn on the `s` flag as well (`(?s-u: ... )`) so that `.`
+/// will generate newline characters (byte value `0x0A`). See the
+/// [`regex` crate's documentation](https://docs.rs/regex/*/regex/#opt-out-of-unicode-support)
+/// for more information.
 pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
-    bytes_regex_parsed(&regex_to_hir(regex)?)
+    let hir = ParserBuilder::new()
+        .allow_invalid_utf8(true)
+        .build()
+        .parse(regex)?;
+    bytes_regex_parsed(&hir)
 }
 
 /// Like `bytes_regex()`, but allows providing a pre-parsed expression.
@@ -357,10 +370,6 @@ fn to_bytes(khar: char) -> Vec<u8> {
     khar.encode_utf8(&mut buf).as_bytes().to_owned()
 }
 
-fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
-    Ok(Parser::new().parse(pattern)?)
-}
-
 fn unsupported<T>(error: &'static str) -> Result<T, Error> {
     Err(Error::UnsupportedRegex(error))
 }
@@ -370,9 +379,17 @@ mod test {
     use std::collections::HashSet;
 
     use regex::Regex;
+    use regex::bytes::Regex as BytesRegex;
 
     use super::*;
 
+    fn printable_ascii(v: &[u8]) -> String {
+        v.iter()
+            .flat_map(|c| std::ascii::escape_default(*c))
+            .map(|c| char::from_u32(c.into()).unwrap())
+            .collect()
+    }
+
     fn do_test(
         pattern: &str,
         min_distinct: usize,
@@ -396,6 +413,29 @@ mod test {
         );
     }
 
+    fn do_test_bytes(
+        pattern: &str,
+        min_distinct: usize,
+        max_distinct: usize,
+        iterations: usize,
+    ) {
+        let generated = generate_byte_values_matching_regex(pattern, iterations);
+        assert!(
+            generated.len() >= min_distinct,
+            "Expected to generate at least {} strings, but only \
+             generated {}",
+            min_distinct,
+            generated.len()
+        );
+        assert!(
+            generated.len() <= max_distinct,
+            "Expected to generate at most {} strings, but \
+             generated {}",
+            max_distinct,
+            generated.len()
+        );
+    }
+
     fn generate_values_matching_regex(
         pattern: &str,
         iterations: usize,
@@ -432,6 +472,42 @@ mod test {
         generated
     }
 
+    fn generate_byte_values_matching_regex(
+        pattern: &str,
+        iterations: usize,
+    ) -> HashSet<Vec<u8>> {
+        let rx = BytesRegex::new(pattern).unwrap();
+        let mut generated = HashSet::new();
+
+        let strategy = bytes_regex(pattern).unwrap();
+        let mut runner = TestRunner::deterministic();
+        for _ in 0..iterations {
+            let mut value = strategy.new_tree(&mut runner).unwrap();
+
+            loop {
+                let s = value.current();
+                let ok = if let Some(matsch) = rx.find(&s) {
+                    0 == matsch.start() && s.len() == matsch.end()
+                } else {
+                    false
+                };
+                if !ok {
+                    panic!(
+                        "Generated string {:?} which does not match {:?}",
+                        printable_ascii(&s), pattern
+                    );
+                }
+
+                generated.insert(s);
+
+                if !value.simplify() {
+                    break;
+                }
+            }
+        }
+        generated
+    }
+
     #[test]
     fn test_case_insensitive_produces_all_available_values() {
         let mut expected: HashSet<String> = HashSet::new();
@@ -445,6 +521,7 @@ mod test {
     #[test]
     fn test_literal() {
         do_test("foo", 1, 1, 8);
+        do_test_bytes("foo", 1, 1, 8);
     }
 
     #[test]
@@ -455,36 +532,43 @@ mod test {
     #[test]
     fn test_alternation() {
         do_test("foo|bar|baz", 3, 3, 16);
+        do_test_bytes("foo|bar|baz", 3, 3, 16);
     }
 
     #[test]
-    fn test_repitition() {
+    fn test_repetition() {
         do_test("a{0,8}", 9, 9, 64);
+        do_test_bytes("a{0,8}", 9, 9, 64);
     }
 
     #[test]
     fn test_question() {
         do_test("a?", 2, 2, 16);
+        do_test_bytes("a?", 2, 2, 16);
     }
 
     #[test]
     fn test_star() {
         do_test("a*", 33, 33, 256);
+        do_test_bytes("a*", 33, 33, 256);
     }
 
     #[test]
     fn test_plus() {
         do_test("a+", 32, 32, 256);
+        do_test_bytes("a+", 32, 32, 256);
     }
 
     #[test]
     fn test_n_to_range() {
         do_test("a{4,}", 4, 4, 64);
+        do_test_bytes("a{4,}", 4, 4, 64);
     }
 
     #[test]
     fn test_concatenation() {
         do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
+        do_test_bytes("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
     }
 
     #[test]
@@ -505,6 +589,7 @@ mod test {
     #[test]
     fn test_dot_s() {
         do_test("(?s).", 200, 65536, 256);
+        do_test_bytes("(?s-u).", 256, 256, 2048);
     }
 
     #[test]
@@ -512,6 +597,16 @@ mod test {
         do_test("\\d+", 1, 65536, 256);
     }
 
+    #[test]
+    fn test_non_utf8_byte_strings() {
+        do_test_bytes(r"(?-u)[\xC0-\xFF]\x20", 64, 64, 512);
+        do_test_bytes(r"(?-u)\x20[\x80-\xBF]", 64, 64, 512);
+        do_test_bytes(r#"(?x-u)
+          \xed (( ( \xa0\x80 | \xad\xbf | \xae\x80 | \xaf\xbf )
+                  ( \xed ( \xb0\x80 | \xbf\xbf ) )? )
+              | \xb0\x80 | \xbe\x80 | \xbf\xbf )"#, 15, 15, 120);
+    }
+
     fn assert_send_and_sync<T: Send + Sync>(_: T) {}
 
     #[test]