Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix case-insensitivity-related bugs #78

Merged
merged 3 commits into from
Apr 19, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 8 additions & 14 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use regex::native::{
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
Program, Dynamic, ExDynamic, Native,
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
simple_case_fold,
};

/// For the `regex!` syntax extension. Do not use.
Expand Down Expand Up @@ -154,7 +155,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
use regex::native::{
MatchKind, Exists, Location, Submatches,
StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
CharReader, find_prefix,
CharReader, find_prefix, simple_case_fold,
};

return Nfa {
Expand Down Expand Up @@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
}
OneChar(c, flags) => {
if flags & FLAG_NOCASE > 0 {
let upc = c.to_uppercase().next().unwrap();
let upc = simple_case_fold(c);
quote_expr!(self.cx, {
let upc = self.chars.prev.map(|c| {
c.to_uppercase().next().unwrap()
});
let upc = self.chars.prev.map(simple_case_fold);
if upc == Some($upc) {
self.add(nlist, $nextpc, caps);
}
Expand All @@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
if casei {
quote_expr!(
self.cx,
self.chars.prev.unwrap()
.to_uppercase().next().unwrap())
simple_case_fold(self.chars.prev.unwrap()))
} else {
quote_expr!(self.cx, self.chars.prev.unwrap())
};
Expand All @@ -494,7 +492,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
} else {
quote_expr!(self.cx, found)
};
let mranges = self.match_class(casei, &ranges);
let mranges = self.match_class(&ranges);
quote_expr!(self.cx, {
if self.chars.prev.is_some() {
let c = $get_char;
Expand Down Expand Up @@ -529,12 +527,8 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
// Translates a character class into a match expression.
// This avoids a binary search (and is hopefully replaced by a jump
// table).
fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
let mut arms = ranges.iter().map(|&(mut start, mut end)| {
if casei {
start = start.to_uppercase().next().unwrap();
end = end.to_uppercase().next().unwrap();
}
fn match_class(&self, ranges: &[(char, char)]) -> P<ast::Expr> {
let mut arms = ranges.iter().map(|&(start, end)| {
let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
quote_expr!(self.cx, $end)));
self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))
Expand Down
7 changes: 7 additions & 0 deletions regex_macros/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));

// https://github.com/rust-lang/regex/issues/76
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));

// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
mat!(uni_perl_w_not, r"\w+", "⥡", None);
Expand All @@ -355,6 +358,10 @@ mat!(negclass_space_comma, r"[^,\s]", ", a", Some((2, 3)));
mat!(negclass_comma_space, r"[^\s,]", " ,a", Some((2, 3)));
mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2)));

// Regression test for https://github.com/rust-lang/regex/issues/75
mat!(regression_unsorted_binary_search_1, r"(?i)[a_]+", "A_", Some((0, 2)));
mat!(regression_unsorted_binary_search_2, r"(?i)[A_]+", "a_", Some((0, 2)));

// A whole mess of tests from Glenn Fowler's regex test suite.
// Generated by the 'src/etc/regex-match-tests' program.
#[path = "matches.rs"]
Expand Down
15 changes: 15 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,19 @@ def load_properties(f, interestingprops):

return props

def load_case_folding(f):
    # Collect (code point, folded code point) pairs from the case-folding
    # data file `f`.
    #
    # `fetch(f)` is called first; it is defined elsewhere in this script and
    # presumably makes sure the data file exists locally -- confirm there.
    #
    # Only lines of the form "<hex>; C; <hex>;" or "<hex>; S; <hex>;" are
    # kept; every other line (other status letters, comments, blank lines)
    # is ignored because it does not match the pattern below.
    #
    # Returns a dict with the single key "C_plus_S" whose value is the list
    # of (source, target) integer pairs in file order.
    fetch(f)
    entry_pattern = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")
    pairs = []
    for line in fileinput.input(f):
        hit = entry_pattern.match(line)
        if hit is None:
            continue
        # Both captured fields are hexadecimal code points.
        source = int(hit.group(1), 16)
        target = int(hit.group(2), 16)
        pairs.append((source, target))

    return {"C_plus_S": pairs}

def escape_char(c):
    # Render the integer code point `c` as a single-quoted \u{...} escape
    # with lowercase hexadecimal digits, e.g. 0x41 -> '\u{41}'.
    # NOTE(review): presumably consumed as a Rust char literal in the
    # generated tables -- confirm against the emit_* callers.
    literal = "'\\u{%x}'" % c
    return literal

Expand Down Expand Up @@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
case_folding = load_case_folding("CaseFolding.txt")

# all of these categories will also be available as \p{} in libregex
allcats = []
Expand All @@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):

# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
emit_regex_module(rf, allcats, perl_words)
emit_property_module(rf, "case_folding", case_folding)
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ pub mod native {
};
pub use re::{ExDynamic, ExNative};
pub use re::Regex::{Dynamic, Native};
pub use vm::{CharReader, find_prefix};
pub use vm::{CharReader, find_prefix, simple_case_fold};
pub use vm::MatchKind::{self, Exists, Location, Submatches};
pub use vm::StepState::{
self, StepMatchEarlyReturn, StepMatch, StepContinue,
Expand Down
45 changes: 43 additions & 2 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use std::fmt;

/// Static data containing Unicode ranges for general categories and scripts.
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
use vm::simple_case_fold;

use self::Ast::*;
use self::Repeater::*;
Expand Down Expand Up @@ -213,7 +214,14 @@ impl Parser {
'?' | '*' | '+' => try!(self.push_repeater(c)),
'\\' => {
let ast = try!(self.parse_escape());
self.push(ast)
if let AstClass(mut ranges, flags) = ast {
if flags & FLAG_NOCASE > 0 {
ranges = case_fold_and_combine_ranges(ranges);
}
self.push(AstClass(ranges, flags))
} else {
self.push(ast)
}
}
'{' => try!(self.parse_counted()),
'[' => match self.try_parse_ascii() {
Expand Down Expand Up @@ -421,7 +429,11 @@ impl Parser {
}
}
']' if ranges.len() > 0 => {
ranges = combine_ranges(ranges);
if self.flags & FLAG_NOCASE > 0 {
ranges = case_fold_and_combine_ranges(ranges)
} else {
ranges = combine_ranges(ranges);
}
if negated {
ranges = invert_ranges(ranges);
}
Expand Down Expand Up @@ -976,6 +988,35 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
ordered
}

// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
//
// Expands every `(start, end)` pair in `ranges` into its individual
// characters, maps each one through `simple_case_fold`, and then rebuilds a
// sorted, de-duplicated list of ranges from the folded characters.
//
// Two folded characters end up in the same output range when the later one
// equals `inc_char` of the earlier one (`inc_char` is defined elsewhere in
// this file).
//
// Returns an empty list when the input is empty or when the input expands to
// no characters at all (for example a pair whose start is greater than its
// end) instead of panicking.
fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
    if ranges.is_empty() {
        return ranges
    }
    let mut chars: Vec<char> = ranges
        .into_iter()
        // Walk the code points of each pair, both ends included.
        .flat_map(|(start, end)| start as u32 .. end as u32 + 1)
        // Code points that are not valid `char`s are dropped here.
        .filter_map(char::from_u32)
        .map(simple_case_fold)
        .collect();
    // Sorting first makes `dedup` remove every duplicate and lets the loop
    // below detect runs in a single pass.
    chars.sort();
    chars.dedup();
    let mut chars = chars.into_iter();
    // A non-empty input can still expand to nothing (inverted pair), so do
    // not assume a first character exists.
    let mut start = match chars.next() {
        Some(first) => first,
        None => return Vec::new(),
    };
    let mut end = start;
    let mut ranges = Vec::new();
    for c in chars {
        if c != inc_char(end) {
            // `c` does not continue the current run: close it and start anew.
            ranges.push((start, end));
            start = c;
        }
        end = c;
    }
    // Flush the run that was still open when the characters ran out.
    ranges.push((start, end));
    ranges
}

fn invert_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
if ranges.is_empty() { return ranges; }

Expand Down
Loading