Skip to content

Commit

Permalink
Use Unicode simple case folding for case-insensitivity. Fix #55.
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonSapin committed Apr 19, 2015
1 parent 2561bfd commit 7c2e83e
Show file tree
Hide file tree
Showing 6 changed files with 457 additions and 17 deletions.
10 changes: 4 additions & 6 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ use regex::native::{
Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
Program, Dynamic, ExDynamic, Native,
FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
simple_case_fold,
};

/// For the `regex!` syntax extension. Do not use.
Expand Down Expand Up @@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
}
OneChar(c, flags) => {
if flags & FLAG_NOCASE > 0 {
let upc = c.to_uppercase().next().unwrap();
let upc = simple_case_fold(c);
quote_expr!(self.cx, {
let upc = self.chars.prev.map(|c| {
c.to_uppercase().next().unwrap()
});
let upc = self.chars.prev.map(simple_case_fold);
if upc == Some($upc) {
self.add(nlist, $nextpc, caps);
}
Expand All @@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
if casei {
quote_expr!(
self.cx,
self.chars.prev.unwrap()
.to_uppercase().next().unwrap())
simple_case_fold(self.chars.prev.unwrap()))
} else {
quote_expr!(self.cx, self.chars.prev.unwrap())
};
Expand Down
15 changes: 15 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,19 @@ def load_properties(f, interestingprops):

return props

def load_case_folding(f):
    """Collect the common (C) and simple (S) case-folding pairs from file `f`.

    `fetch(f)` is called first (presumably to make sure the data file is
    available locally -- confirm against its definition).  Every line of the
    form ``<hex code>; C|S; <hex mapping>;`` contributes one
    ``(code, mapping)`` tuple of integers; lines that do not match (comments,
    blank lines, entries with another status) are ignored.

    Returns a dict with the single key ``"C_plus_S"`` whose value is the list
    of pairs in the order they appear in the file.
    """
    fetch(f)
    entry_re = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;")
    # Lazily match each input line; non-matching lines yield None and are
    # filtered out below.
    candidates = (entry_re.match(text) for text in fileinput.input(f))
    pairs = [
        (int(hit.group(1), 16), int(hit.group(2), 16))
        for hit in candidates
        if hit
    ]
    return {"C_plus_S": pairs}

def escape_char(c):
    """Render the integer code point `c` as a single-quoted ``\\u{...}`` escape.

    The code point is written in lowercase hexadecimal without padding,
    e.g. ``0x41`` becomes ``'\\u{41}'`` (quotes included in the result).
    """
    quoted_escape = "'\\u{%x}'" % c
    return quoted_escape

Expand Down Expand Up @@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data):
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
case_folding = load_case_folding("CaseFolding.txt")

# all of these categories will also be available as \p{} in libregex
allcats = []
Expand All @@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data):

# emit lookup tables for \p{}, along with \d, \w, and \s for libregex
emit_regex_module(rf, allcats, perl_words)
emit_property_module(rf, "case_folding", case_folding)
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ pub mod native {
};
pub use re::{ExDynamic, ExNative};
pub use re::Regex::{Dynamic, Native};
pub use vm::{CharReader, find_prefix};
pub use vm::{CharReader, find_prefix, simple_case_fold};
pub use vm::MatchKind::{self, Exists, Location, Submatches};
pub use vm::StepState::{
self, StepMatchEarlyReturn, StepMatch, StepContinue,
Expand Down
5 changes: 4 additions & 1 deletion src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use std::fmt;

/// Static data containing Unicode ranges for general categories and scripts.
use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
use vm::simple_case_fold;

use self::Ast::*;
use self::Repeater::*;
Expand Down Expand Up @@ -987,6 +988,8 @@ fn combine_ranges(mut unordered: Vec<(char, char)>) -> Vec<(char, char)> {
ordered
}

// FIXME: Is there a clever way to do this by considering ranges rather than individual chars?
// E.g. binary search for overlap with entries in unicode::case_folding::C_plus_S_table
fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> {
if ranges.is_empty() {
return ranges
Expand All @@ -995,7 +998,7 @@ fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)>
.into_iter()
.flat_map(|(start, end)| start as u32 .. end as u32 + 1)
.filter_map(char::from_u32)
.map(|c| c.to_uppercase().next().unwrap())
.map(simple_case_fold)
.collect();
chars.sort();
let mut chars = chars.into_iter();
Expand Down
Loading

0 comments on commit 7c2e83e

Please sign in to comment.