From 640c579242235c0d5b6b538c4cad39d7fc3c91b0 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sun, 19 Apr 2015 14:47:08 +0200 Subject: [PATCH] Use Unicode simple case folding for case-insensitivity. Fix #55. --- regex_macros/src/lib.rs | 10 +- scripts/unicode.py | 15 ++ src/lib.rs | 2 +- src/parse.rs | 3 +- src/unicode.rs | 416 ++++++++++++++++++++++++++++++++++++++++ src/vm.rs | 26 ++- 6 files changed, 455 insertions(+), 17 deletions(-) diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs index 27ff27a43d..37523fb9b2 100644 --- a/regex_macros/src/lib.rs +++ b/regex_macros/src/lib.rs @@ -40,6 +40,7 @@ use regex::native::{ Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, Program, Dynamic, ExDynamic, Native, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, + simple_case_fold, }; /// For the `regex!` syntax extension. Do not use. @@ -459,11 +460,9 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, } OneChar(c, flags) => { if flags & FLAG_NOCASE > 0 { - let upc = c.to_uppercase().next().unwrap(); + let upc = simple_case_fold(c); quote_expr!(self.cx, { - let upc = self.chars.prev.map(|c| { - c.to_uppercase().next().unwrap() - }); + let upc = self.chars.prev.map(simple_case_fold); if upc == Some($upc) { self.add(nlist, $nextpc, caps); } @@ -483,8 +482,7 @@ fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, if casei { quote_expr!( self.cx, - self.chars.prev.unwrap() - .to_uppercase().next().unwrap()) + simple_case_fold(self.chars.prev.unwrap())) } else { quote_expr!(self.cx, self.chars.prev.unwrap()) }; diff --git a/scripts/unicode.py b/scripts/unicode.py index 62ec96a41b..f734b78099 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -194,6 +194,19 @@ def load_properties(f, interestingprops): return props +def load_case_folding(f): + fetch(f) + re1 = re.compile("^ *([0-9A-F]+) *; *[CS] *; *([0-9A-F]+) *;") + c_plus_s = [] + for line in fileinput.input(f): + m = re1.match(line) + if m: + a = int(m.group(1), 16) + b = 
int(m.group(2), 16) + c_plus_s.append((a, b)) + + return {"C_plus_S": c_plus_s} + def escape_char(c): return "'\\u{%x}'" % c @@ -258,6 +271,7 @@ def emit_regex_module(f, cats, w_data): scripts = load_properties("Scripts.txt", []) props = load_properties("PropList.txt", ["White_Space", "Join_Control", "Noncharacter_Code_Point"]) + case_folding = load_case_folding("CaseFolding.txt") # all of these categories will also be available as \p{} in libregex allcats = [] @@ -280,3 +294,4 @@ def emit_regex_module(f, cats, w_data): # emit lookup tables for \p{}, along with \d, \w, and \s for libregex emit_regex_module(rf, allcats, perl_words) + emit_property_module(rf, "case_folding", case_folding) diff --git a/src/lib.rs b/src/lib.rs index 5fa01f9444..6b727296d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -413,7 +413,7 @@ pub mod native { }; pub use re::{ExDynamic, ExNative}; pub use re::Regex::{Dynamic, Native}; - pub use vm::{CharReader, find_prefix}; + pub use vm::{CharReader, find_prefix, simple_case_fold}; pub use vm::MatchKind::{self, Exists, Location, Submatches}; pub use vm::StepState::{ self, StepMatchEarlyReturn, StepMatch, StepContinue, diff --git a/src/parse.rs b/src/parse.rs index 2535a84529..a6dd2c00af 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -14,6 +14,7 @@ use std::fmt; /// Static data containing Unicode ranges for general categories and scripts. use unicode::regex::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; +use vm::simple_case_fold; use self::Ast::*; use self::Repeater::*; @@ -995,7 +996,7 @@ fn case_fold_and_combine_ranges(ranges: Vec<(char, char)>) -> Vec<(char, char)> .into_iter() .flat_map(|(start, end)| start as u32 .. 
end as u32 + 1) .filter_map(char::from_u32) - .map(|c| c.to_uppercase().next().unwrap()) + .map(simple_case_fold) .collect(); chars.sort(); let mut chars = chars.into_iter(); diff --git a/src/unicode.rs b/src/unicode.rs index 2041260ce8..c583438c03 100644 --- a/src/unicode.rs +++ b/src/unicode.rs @@ -4635,3 +4635,419 @@ pub mod regex { } +pub mod case_folding { + pub const C_plus_S_table: &'static [(char, char)] = &[ + ('\u{41}', '\u{61}'), ('\u{42}', '\u{62}'), ('\u{43}', '\u{63}'), + ('\u{44}', '\u{64}'), ('\u{45}', '\u{65}'), ('\u{46}', '\u{66}'), + ('\u{47}', '\u{67}'), ('\u{48}', '\u{68}'), ('\u{49}', '\u{69}'), + ('\u{4a}', '\u{6a}'), ('\u{4b}', '\u{6b}'), ('\u{4c}', '\u{6c}'), + ('\u{4d}', '\u{6d}'), ('\u{4e}', '\u{6e}'), ('\u{4f}', '\u{6f}'), + ('\u{50}', '\u{70}'), ('\u{51}', '\u{71}'), ('\u{52}', '\u{72}'), + ('\u{53}', '\u{73}'), ('\u{54}', '\u{74}'), ('\u{55}', '\u{75}'), + ('\u{56}', '\u{76}'), ('\u{57}', '\u{77}'), ('\u{58}', '\u{78}'), + ('\u{59}', '\u{79}'), ('\u{5a}', '\u{7a}'), ('\u{b5}', '\u{3bc}'), + ('\u{c0}', '\u{e0}'), ('\u{c1}', '\u{e1}'), ('\u{c2}', '\u{e2}'), + ('\u{c3}', '\u{e3}'), ('\u{c4}', '\u{e4}'), ('\u{c5}', '\u{e5}'), + ('\u{c6}', '\u{e6}'), ('\u{c7}', '\u{e7}'), ('\u{c8}', '\u{e8}'), + ('\u{c9}', '\u{e9}'), ('\u{ca}', '\u{ea}'), ('\u{cb}', '\u{eb}'), + ('\u{cc}', '\u{ec}'), ('\u{cd}', '\u{ed}'), ('\u{ce}', '\u{ee}'), + ('\u{cf}', '\u{ef}'), ('\u{d0}', '\u{f0}'), ('\u{d1}', '\u{f1}'), + ('\u{d2}', '\u{f2}'), ('\u{d3}', '\u{f3}'), ('\u{d4}', '\u{f4}'), + ('\u{d5}', '\u{f5}'), ('\u{d6}', '\u{f6}'), ('\u{d8}', '\u{f8}'), + ('\u{d9}', '\u{f9}'), ('\u{da}', '\u{fa}'), ('\u{db}', '\u{fb}'), + ('\u{dc}', '\u{fc}'), ('\u{dd}', '\u{fd}'), ('\u{de}', '\u{fe}'), + ('\u{100}', '\u{101}'), ('\u{102}', '\u{103}'), ('\u{104}', '\u{105}'), + ('\u{106}', '\u{107}'), ('\u{108}', '\u{109}'), ('\u{10a}', '\u{10b}'), + ('\u{10c}', '\u{10d}'), ('\u{10e}', '\u{10f}'), ('\u{110}', '\u{111}'), + ('\u{112}', '\u{113}'), ('\u{114}', '\u{115}'), ('\u{116}', 
'\u{117}'), + ('\u{118}', '\u{119}'), ('\u{11a}', '\u{11b}'), ('\u{11c}', '\u{11d}'), + ('\u{11e}', '\u{11f}'), ('\u{120}', '\u{121}'), ('\u{122}', '\u{123}'), + ('\u{124}', '\u{125}'), ('\u{126}', '\u{127}'), ('\u{128}', '\u{129}'), + ('\u{12a}', '\u{12b}'), ('\u{12c}', '\u{12d}'), ('\u{12e}', '\u{12f}'), + ('\u{132}', '\u{133}'), ('\u{134}', '\u{135}'), ('\u{136}', '\u{137}'), + ('\u{139}', '\u{13a}'), ('\u{13b}', '\u{13c}'), ('\u{13d}', '\u{13e}'), + ('\u{13f}', '\u{140}'), ('\u{141}', '\u{142}'), ('\u{143}', '\u{144}'), + ('\u{145}', '\u{146}'), ('\u{147}', '\u{148}'), ('\u{14a}', '\u{14b}'), + ('\u{14c}', '\u{14d}'), ('\u{14e}', '\u{14f}'), ('\u{150}', '\u{151}'), + ('\u{152}', '\u{153}'), ('\u{154}', '\u{155}'), ('\u{156}', '\u{157}'), + ('\u{158}', '\u{159}'), ('\u{15a}', '\u{15b}'), ('\u{15c}', '\u{15d}'), + ('\u{15e}', '\u{15f}'), ('\u{160}', '\u{161}'), ('\u{162}', '\u{163}'), + ('\u{164}', '\u{165}'), ('\u{166}', '\u{167}'), ('\u{168}', '\u{169}'), + ('\u{16a}', '\u{16b}'), ('\u{16c}', '\u{16d}'), ('\u{16e}', '\u{16f}'), + ('\u{170}', '\u{171}'), ('\u{172}', '\u{173}'), ('\u{174}', '\u{175}'), + ('\u{176}', '\u{177}'), ('\u{178}', '\u{ff}'), ('\u{179}', '\u{17a}'), + ('\u{17b}', '\u{17c}'), ('\u{17d}', '\u{17e}'), ('\u{17f}', '\u{73}'), + ('\u{181}', '\u{253}'), ('\u{182}', '\u{183}'), ('\u{184}', '\u{185}'), + ('\u{186}', '\u{254}'), ('\u{187}', '\u{188}'), ('\u{189}', '\u{256}'), + ('\u{18a}', '\u{257}'), ('\u{18b}', '\u{18c}'), ('\u{18e}', '\u{1dd}'), + ('\u{18f}', '\u{259}'), ('\u{190}', '\u{25b}'), ('\u{191}', '\u{192}'), + ('\u{193}', '\u{260}'), ('\u{194}', '\u{263}'), ('\u{196}', '\u{269}'), + ('\u{197}', '\u{268}'), ('\u{198}', '\u{199}'), ('\u{19c}', '\u{26f}'), + ('\u{19d}', '\u{272}'), ('\u{19f}', '\u{275}'), ('\u{1a0}', '\u{1a1}'), + ('\u{1a2}', '\u{1a3}'), ('\u{1a4}', '\u{1a5}'), ('\u{1a6}', '\u{280}'), + ('\u{1a7}', '\u{1a8}'), ('\u{1a9}', '\u{283}'), ('\u{1ac}', '\u{1ad}'), + ('\u{1ae}', '\u{288}'), ('\u{1af}', '\u{1b0}'), ('\u{1b1}', 
'\u{28a}'), + ('\u{1b2}', '\u{28b}'), ('\u{1b3}', '\u{1b4}'), ('\u{1b5}', '\u{1b6}'), + ('\u{1b7}', '\u{292}'), ('\u{1b8}', '\u{1b9}'), ('\u{1bc}', '\u{1bd}'), + ('\u{1c4}', '\u{1c6}'), ('\u{1c5}', '\u{1c6}'), ('\u{1c7}', '\u{1c9}'), + ('\u{1c8}', '\u{1c9}'), ('\u{1ca}', '\u{1cc}'), ('\u{1cb}', '\u{1cc}'), + ('\u{1cd}', '\u{1ce}'), ('\u{1cf}', '\u{1d0}'), ('\u{1d1}', '\u{1d2}'), + ('\u{1d3}', '\u{1d4}'), ('\u{1d5}', '\u{1d6}'), ('\u{1d7}', '\u{1d8}'), + ('\u{1d9}', '\u{1da}'), ('\u{1db}', '\u{1dc}'), ('\u{1de}', '\u{1df}'), + ('\u{1e0}', '\u{1e1}'), ('\u{1e2}', '\u{1e3}'), ('\u{1e4}', '\u{1e5}'), + ('\u{1e6}', '\u{1e7}'), ('\u{1e8}', '\u{1e9}'), ('\u{1ea}', '\u{1eb}'), + ('\u{1ec}', '\u{1ed}'), ('\u{1ee}', '\u{1ef}'), ('\u{1f1}', '\u{1f3}'), + ('\u{1f2}', '\u{1f3}'), ('\u{1f4}', '\u{1f5}'), ('\u{1f6}', '\u{195}'), + ('\u{1f7}', '\u{1bf}'), ('\u{1f8}', '\u{1f9}'), ('\u{1fa}', '\u{1fb}'), + ('\u{1fc}', '\u{1fd}'), ('\u{1fe}', '\u{1ff}'), ('\u{200}', '\u{201}'), + ('\u{202}', '\u{203}'), ('\u{204}', '\u{205}'), ('\u{206}', '\u{207}'), + ('\u{208}', '\u{209}'), ('\u{20a}', '\u{20b}'), ('\u{20c}', '\u{20d}'), + ('\u{20e}', '\u{20f}'), ('\u{210}', '\u{211}'), ('\u{212}', '\u{213}'), + ('\u{214}', '\u{215}'), ('\u{216}', '\u{217}'), ('\u{218}', '\u{219}'), + ('\u{21a}', '\u{21b}'), ('\u{21c}', '\u{21d}'), ('\u{21e}', '\u{21f}'), + ('\u{220}', '\u{19e}'), ('\u{222}', '\u{223}'), ('\u{224}', '\u{225}'), + ('\u{226}', '\u{227}'), ('\u{228}', '\u{229}'), ('\u{22a}', '\u{22b}'), + ('\u{22c}', '\u{22d}'), ('\u{22e}', '\u{22f}'), ('\u{230}', '\u{231}'), + ('\u{232}', '\u{233}'), ('\u{23a}', '\u{2c65}'), ('\u{23b}', '\u{23c}'), + ('\u{23d}', '\u{19a}'), ('\u{23e}', '\u{2c66}'), ('\u{241}', '\u{242}'), + ('\u{243}', '\u{180}'), ('\u{244}', '\u{289}'), ('\u{245}', '\u{28c}'), + ('\u{246}', '\u{247}'), ('\u{248}', '\u{249}'), ('\u{24a}', '\u{24b}'), + ('\u{24c}', '\u{24d}'), ('\u{24e}', '\u{24f}'), ('\u{345}', '\u{3b9}'), + ('\u{370}', '\u{371}'), ('\u{372}', '\u{373}'), ('\u{376}', 
'\u{377}'), + ('\u{37f}', '\u{3f3}'), ('\u{386}', '\u{3ac}'), ('\u{388}', '\u{3ad}'), + ('\u{389}', '\u{3ae}'), ('\u{38a}', '\u{3af}'), ('\u{38c}', '\u{3cc}'), + ('\u{38e}', '\u{3cd}'), ('\u{38f}', '\u{3ce}'), ('\u{391}', '\u{3b1}'), + ('\u{392}', '\u{3b2}'), ('\u{393}', '\u{3b3}'), ('\u{394}', '\u{3b4}'), + ('\u{395}', '\u{3b5}'), ('\u{396}', '\u{3b6}'), ('\u{397}', '\u{3b7}'), + ('\u{398}', '\u{3b8}'), ('\u{399}', '\u{3b9}'), ('\u{39a}', '\u{3ba}'), + ('\u{39b}', '\u{3bb}'), ('\u{39c}', '\u{3bc}'), ('\u{39d}', '\u{3bd}'), + ('\u{39e}', '\u{3be}'), ('\u{39f}', '\u{3bf}'), ('\u{3a0}', '\u{3c0}'), + ('\u{3a1}', '\u{3c1}'), ('\u{3a3}', '\u{3c3}'), ('\u{3a4}', '\u{3c4}'), + ('\u{3a5}', '\u{3c5}'), ('\u{3a6}', '\u{3c6}'), ('\u{3a7}', '\u{3c7}'), + ('\u{3a8}', '\u{3c8}'), ('\u{3a9}', '\u{3c9}'), ('\u{3aa}', '\u{3ca}'), + ('\u{3ab}', '\u{3cb}'), ('\u{3c2}', '\u{3c3}'), ('\u{3cf}', '\u{3d7}'), + ('\u{3d0}', '\u{3b2}'), ('\u{3d1}', '\u{3b8}'), ('\u{3d5}', '\u{3c6}'), + ('\u{3d6}', '\u{3c0}'), ('\u{3d8}', '\u{3d9}'), ('\u{3da}', '\u{3db}'), + ('\u{3dc}', '\u{3dd}'), ('\u{3de}', '\u{3df}'), ('\u{3e0}', '\u{3e1}'), + ('\u{3e2}', '\u{3e3}'), ('\u{3e4}', '\u{3e5}'), ('\u{3e6}', '\u{3e7}'), + ('\u{3e8}', '\u{3e9}'), ('\u{3ea}', '\u{3eb}'), ('\u{3ec}', '\u{3ed}'), + ('\u{3ee}', '\u{3ef}'), ('\u{3f0}', '\u{3ba}'), ('\u{3f1}', '\u{3c1}'), + ('\u{3f4}', '\u{3b8}'), ('\u{3f5}', '\u{3b5}'), ('\u{3f7}', '\u{3f8}'), + ('\u{3f9}', '\u{3f2}'), ('\u{3fa}', '\u{3fb}'), ('\u{3fd}', '\u{37b}'), + ('\u{3fe}', '\u{37c}'), ('\u{3ff}', '\u{37d}'), ('\u{400}', '\u{450}'), + ('\u{401}', '\u{451}'), ('\u{402}', '\u{452}'), ('\u{403}', '\u{453}'), + ('\u{404}', '\u{454}'), ('\u{405}', '\u{455}'), ('\u{406}', '\u{456}'), + ('\u{407}', '\u{457}'), ('\u{408}', '\u{458}'), ('\u{409}', '\u{459}'), + ('\u{40a}', '\u{45a}'), ('\u{40b}', '\u{45b}'), ('\u{40c}', '\u{45c}'), + ('\u{40d}', '\u{45d}'), ('\u{40e}', '\u{45e}'), ('\u{40f}', '\u{45f}'), + ('\u{410}', '\u{430}'), ('\u{411}', '\u{431}'), ('\u{412}', 
'\u{432}'), + ('\u{413}', '\u{433}'), ('\u{414}', '\u{434}'), ('\u{415}', '\u{435}'), + ('\u{416}', '\u{436}'), ('\u{417}', '\u{437}'), ('\u{418}', '\u{438}'), + ('\u{419}', '\u{439}'), ('\u{41a}', '\u{43a}'), ('\u{41b}', '\u{43b}'), + ('\u{41c}', '\u{43c}'), ('\u{41d}', '\u{43d}'), ('\u{41e}', '\u{43e}'), + ('\u{41f}', '\u{43f}'), ('\u{420}', '\u{440}'), ('\u{421}', '\u{441}'), + ('\u{422}', '\u{442}'), ('\u{423}', '\u{443}'), ('\u{424}', '\u{444}'), + ('\u{425}', '\u{445}'), ('\u{426}', '\u{446}'), ('\u{427}', '\u{447}'), + ('\u{428}', '\u{448}'), ('\u{429}', '\u{449}'), ('\u{42a}', '\u{44a}'), + ('\u{42b}', '\u{44b}'), ('\u{42c}', '\u{44c}'), ('\u{42d}', '\u{44d}'), + ('\u{42e}', '\u{44e}'), ('\u{42f}', '\u{44f}'), ('\u{460}', '\u{461}'), + ('\u{462}', '\u{463}'), ('\u{464}', '\u{465}'), ('\u{466}', '\u{467}'), + ('\u{468}', '\u{469}'), ('\u{46a}', '\u{46b}'), ('\u{46c}', '\u{46d}'), + ('\u{46e}', '\u{46f}'), ('\u{470}', '\u{471}'), ('\u{472}', '\u{473}'), + ('\u{474}', '\u{475}'), ('\u{476}', '\u{477}'), ('\u{478}', '\u{479}'), + ('\u{47a}', '\u{47b}'), ('\u{47c}', '\u{47d}'), ('\u{47e}', '\u{47f}'), + ('\u{480}', '\u{481}'), ('\u{48a}', '\u{48b}'), ('\u{48c}', '\u{48d}'), + ('\u{48e}', '\u{48f}'), ('\u{490}', '\u{491}'), ('\u{492}', '\u{493}'), + ('\u{494}', '\u{495}'), ('\u{496}', '\u{497}'), ('\u{498}', '\u{499}'), + ('\u{49a}', '\u{49b}'), ('\u{49c}', '\u{49d}'), ('\u{49e}', '\u{49f}'), + ('\u{4a0}', '\u{4a1}'), ('\u{4a2}', '\u{4a3}'), ('\u{4a4}', '\u{4a5}'), + ('\u{4a6}', '\u{4a7}'), ('\u{4a8}', '\u{4a9}'), ('\u{4aa}', '\u{4ab}'), + ('\u{4ac}', '\u{4ad}'), ('\u{4ae}', '\u{4af}'), ('\u{4b0}', '\u{4b1}'), + ('\u{4b2}', '\u{4b3}'), ('\u{4b4}', '\u{4b5}'), ('\u{4b6}', '\u{4b7}'), + ('\u{4b8}', '\u{4b9}'), ('\u{4ba}', '\u{4bb}'), ('\u{4bc}', '\u{4bd}'), + ('\u{4be}', '\u{4bf}'), ('\u{4c0}', '\u{4cf}'), ('\u{4c1}', '\u{4c2}'), + ('\u{4c3}', '\u{4c4}'), ('\u{4c5}', '\u{4c6}'), ('\u{4c7}', '\u{4c8}'), + ('\u{4c9}', '\u{4ca}'), ('\u{4cb}', '\u{4cc}'), ('\u{4cd}', 
'\u{4ce}'), + ('\u{4d0}', '\u{4d1}'), ('\u{4d2}', '\u{4d3}'), ('\u{4d4}', '\u{4d5}'), + ('\u{4d6}', '\u{4d7}'), ('\u{4d8}', '\u{4d9}'), ('\u{4da}', '\u{4db}'), + ('\u{4dc}', '\u{4dd}'), ('\u{4de}', '\u{4df}'), ('\u{4e0}', '\u{4e1}'), + ('\u{4e2}', '\u{4e3}'), ('\u{4e4}', '\u{4e5}'), ('\u{4e6}', '\u{4e7}'), + ('\u{4e8}', '\u{4e9}'), ('\u{4ea}', '\u{4eb}'), ('\u{4ec}', '\u{4ed}'), + ('\u{4ee}', '\u{4ef}'), ('\u{4f0}', '\u{4f1}'), ('\u{4f2}', '\u{4f3}'), + ('\u{4f4}', '\u{4f5}'), ('\u{4f6}', '\u{4f7}'), ('\u{4f8}', '\u{4f9}'), + ('\u{4fa}', '\u{4fb}'), ('\u{4fc}', '\u{4fd}'), ('\u{4fe}', '\u{4ff}'), + ('\u{500}', '\u{501}'), ('\u{502}', '\u{503}'), ('\u{504}', '\u{505}'), + ('\u{506}', '\u{507}'), ('\u{508}', '\u{509}'), ('\u{50a}', '\u{50b}'), + ('\u{50c}', '\u{50d}'), ('\u{50e}', '\u{50f}'), ('\u{510}', '\u{511}'), + ('\u{512}', '\u{513}'), ('\u{514}', '\u{515}'), ('\u{516}', '\u{517}'), + ('\u{518}', '\u{519}'), ('\u{51a}', '\u{51b}'), ('\u{51c}', '\u{51d}'), + ('\u{51e}', '\u{51f}'), ('\u{520}', '\u{521}'), ('\u{522}', '\u{523}'), + ('\u{524}', '\u{525}'), ('\u{526}', '\u{527}'), ('\u{528}', '\u{529}'), + ('\u{52a}', '\u{52b}'), ('\u{52c}', '\u{52d}'), ('\u{52e}', '\u{52f}'), + ('\u{531}', '\u{561}'), ('\u{532}', '\u{562}'), ('\u{533}', '\u{563}'), + ('\u{534}', '\u{564}'), ('\u{535}', '\u{565}'), ('\u{536}', '\u{566}'), + ('\u{537}', '\u{567}'), ('\u{538}', '\u{568}'), ('\u{539}', '\u{569}'), + ('\u{53a}', '\u{56a}'), ('\u{53b}', '\u{56b}'), ('\u{53c}', '\u{56c}'), + ('\u{53d}', '\u{56d}'), ('\u{53e}', '\u{56e}'), ('\u{53f}', '\u{56f}'), + ('\u{540}', '\u{570}'), ('\u{541}', '\u{571}'), ('\u{542}', '\u{572}'), + ('\u{543}', '\u{573}'), ('\u{544}', '\u{574}'), ('\u{545}', '\u{575}'), + ('\u{546}', '\u{576}'), ('\u{547}', '\u{577}'), ('\u{548}', '\u{578}'), + ('\u{549}', '\u{579}'), ('\u{54a}', '\u{57a}'), ('\u{54b}', '\u{57b}'), + ('\u{54c}', '\u{57c}'), ('\u{54d}', '\u{57d}'), ('\u{54e}', '\u{57e}'), + ('\u{54f}', '\u{57f}'), ('\u{550}', '\u{580}'), ('\u{551}', 
'\u{581}'), + ('\u{552}', '\u{582}'), ('\u{553}', '\u{583}'), ('\u{554}', '\u{584}'), + ('\u{555}', '\u{585}'), ('\u{556}', '\u{586}'), ('\u{10a0}', + '\u{2d00}'), ('\u{10a1}', '\u{2d01}'), ('\u{10a2}', '\u{2d02}'), + ('\u{10a3}', '\u{2d03}'), ('\u{10a4}', '\u{2d04}'), ('\u{10a5}', + '\u{2d05}'), ('\u{10a6}', '\u{2d06}'), ('\u{10a7}', '\u{2d07}'), + ('\u{10a8}', '\u{2d08}'), ('\u{10a9}', '\u{2d09}'), ('\u{10aa}', + '\u{2d0a}'), ('\u{10ab}', '\u{2d0b}'), ('\u{10ac}', '\u{2d0c}'), + ('\u{10ad}', '\u{2d0d}'), ('\u{10ae}', '\u{2d0e}'), ('\u{10af}', + '\u{2d0f}'), ('\u{10b0}', '\u{2d10}'), ('\u{10b1}', '\u{2d11}'), + ('\u{10b2}', '\u{2d12}'), ('\u{10b3}', '\u{2d13}'), ('\u{10b4}', + '\u{2d14}'), ('\u{10b5}', '\u{2d15}'), ('\u{10b6}', '\u{2d16}'), + ('\u{10b7}', '\u{2d17}'), ('\u{10b8}', '\u{2d18}'), ('\u{10b9}', + '\u{2d19}'), ('\u{10ba}', '\u{2d1a}'), ('\u{10bb}', '\u{2d1b}'), + ('\u{10bc}', '\u{2d1c}'), ('\u{10bd}', '\u{2d1d}'), ('\u{10be}', + '\u{2d1e}'), ('\u{10bf}', '\u{2d1f}'), ('\u{10c0}', '\u{2d20}'), + ('\u{10c1}', '\u{2d21}'), ('\u{10c2}', '\u{2d22}'), ('\u{10c3}', + '\u{2d23}'), ('\u{10c4}', '\u{2d24}'), ('\u{10c5}', '\u{2d25}'), + ('\u{10c7}', '\u{2d27}'), ('\u{10cd}', '\u{2d2d}'), ('\u{1e00}', + '\u{1e01}'), ('\u{1e02}', '\u{1e03}'), ('\u{1e04}', '\u{1e05}'), + ('\u{1e06}', '\u{1e07}'), ('\u{1e08}', '\u{1e09}'), ('\u{1e0a}', + '\u{1e0b}'), ('\u{1e0c}', '\u{1e0d}'), ('\u{1e0e}', '\u{1e0f}'), + ('\u{1e10}', '\u{1e11}'), ('\u{1e12}', '\u{1e13}'), ('\u{1e14}', + '\u{1e15}'), ('\u{1e16}', '\u{1e17}'), ('\u{1e18}', '\u{1e19}'), + ('\u{1e1a}', '\u{1e1b}'), ('\u{1e1c}', '\u{1e1d}'), ('\u{1e1e}', + '\u{1e1f}'), ('\u{1e20}', '\u{1e21}'), ('\u{1e22}', '\u{1e23}'), + ('\u{1e24}', '\u{1e25}'), ('\u{1e26}', '\u{1e27}'), ('\u{1e28}', + '\u{1e29}'), ('\u{1e2a}', '\u{1e2b}'), ('\u{1e2c}', '\u{1e2d}'), + ('\u{1e2e}', '\u{1e2f}'), ('\u{1e30}', '\u{1e31}'), ('\u{1e32}', + '\u{1e33}'), ('\u{1e34}', '\u{1e35}'), ('\u{1e36}', '\u{1e37}'), + ('\u{1e38}', '\u{1e39}'), ('\u{1e3a}', 
'\u{1e3b}'), ('\u{1e3c}', + '\u{1e3d}'), ('\u{1e3e}', '\u{1e3f}'), ('\u{1e40}', '\u{1e41}'), + ('\u{1e42}', '\u{1e43}'), ('\u{1e44}', '\u{1e45}'), ('\u{1e46}', + '\u{1e47}'), ('\u{1e48}', '\u{1e49}'), ('\u{1e4a}', '\u{1e4b}'), + ('\u{1e4c}', '\u{1e4d}'), ('\u{1e4e}', '\u{1e4f}'), ('\u{1e50}', + '\u{1e51}'), ('\u{1e52}', '\u{1e53}'), ('\u{1e54}', '\u{1e55}'), + ('\u{1e56}', '\u{1e57}'), ('\u{1e58}', '\u{1e59}'), ('\u{1e5a}', + '\u{1e5b}'), ('\u{1e5c}', '\u{1e5d}'), ('\u{1e5e}', '\u{1e5f}'), + ('\u{1e60}', '\u{1e61}'), ('\u{1e62}', '\u{1e63}'), ('\u{1e64}', + '\u{1e65}'), ('\u{1e66}', '\u{1e67}'), ('\u{1e68}', '\u{1e69}'), + ('\u{1e6a}', '\u{1e6b}'), ('\u{1e6c}', '\u{1e6d}'), ('\u{1e6e}', + '\u{1e6f}'), ('\u{1e70}', '\u{1e71}'), ('\u{1e72}', '\u{1e73}'), + ('\u{1e74}', '\u{1e75}'), ('\u{1e76}', '\u{1e77}'), ('\u{1e78}', + '\u{1e79}'), ('\u{1e7a}', '\u{1e7b}'), ('\u{1e7c}', '\u{1e7d}'), + ('\u{1e7e}', '\u{1e7f}'), ('\u{1e80}', '\u{1e81}'), ('\u{1e82}', + '\u{1e83}'), ('\u{1e84}', '\u{1e85}'), ('\u{1e86}', '\u{1e87}'), + ('\u{1e88}', '\u{1e89}'), ('\u{1e8a}', '\u{1e8b}'), ('\u{1e8c}', + '\u{1e8d}'), ('\u{1e8e}', '\u{1e8f}'), ('\u{1e90}', '\u{1e91}'), + ('\u{1e92}', '\u{1e93}'), ('\u{1e94}', '\u{1e95}'), ('\u{1e9b}', + '\u{1e61}'), ('\u{1e9e}', '\u{df}'), ('\u{1ea0}', '\u{1ea1}'), + ('\u{1ea2}', '\u{1ea3}'), ('\u{1ea4}', '\u{1ea5}'), ('\u{1ea6}', + '\u{1ea7}'), ('\u{1ea8}', '\u{1ea9}'), ('\u{1eaa}', '\u{1eab}'), + ('\u{1eac}', '\u{1ead}'), ('\u{1eae}', '\u{1eaf}'), ('\u{1eb0}', + '\u{1eb1}'), ('\u{1eb2}', '\u{1eb3}'), ('\u{1eb4}', '\u{1eb5}'), + ('\u{1eb6}', '\u{1eb7}'), ('\u{1eb8}', '\u{1eb9}'), ('\u{1eba}', + '\u{1ebb}'), ('\u{1ebc}', '\u{1ebd}'), ('\u{1ebe}', '\u{1ebf}'), + ('\u{1ec0}', '\u{1ec1}'), ('\u{1ec2}', '\u{1ec3}'), ('\u{1ec4}', + '\u{1ec5}'), ('\u{1ec6}', '\u{1ec7}'), ('\u{1ec8}', '\u{1ec9}'), + ('\u{1eca}', '\u{1ecb}'), ('\u{1ecc}', '\u{1ecd}'), ('\u{1ece}', + '\u{1ecf}'), ('\u{1ed0}', '\u{1ed1}'), ('\u{1ed2}', '\u{1ed3}'), + ('\u{1ed4}', '\u{1ed5}'), 
('\u{1ed6}', '\u{1ed7}'), ('\u{1ed8}', + '\u{1ed9}'), ('\u{1eda}', '\u{1edb}'), ('\u{1edc}', '\u{1edd}'), + ('\u{1ede}', '\u{1edf}'), ('\u{1ee0}', '\u{1ee1}'), ('\u{1ee2}', + '\u{1ee3}'), ('\u{1ee4}', '\u{1ee5}'), ('\u{1ee6}', '\u{1ee7}'), + ('\u{1ee8}', '\u{1ee9}'), ('\u{1eea}', '\u{1eeb}'), ('\u{1eec}', + '\u{1eed}'), ('\u{1eee}', '\u{1eef}'), ('\u{1ef0}', '\u{1ef1}'), + ('\u{1ef2}', '\u{1ef3}'), ('\u{1ef4}', '\u{1ef5}'), ('\u{1ef6}', + '\u{1ef7}'), ('\u{1ef8}', '\u{1ef9}'), ('\u{1efa}', '\u{1efb}'), + ('\u{1efc}', '\u{1efd}'), ('\u{1efe}', '\u{1eff}'), ('\u{1f08}', + '\u{1f00}'), ('\u{1f09}', '\u{1f01}'), ('\u{1f0a}', '\u{1f02}'), + ('\u{1f0b}', '\u{1f03}'), ('\u{1f0c}', '\u{1f04}'), ('\u{1f0d}', + '\u{1f05}'), ('\u{1f0e}', '\u{1f06}'), ('\u{1f0f}', '\u{1f07}'), + ('\u{1f18}', '\u{1f10}'), ('\u{1f19}', '\u{1f11}'), ('\u{1f1a}', + '\u{1f12}'), ('\u{1f1b}', '\u{1f13}'), ('\u{1f1c}', '\u{1f14}'), + ('\u{1f1d}', '\u{1f15}'), ('\u{1f28}', '\u{1f20}'), ('\u{1f29}', + '\u{1f21}'), ('\u{1f2a}', '\u{1f22}'), ('\u{1f2b}', '\u{1f23}'), + ('\u{1f2c}', '\u{1f24}'), ('\u{1f2d}', '\u{1f25}'), ('\u{1f2e}', + '\u{1f26}'), ('\u{1f2f}', '\u{1f27}'), ('\u{1f38}', '\u{1f30}'), + ('\u{1f39}', '\u{1f31}'), ('\u{1f3a}', '\u{1f32}'), ('\u{1f3b}', + '\u{1f33}'), ('\u{1f3c}', '\u{1f34}'), ('\u{1f3d}', '\u{1f35}'), + ('\u{1f3e}', '\u{1f36}'), ('\u{1f3f}', '\u{1f37}'), ('\u{1f48}', + '\u{1f40}'), ('\u{1f49}', '\u{1f41}'), ('\u{1f4a}', '\u{1f42}'), + ('\u{1f4b}', '\u{1f43}'), ('\u{1f4c}', '\u{1f44}'), ('\u{1f4d}', + '\u{1f45}'), ('\u{1f59}', '\u{1f51}'), ('\u{1f5b}', '\u{1f53}'), + ('\u{1f5d}', '\u{1f55}'), ('\u{1f5f}', '\u{1f57}'), ('\u{1f68}', + '\u{1f60}'), ('\u{1f69}', '\u{1f61}'), ('\u{1f6a}', '\u{1f62}'), + ('\u{1f6b}', '\u{1f63}'), ('\u{1f6c}', '\u{1f64}'), ('\u{1f6d}', + '\u{1f65}'), ('\u{1f6e}', '\u{1f66}'), ('\u{1f6f}', '\u{1f67}'), + ('\u{1f88}', '\u{1f80}'), ('\u{1f89}', '\u{1f81}'), ('\u{1f8a}', + '\u{1f82}'), ('\u{1f8b}', '\u{1f83}'), ('\u{1f8c}', '\u{1f84}'), + ('\u{1f8d}', 
'\u{1f85}'), ('\u{1f8e}', '\u{1f86}'), ('\u{1f8f}', + '\u{1f87}'), ('\u{1f98}', '\u{1f90}'), ('\u{1f99}', '\u{1f91}'), + ('\u{1f9a}', '\u{1f92}'), ('\u{1f9b}', '\u{1f93}'), ('\u{1f9c}', + '\u{1f94}'), ('\u{1f9d}', '\u{1f95}'), ('\u{1f9e}', '\u{1f96}'), + ('\u{1f9f}', '\u{1f97}'), ('\u{1fa8}', '\u{1fa0}'), ('\u{1fa9}', + '\u{1fa1}'), ('\u{1faa}', '\u{1fa2}'), ('\u{1fab}', '\u{1fa3}'), + ('\u{1fac}', '\u{1fa4}'), ('\u{1fad}', '\u{1fa5}'), ('\u{1fae}', + '\u{1fa6}'), ('\u{1faf}', '\u{1fa7}'), ('\u{1fb8}', '\u{1fb0}'), + ('\u{1fb9}', '\u{1fb1}'), ('\u{1fba}', '\u{1f70}'), ('\u{1fbb}', + '\u{1f71}'), ('\u{1fbc}', '\u{1fb3}'), ('\u{1fbe}', '\u{3b9}'), + ('\u{1fc8}', '\u{1f72}'), ('\u{1fc9}', '\u{1f73}'), ('\u{1fca}', + '\u{1f74}'), ('\u{1fcb}', '\u{1f75}'), ('\u{1fcc}', '\u{1fc3}'), + ('\u{1fd8}', '\u{1fd0}'), ('\u{1fd9}', '\u{1fd1}'), ('\u{1fda}', + '\u{1f76}'), ('\u{1fdb}', '\u{1f77}'), ('\u{1fe8}', '\u{1fe0}'), + ('\u{1fe9}', '\u{1fe1}'), ('\u{1fea}', '\u{1f7a}'), ('\u{1feb}', + '\u{1f7b}'), ('\u{1fec}', '\u{1fe5}'), ('\u{1ff8}', '\u{1f78}'), + ('\u{1ff9}', '\u{1f79}'), ('\u{1ffa}', '\u{1f7c}'), ('\u{1ffb}', + '\u{1f7d}'), ('\u{1ffc}', '\u{1ff3}'), ('\u{2126}', '\u{3c9}'), + ('\u{212a}', '\u{6b}'), ('\u{212b}', '\u{e5}'), ('\u{2132}', + '\u{214e}'), ('\u{2160}', '\u{2170}'), ('\u{2161}', '\u{2171}'), + ('\u{2162}', '\u{2172}'), ('\u{2163}', '\u{2173}'), ('\u{2164}', + '\u{2174}'), ('\u{2165}', '\u{2175}'), ('\u{2166}', '\u{2176}'), + ('\u{2167}', '\u{2177}'), ('\u{2168}', '\u{2178}'), ('\u{2169}', + '\u{2179}'), ('\u{216a}', '\u{217a}'), ('\u{216b}', '\u{217b}'), + ('\u{216c}', '\u{217c}'), ('\u{216d}', '\u{217d}'), ('\u{216e}', + '\u{217e}'), ('\u{216f}', '\u{217f}'), ('\u{2183}', '\u{2184}'), + ('\u{24b6}', '\u{24d0}'), ('\u{24b7}', '\u{24d1}'), ('\u{24b8}', + '\u{24d2}'), ('\u{24b9}', '\u{24d3}'), ('\u{24ba}', '\u{24d4}'), + ('\u{24bb}', '\u{24d5}'), ('\u{24bc}', '\u{24d6}'), ('\u{24bd}', + '\u{24d7}'), ('\u{24be}', '\u{24d8}'), ('\u{24bf}', '\u{24d9}'), + 
('\u{24c0}', '\u{24da}'), ('\u{24c1}', '\u{24db}'), ('\u{24c2}', + '\u{24dc}'), ('\u{24c3}', '\u{24dd}'), ('\u{24c4}', '\u{24de}'), + ('\u{24c5}', '\u{24df}'), ('\u{24c6}', '\u{24e0}'), ('\u{24c7}', + '\u{24e1}'), ('\u{24c8}', '\u{24e2}'), ('\u{24c9}', '\u{24e3}'), + ('\u{24ca}', '\u{24e4}'), ('\u{24cb}', '\u{24e5}'), ('\u{24cc}', + '\u{24e6}'), ('\u{24cd}', '\u{24e7}'), ('\u{24ce}', '\u{24e8}'), + ('\u{24cf}', '\u{24e9}'), ('\u{2c00}', '\u{2c30}'), ('\u{2c01}', + '\u{2c31}'), ('\u{2c02}', '\u{2c32}'), ('\u{2c03}', '\u{2c33}'), + ('\u{2c04}', '\u{2c34}'), ('\u{2c05}', '\u{2c35}'), ('\u{2c06}', + '\u{2c36}'), ('\u{2c07}', '\u{2c37}'), ('\u{2c08}', '\u{2c38}'), + ('\u{2c09}', '\u{2c39}'), ('\u{2c0a}', '\u{2c3a}'), ('\u{2c0b}', + '\u{2c3b}'), ('\u{2c0c}', '\u{2c3c}'), ('\u{2c0d}', '\u{2c3d}'), + ('\u{2c0e}', '\u{2c3e}'), ('\u{2c0f}', '\u{2c3f}'), ('\u{2c10}', + '\u{2c40}'), ('\u{2c11}', '\u{2c41}'), ('\u{2c12}', '\u{2c42}'), + ('\u{2c13}', '\u{2c43}'), ('\u{2c14}', '\u{2c44}'), ('\u{2c15}', + '\u{2c45}'), ('\u{2c16}', '\u{2c46}'), ('\u{2c17}', '\u{2c47}'), + ('\u{2c18}', '\u{2c48}'), ('\u{2c19}', '\u{2c49}'), ('\u{2c1a}', + '\u{2c4a}'), ('\u{2c1b}', '\u{2c4b}'), ('\u{2c1c}', '\u{2c4c}'), + ('\u{2c1d}', '\u{2c4d}'), ('\u{2c1e}', '\u{2c4e}'), ('\u{2c1f}', + '\u{2c4f}'), ('\u{2c20}', '\u{2c50}'), ('\u{2c21}', '\u{2c51}'), + ('\u{2c22}', '\u{2c52}'), ('\u{2c23}', '\u{2c53}'), ('\u{2c24}', + '\u{2c54}'), ('\u{2c25}', '\u{2c55}'), ('\u{2c26}', '\u{2c56}'), + ('\u{2c27}', '\u{2c57}'), ('\u{2c28}', '\u{2c58}'), ('\u{2c29}', + '\u{2c59}'), ('\u{2c2a}', '\u{2c5a}'), ('\u{2c2b}', '\u{2c5b}'), + ('\u{2c2c}', '\u{2c5c}'), ('\u{2c2d}', '\u{2c5d}'), ('\u{2c2e}', + '\u{2c5e}'), ('\u{2c60}', '\u{2c61}'), ('\u{2c62}', '\u{26b}'), + ('\u{2c63}', '\u{1d7d}'), ('\u{2c64}', '\u{27d}'), ('\u{2c67}', + '\u{2c68}'), ('\u{2c69}', '\u{2c6a}'), ('\u{2c6b}', '\u{2c6c}'), + ('\u{2c6d}', '\u{251}'), ('\u{2c6e}', '\u{271}'), ('\u{2c6f}', + '\u{250}'), ('\u{2c70}', '\u{252}'), ('\u{2c72}', 
'\u{2c73}'), + ('\u{2c75}', '\u{2c76}'), ('\u{2c7e}', '\u{23f}'), ('\u{2c7f}', + '\u{240}'), ('\u{2c80}', '\u{2c81}'), ('\u{2c82}', '\u{2c83}'), + ('\u{2c84}', '\u{2c85}'), ('\u{2c86}', '\u{2c87}'), ('\u{2c88}', + '\u{2c89}'), ('\u{2c8a}', '\u{2c8b}'), ('\u{2c8c}', '\u{2c8d}'), + ('\u{2c8e}', '\u{2c8f}'), ('\u{2c90}', '\u{2c91}'), ('\u{2c92}', + '\u{2c93}'), ('\u{2c94}', '\u{2c95}'), ('\u{2c96}', '\u{2c97}'), + ('\u{2c98}', '\u{2c99}'), ('\u{2c9a}', '\u{2c9b}'), ('\u{2c9c}', + '\u{2c9d}'), ('\u{2c9e}', '\u{2c9f}'), ('\u{2ca0}', '\u{2ca1}'), + ('\u{2ca2}', '\u{2ca3}'), ('\u{2ca4}', '\u{2ca5}'), ('\u{2ca6}', + '\u{2ca7}'), ('\u{2ca8}', '\u{2ca9}'), ('\u{2caa}', '\u{2cab}'), + ('\u{2cac}', '\u{2cad}'), ('\u{2cae}', '\u{2caf}'), ('\u{2cb0}', + '\u{2cb1}'), ('\u{2cb2}', '\u{2cb3}'), ('\u{2cb4}', '\u{2cb5}'), + ('\u{2cb6}', '\u{2cb7}'), ('\u{2cb8}', '\u{2cb9}'), ('\u{2cba}', + '\u{2cbb}'), ('\u{2cbc}', '\u{2cbd}'), ('\u{2cbe}', '\u{2cbf}'), + ('\u{2cc0}', '\u{2cc1}'), ('\u{2cc2}', '\u{2cc3}'), ('\u{2cc4}', + '\u{2cc5}'), ('\u{2cc6}', '\u{2cc7}'), ('\u{2cc8}', '\u{2cc9}'), + ('\u{2cca}', '\u{2ccb}'), ('\u{2ccc}', '\u{2ccd}'), ('\u{2cce}', + '\u{2ccf}'), ('\u{2cd0}', '\u{2cd1}'), ('\u{2cd2}', '\u{2cd3}'), + ('\u{2cd4}', '\u{2cd5}'), ('\u{2cd6}', '\u{2cd7}'), ('\u{2cd8}', + '\u{2cd9}'), ('\u{2cda}', '\u{2cdb}'), ('\u{2cdc}', '\u{2cdd}'), + ('\u{2cde}', '\u{2cdf}'), ('\u{2ce0}', '\u{2ce1}'), ('\u{2ce2}', + '\u{2ce3}'), ('\u{2ceb}', '\u{2cec}'), ('\u{2ced}', '\u{2cee}'), + ('\u{2cf2}', '\u{2cf3}'), ('\u{a640}', '\u{a641}'), ('\u{a642}', + '\u{a643}'), ('\u{a644}', '\u{a645}'), ('\u{a646}', '\u{a647}'), + ('\u{a648}', '\u{a649}'), ('\u{a64a}', '\u{a64b}'), ('\u{a64c}', + '\u{a64d}'), ('\u{a64e}', '\u{a64f}'), ('\u{a650}', '\u{a651}'), + ('\u{a652}', '\u{a653}'), ('\u{a654}', '\u{a655}'), ('\u{a656}', + '\u{a657}'), ('\u{a658}', '\u{a659}'), ('\u{a65a}', '\u{a65b}'), + ('\u{a65c}', '\u{a65d}'), ('\u{a65e}', '\u{a65f}'), ('\u{a660}', + '\u{a661}'), ('\u{a662}', '\u{a663}'), 
('\u{a664}', '\u{a665}'), + ('\u{a666}', '\u{a667}'), ('\u{a668}', '\u{a669}'), ('\u{a66a}', + '\u{a66b}'), ('\u{a66c}', '\u{a66d}'), ('\u{a680}', '\u{a681}'), + ('\u{a682}', '\u{a683}'), ('\u{a684}', '\u{a685}'), ('\u{a686}', + '\u{a687}'), ('\u{a688}', '\u{a689}'), ('\u{a68a}', '\u{a68b}'), + ('\u{a68c}', '\u{a68d}'), ('\u{a68e}', '\u{a68f}'), ('\u{a690}', + '\u{a691}'), ('\u{a692}', '\u{a693}'), ('\u{a694}', '\u{a695}'), + ('\u{a696}', '\u{a697}'), ('\u{a698}', '\u{a699}'), ('\u{a69a}', + '\u{a69b}'), ('\u{a722}', '\u{a723}'), ('\u{a724}', '\u{a725}'), + ('\u{a726}', '\u{a727}'), ('\u{a728}', '\u{a729}'), ('\u{a72a}', + '\u{a72b}'), ('\u{a72c}', '\u{a72d}'), ('\u{a72e}', '\u{a72f}'), + ('\u{a732}', '\u{a733}'), ('\u{a734}', '\u{a735}'), ('\u{a736}', + '\u{a737}'), ('\u{a738}', '\u{a739}'), ('\u{a73a}', '\u{a73b}'), + ('\u{a73c}', '\u{a73d}'), ('\u{a73e}', '\u{a73f}'), ('\u{a740}', + '\u{a741}'), ('\u{a742}', '\u{a743}'), ('\u{a744}', '\u{a745}'), + ('\u{a746}', '\u{a747}'), ('\u{a748}', '\u{a749}'), ('\u{a74a}', + '\u{a74b}'), ('\u{a74c}', '\u{a74d}'), ('\u{a74e}', '\u{a74f}'), + ('\u{a750}', '\u{a751}'), ('\u{a752}', '\u{a753}'), ('\u{a754}', + '\u{a755}'), ('\u{a756}', '\u{a757}'), ('\u{a758}', '\u{a759}'), + ('\u{a75a}', '\u{a75b}'), ('\u{a75c}', '\u{a75d}'), ('\u{a75e}', + '\u{a75f}'), ('\u{a760}', '\u{a761}'), ('\u{a762}', '\u{a763}'), + ('\u{a764}', '\u{a765}'), ('\u{a766}', '\u{a767}'), ('\u{a768}', + '\u{a769}'), ('\u{a76a}', '\u{a76b}'), ('\u{a76c}', '\u{a76d}'), + ('\u{a76e}', '\u{a76f}'), ('\u{a779}', '\u{a77a}'), ('\u{a77b}', + '\u{a77c}'), ('\u{a77d}', '\u{1d79}'), ('\u{a77e}', '\u{a77f}'), + ('\u{a780}', '\u{a781}'), ('\u{a782}', '\u{a783}'), ('\u{a784}', + '\u{a785}'), ('\u{a786}', '\u{a787}'), ('\u{a78b}', '\u{a78c}'), + ('\u{a78d}', '\u{265}'), ('\u{a790}', '\u{a791}'), ('\u{a792}', + '\u{a793}'), ('\u{a796}', '\u{a797}'), ('\u{a798}', '\u{a799}'), + ('\u{a79a}', '\u{a79b}'), ('\u{a79c}', '\u{a79d}'), ('\u{a79e}', + '\u{a79f}'), ('\u{a7a0}', 
'\u{a7a1}'), ('\u{a7a2}', '\u{a7a3}'), + ('\u{a7a4}', '\u{a7a5}'), ('\u{a7a6}', '\u{a7a7}'), ('\u{a7a8}', + '\u{a7a9}'), ('\u{a7aa}', '\u{266}'), ('\u{a7ab}', '\u{25c}'), + ('\u{a7ac}', '\u{261}'), ('\u{a7ad}', '\u{26c}'), ('\u{a7b0}', + '\u{29e}'), ('\u{a7b1}', '\u{287}'), ('\u{ff21}', '\u{ff41}'), + ('\u{ff22}', '\u{ff42}'), ('\u{ff23}', '\u{ff43}'), ('\u{ff24}', + '\u{ff44}'), ('\u{ff25}', '\u{ff45}'), ('\u{ff26}', '\u{ff46}'), + ('\u{ff27}', '\u{ff47}'), ('\u{ff28}', '\u{ff48}'), ('\u{ff29}', + '\u{ff49}'), ('\u{ff2a}', '\u{ff4a}'), ('\u{ff2b}', '\u{ff4b}'), + ('\u{ff2c}', '\u{ff4c}'), ('\u{ff2d}', '\u{ff4d}'), ('\u{ff2e}', + '\u{ff4e}'), ('\u{ff2f}', '\u{ff4f}'), ('\u{ff30}', '\u{ff50}'), + ('\u{ff31}', '\u{ff51}'), ('\u{ff32}', '\u{ff52}'), ('\u{ff33}', + '\u{ff53}'), ('\u{ff34}', '\u{ff54}'), ('\u{ff35}', '\u{ff55}'), + ('\u{ff36}', '\u{ff56}'), ('\u{ff37}', '\u{ff57}'), ('\u{ff38}', + '\u{ff58}'), ('\u{ff39}', '\u{ff59}'), ('\u{ff3a}', '\u{ff5a}'), + ('\u{10400}', '\u{10428}'), ('\u{10401}', '\u{10429}'), ('\u{10402}', + '\u{1042a}'), ('\u{10403}', '\u{1042b}'), ('\u{10404}', '\u{1042c}'), + ('\u{10405}', '\u{1042d}'), ('\u{10406}', '\u{1042e}'), ('\u{10407}', + '\u{1042f}'), ('\u{10408}', '\u{10430}'), ('\u{10409}', '\u{10431}'), + ('\u{1040a}', '\u{10432}'), ('\u{1040b}', '\u{10433}'), ('\u{1040c}', + '\u{10434}'), ('\u{1040d}', '\u{10435}'), ('\u{1040e}', '\u{10436}'), + ('\u{1040f}', '\u{10437}'), ('\u{10410}', '\u{10438}'), ('\u{10411}', + '\u{10439}'), ('\u{10412}', '\u{1043a}'), ('\u{10413}', '\u{1043b}'), + ('\u{10414}', '\u{1043c}'), ('\u{10415}', '\u{1043d}'), ('\u{10416}', + '\u{1043e}'), ('\u{10417}', '\u{1043f}'), ('\u{10418}', '\u{10440}'), + ('\u{10419}', '\u{10441}'), ('\u{1041a}', '\u{10442}'), ('\u{1041b}', + '\u{10443}'), ('\u{1041c}', '\u{10444}'), ('\u{1041d}', '\u{10445}'), + ('\u{1041e}', '\u{10446}'), ('\u{1041f}', '\u{10447}'), ('\u{10420}', + '\u{10448}'), ('\u{10421}', '\u{10449}'), ('\u{10422}', '\u{1044a}'), + ('\u{10423}', 
'\u{1044b}'), ('\u{10424}', '\u{1044c}'), ('\u{10425}', + '\u{1044d}'), ('\u{10426}', '\u{1044e}'), ('\u{10427}', '\u{1044f}'), + ('\u{118a0}', '\u{118c0}'), ('\u{118a1}', '\u{118c1}'), ('\u{118a2}', + '\u{118c2}'), ('\u{118a3}', '\u{118c3}'), ('\u{118a4}', '\u{118c4}'), + ('\u{118a5}', '\u{118c5}'), ('\u{118a6}', '\u{118c6}'), ('\u{118a7}', + '\u{118c7}'), ('\u{118a8}', '\u{118c8}'), ('\u{118a9}', '\u{118c9}'), + ('\u{118aa}', '\u{118ca}'), ('\u{118ab}', '\u{118cb}'), ('\u{118ac}', + '\u{118cc}'), ('\u{118ad}', '\u{118cd}'), ('\u{118ae}', '\u{118ce}'), + ('\u{118af}', '\u{118cf}'), ('\u{118b0}', '\u{118d0}'), ('\u{118b1}', + '\u{118d1}'), ('\u{118b2}', '\u{118d2}'), ('\u{118b3}', '\u{118d3}'), + ('\u{118b4}', '\u{118d4}'), ('\u{118b5}', '\u{118d5}'), ('\u{118b6}', + '\u{118d6}'), ('\u{118b7}', '\u{118d7}'), ('\u{118b8}', '\u{118d8}'), + ('\u{118b9}', '\u{118d9}'), ('\u{118ba}', '\u{118da}'), ('\u{118bb}', + '\u{118db}'), ('\u{118bc}', '\u{118dc}'), ('\u{118bd}', '\u{118dd}'), + ('\u{118be}', '\u{118de}'), ('\u{118bf}', '\u{118df}') + ]; + +} + diff --git a/src/vm.rs b/src/vm.rs index 10c8bae268..84da8e5089 100644 --- a/src/vm.rs +++ b/src/vm.rs @@ -47,6 +47,7 @@ use compile::Inst::{ }; use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; use unicode::regex::PERLW; +use unicode::case_folding; pub type CaptureLocs = Vec>; @@ -232,7 +233,7 @@ impl<'r, 't> Nfa<'r, 't> { if let Some(mut c) = self.chars.prev { let negate = flags & FLAG_NEGATED > 0; if flags & FLAG_NOCASE > 0 { - c = c.to_uppercase().next().unwrap(); + c = simple_case_fold(c); } let found = ranges.binary_search_by(|&rc| class_cmp(c, rc)).is_ok(); if found ^ negate { @@ -326,19 +327,14 @@ impl<'r, 't> Nfa<'r, 't> { } } - // FIXME: For case insensitive comparisons, it uses the uppercase - // character and tests for equality. IIUC, this does not generalize to - // all of Unicode. I believe we need to check the entire fold for each - // character. 
This will be easy to add if and when it gets added to Rust's - // standard library. + // Use Unicode simple case folding for case insensitive comparisons, + // as we’re matching individual code points. #[inline] fn char_eq(&self, casei: bool, textc: Option, regc: char) -> bool { match textc { None => false, Some(textc) => { - let uregc = regc.to_uppercase().next().unwrap(); - let utextc = textc.to_uppercase().next().unwrap(); - regc == textc || (casei && uregc == utextc) + regc == textc || (casei && simple_case_fold(regc) == simple_case_fold(textc)) } } } @@ -536,6 +532,18 @@ pub fn is_word(c: Option) -> bool { } } + +/// Returns the Unicode *simple* case folding of `c`. +/// Uses the mappings with status C + S from Unicode’s `CaseFolding.txt`. +/// This is not as “correct” as full case folding, but preserves the number of code points. +pub fn simple_case_fold(c: char) -> char { + match case_folding::C_plus_S_table.binary_search_by(|&(x, _)| x.cmp(&c)) { + Ok(i) => case_folding::C_plus_S_table[i].1, + Err(_) => c + } +} + + /// Given a character and a single character class range, return an ordering /// indicating whether the character is less than the start of the range, /// in the range (inclusive) or greater than the end of the range.