diff --git a/mk/crates.mk b/mk/crates.mk index 28330010dc2ac..fc0afa6df6271 100644 --- a/mk/crates.mk +++ b/mk/crates.mk @@ -51,8 +51,8 @@ TARGET_CRATES := libc std green rustuv native flate arena glob term semver \ uuid serialize sync getopts collections num test time rand \ - workcache url log -HOST_CRATES := syntax rustc rustdoc fourcc hexfloat + workcache url log regex +HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros CRATES := $(TARGET_CRATES) $(HOST_CRATES) TOOLS := compiletest rustdoc rustc @@ -84,6 +84,8 @@ DEPS_rand := std DEPS_url := std collections DEPS_workcache := std serialize collections log DEPS_log := std sync +DEPS_regex := std collections +DEPS_regex_macros = syntax std regex TOOL_DEPS_compiletest := test green rustuv getopts TOOL_DEPS_rustdoc := rustdoc native diff --git a/mk/main.mk b/mk/main.mk index 24ab522ec6098..685dd0b51829b 100644 --- a/mk/main.mk +++ b/mk/main.mk @@ -311,8 +311,6 @@ HSREQ$(1)_H_$(3) = $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) else HSREQ$(1)_H_$(3) = \ $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) \ - $$(HLIB$(1)_H_$(3))/stamp.rustc \ - $$(foreach dep,$$(RUST_DEPS_rustc),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) \ $$(MKFILE_DEPS) endif @@ -334,8 +332,7 @@ SREQ$(1)_T_$(2)_H_$(3) = \ CSREQ$(1)_T_$(2)_H_$(3) = \ $$(TSREQ$(1)_T_$(2)_H_$(3)) \ $$(HBIN$(1)_H_$(3))/rustdoc$$(X_$(3)) \ - $$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) \ - $$(foreach dep,$$(HOST_CRATES),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) + $$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) ifeq ($(1),0) # Don't run the stage0 compiler under valgrind - that ship has sailed diff --git a/src/README.md b/src/README.md index 0ac310df1b866..de9a793bafc96 100644 --- a/src/README.md +++ b/src/README.md @@ -19,6 +19,7 @@ Source layout: | `libfourcc/` | Data format identifier library | | `libgetopts/` | Get command-line-options library | | `libglob/` | Unix glob patterns library | +| `libregex/` | Regular expressions | | `libsemver/` | Rust's 
semantic versioning library | | `libserialize/` | Encode-Decode types library | | `libsync/` | Concurrency mechanisms and primitives | diff --git a/src/doc/index.md b/src/doc/index.md index 57d75d7fc469e..0bfc9baaa1688 100644 --- a/src/doc/index.md +++ b/src/doc/index.md @@ -41,6 +41,7 @@ li {list-style-type: none; } * [The `native` 1:1 threading runtime](native/index.html) * [The `num` arbitrary precision numerics library](num/index.html) * [The `rand` library for random numbers and distributions](rand/index.html) +* [The `regex` library for regular expressions](regex/index.html) * [The `rustc` compiler](rustc/index.html) * [The `rustuv` M:N I/O library](rustuv/index.html) * [The `semver` version collation library](semver/index.html) diff --git a/src/etc/regex-match-tests.py b/src/etc/regex-match-tests.py new file mode 100755 index 0000000000000..826af961fce06 --- /dev/null +++ b/src/etc/regex-match-tests.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python2 + +# Copyright 2014 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +from __future__ import absolute_import, division, print_function +import argparse +import datetime +import os.path as path + + +def print_tests(tests): + print('\n'.join([test_tostr(t) for t in tests])) + + +def read_tests(f): + basename, _ = path.splitext(path.basename(f)) + tests = [] + for lineno, line in enumerate(open(f), 1): + fields = filter(None, map(str.strip, line.split('\t'))) + if not (4 <= len(fields) <= 5) \ + or 'E' not in fields[0] or fields[0][0] == '#': + continue + + opts, pat, text, sgroups = fields[0:4] + groups = [] # groups as integer ranges + if sgroups == 'NOMATCH': + groups = [None] + elif ',' in sgroups: + noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) + for g in noparen: + s, e = map(str.strip, g.split(',')) + if s == '?' and e == '?': + groups.append(None) + else: + groups.append((int(s), int(e))) + else: + # This skips tests that should result in an error. + # There aren't many, so I think we can just capture those + # manually. Possibly fix this in future. + continue + + if pat == 'SAME': + pat = tests[-1][1] + if '$' in opts: + pat = pat.decode('string_escape') + text = text.decode('string_escape') + if 'i' in opts: + pat = '(?i)%s' % pat + + name = '%s_%d' % (basename, lineno) + tests.append((name, pat, text, groups)) + return tests + + +def test_tostr(t): + lineno, pat, text, groups = t + options = map(group_tostr, groups) + return 'mat!(match_%s, r"%s", r"%s", %s)' \ + % (lineno, pat, '' if text == "NULL" else text, ', '.join(options)) + + +def group_tostr(g): + if g is None: + return 'None' + else: + return 'Some((%d, %d))' % (g[0], g[1]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate match tests from an AT&T POSIX test file.') + aa = parser.add_argument + aa('files', nargs='+', + help='A list of dat AT&T POSIX test files. 
See src/libregexp/testdata') + args = parser.parse_args() + + tests = [] + for f in args.files: + tests += read_tests(f) + + tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// ignore-tidy-linelength + +// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests' +// on {date}. +''' + print(tpl.format(date=str(datetime.datetime.now()))) + + for f in args.files: + print('// Tests from %s' % path.basename(f)) + print_tests(read_tests(f)) + print('') diff --git a/src/etc/regex-unicode-tables.py b/src/etc/regex-unicode-tables.py new file mode 100755 index 0000000000000..5dc404736a403 --- /dev/null +++ b/src/etc/regex-unicode-tables.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python2 + +# Copyright 2014 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +from __future__ import absolute_import, division, print_function +import argparse +from collections import defaultdict +import csv +import datetime +import urllib2 + +BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/' +DATA = 'UnicodeData.txt' +SCRIPTS = 'Scripts.txt' + +# Mapping taken from Table 12 from: +# http://www.unicode.org/reports/tr44/#General_Category_Values +expanded_categories = { + 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], + 'Lm': ['L'], 'Lo': ['L'], + 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], + 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], + 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], + 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], + 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], + 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], + 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], +} + + +def as_4byte_uni(n): + s = hex(n)[2:] + return '\\U%s%s' % ('0' * (8 - len(s)), s) + + +def expand_cat(c): + return expanded_categories.get(c, []) + [c] + + +def is_valid_unicode(n): + return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF + + +def read_cats(f): + assigned = defaultdict(list) + for row in csv.reader(f, delimiter=';'): + (hex, cats) = (int(row[0], 16), expand_cat(row[2])) + if not is_valid_unicode(hex): + continue + for cat in cats: + assigned[cat].append(hex) + return assigned + + +def read_scripts(f): + assigned = defaultdict(list) + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + hexes, name = map(str.strip, line.split(';'))[:2] + name = name[:name.index('#')].strip() + if '..' 
not in hexes: + hex = int(hexes, 16) + if is_valid_unicode(hex): + assigned[name].append(hex) + else: + hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..')) + for hex in xrange(hex1, hex2 + 1): + if is_valid_unicode(hex): + assigned[name].append(hex) + return assigned + + +def group(letters): + letters = sorted(set(letters)) + grouped = [] + cur_start = letters.pop(0) + cur_end = cur_start + for letter in letters: + assert letter > cur_end, \ + 'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter)) + + if letter == cur_end + 1: + cur_end = letter + else: + grouped.append((cur_start, cur_end)) + cur_start, cur_end = letter, letter + grouped.append((cur_start, cur_end)) + return grouped + + +def ranges_to_rust(rs): + rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs) + return ',\n '.join(rs) + + +def groups_to_rust(groups): + rust_groups = [] + for group_name in sorted(groups): + rust_groups.append('("%s", &[\n %s\n ]),' + % (group_name, ranges_to_rust(groups[group_name]))) + return '\n'.join(rust_groups) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate Unicode character class tables.') + aa = parser.add_argument + aa('--local', action='store_true', + help='When set, Scripts.txt and UnicodeData.txt will be read from ' + 'the CWD.') + aa('--base-url', type=str, default=BASE_URL, + help='The base URL to use for downloading Unicode data files.') + args = parser.parse_args() + + if args.local: + cats = read_cats(open(DATA)) + scripts = read_scripts(open(SCRIPTS)) + else: + cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA)) + scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS)) + + # Get Rust code for all Unicode general categories and scripts. + combined = dict(cats, **scripts) + unigroups = groups_to_rust({k: group(letters) + for k, letters in combined.items()}) + + # Now get Perl character classes that are Unicode friendly. 
+ perld = range(ord('0'), ord('9') + 1) + dgroups = ranges_to_rust(group(perld + cats['Nd'][:])) + + perls = map(ord, ['\t', '\n', '\x0C', '\r', ' ']) + sgroups = ranges_to_rust(group(perls + cats['Z'][:])) + + low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1)) + perlw = [ord('_')] + perld + low + up + wgroups = ranges_to_rust(group(perlw + cats['L'][:])) + + tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables' +// on {date}. + +use parse::{{Class, NamedClasses}}; + +pub static UNICODE_CLASSES: NamedClasses = &[ + +{groups} + +]; + +pub static PERLD: Class = &[ + {dgroups} +]; + +pub static PERLS: Class = &[ + {sgroups} +]; + +pub static PERLW: Class = &[ + {wgroups} +]; +''' + now = datetime.datetime.now() + print(tpl.format(date=str(now), groups=unigroups, + dgroups=dgroups, sgroups=sgroups, wgroups=wgroups)) diff --git a/src/libregex/compile.rs b/src/libregex/compile.rs new file mode 100644 index 0000000000000..3987d75505099 --- /dev/null +++ b/src/libregex/compile.rs @@ -0,0 +1,274 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// Enable this to squash warnings due to exporting pieces of the representation +// for use with the regex! macro. See lib.rs for explanation. 
+#![allow(visible_private_types)] + +use std::cmp; +use std::iter; +use parse; +use parse::{ + Flags, FLAG_EMPTY, + Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt, + Rep, + ZeroOne, ZeroMore, OneMore, +}; + +type InstIdx = uint; + +#[deriving(Show, Clone)] +pub enum Inst { + // When a Match instruction is executed, the current thread is successful. + Match, + + // The OneChar instruction matches a literal character. + // The flags indicate whether to do a case insensitive match. + OneChar(char, Flags), + + // The CharClass instruction tries to match one input character against + // the range of characters given. + // The flags indicate whether to do a case insentivie match and whether + // the character class is negated or not. + CharClass(Vec<(char, char)>, Flags), + + // Matches any character except new lines. + // The flags indicate whether to include the '\n' character. + Any(Flags), + + // Matches the beginning of the string, consumes no characters. + // The flags indicate whether it matches if the preceding character + // is a new line. + EmptyBegin(Flags), + + // Matches the end of the string, consumes no characters. + // The flags indicate whether it matches if the proceding character + // is a new line. + EmptyEnd(Flags), + + // Matches a word boundary (\w on one side and \W \A or \z on the other), + // and consumes no character. + // The flags indicate whether this matches a word boundary or something + // that isn't a word boundary. + EmptyWordBoundary(Flags), + + // Saves the current position in the input string to the Nth save slot. + Save(uint), + + // Jumps to the instruction at the index given. + Jump(InstIdx), + + // Jumps to the instruction at the first index given. If that leads to + // a failing state, then the instruction at the second index given is + // tried. + Split(InstIdx, InstIdx), +} + +/// Program represents a compiled regular expression. 
Once an expression is +/// compiled, its representation is immutable and will never change. +/// +/// All of the data in a compiled expression is wrapped in "MaybeStatic" or +/// "MaybeOwned" types so that a `Program` can be represented as static data. +/// (This makes it convenient and efficient for use with the `regex!` macro.) +#[deriving(Clone)] +pub struct Program { + /// A sequence of instructions. + pub insts: Vec, + /// If the regular expression requires a literal prefix in order to have a + /// match, that prefix is stored here. (It's used in the VM to implement + /// an optimization.) + pub prefix: ~str, +} + +impl Program { + /// Compiles a Regex given its AST. + pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) { + let mut c = Compiler { + insts: Vec::with_capacity(100), + names: Vec::with_capacity(10), + }; + + c.insts.push(Save(0)); + c.compile(ast); + c.insts.push(Save(1)); + c.insts.push(Match); + + // Try to discover a literal string prefix. + // This is a bit hacky since we have to skip over the initial + // 'Save' instruction. + let mut pre = StrBuf::with_capacity(5); + for i in iter::range(1, c.insts.len()) { + match *c.insts.get(i) { + OneChar(c, FLAG_EMPTY) => pre.push_char(c), + _ => break + } + } + + let names = c.names.as_slice().into_owned(); + let prog = Program { + insts: c.insts, + prefix: pre.into_owned(), + }; + (prog, names) + } + + /// Returns the total number of capture groups in the regular expression. + /// This includes the zeroth capture. + pub fn num_captures(&self) -> uint { + let mut n = 0; + for inst in self.insts.iter() { + match *inst { + Save(c) => n = cmp::max(n, c+1), + _ => {} + } + } + // There's exactly 2 Save slots for every capture. + n / 2 + } +} + +struct Compiler<'r> { + insts: Vec, + names: Vec>, +} + +// The compiler implemented here is extremely simple. Most of the complexity +// in this crate is in the parser or the VM. 
+// The only tricky thing here is patching jump/split instructions to point to +// the right instruction. +impl<'r> Compiler<'r> { + fn compile(&mut self, ast: ~parse::Ast) { + match ast { + ~Nothing => {}, + ~Literal(c, flags) => self.push(OneChar(c, flags)), + ~Dot(nl) => self.push(Any(nl)), + ~Class(ranges, flags) => + self.push(CharClass(ranges, flags)), + ~Begin(flags) => self.push(EmptyBegin(flags)), + ~End(flags) => self.push(EmptyEnd(flags)), + ~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), + ~Capture(cap, name, x) => { + let len = self.names.len(); + if cap >= len { + self.names.grow(10 + cap - len, &None) + } + *self.names.get_mut(cap) = name; + + self.push(Save(2 * cap)); + self.compile(x); + self.push(Save(2 * cap + 1)); + } + ~Cat(xs) => { + for x in xs.move_iter() { + self.compile(x) + } + } + ~Alt(x, y) => { + let split = self.empty_split(); // push: split 0, 0 + let j1 = self.insts.len(); + self.compile(x); // push: insts for x + let jmp = self.empty_jump(); // push: jmp 0 + let j2 = self.insts.len(); + self.compile(y); // push: insts for y + let j3 = self.insts.len(); + + self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 + self.set_jump(jmp, j3); // jmp 0 -> jmp j3 + } + ~Rep(x, ZeroOne, g) => { + let split = self.empty_split(); + let j1 = self.insts.len(); + self.compile(x); + let j2 = self.insts.len(); + + if g.is_greedy() { + self.set_split(split, j1, j2); + } else { + self.set_split(split, j2, j1); + } + } + ~Rep(x, ZeroMore, g) => { + let j1 = self.insts.len(); + let split = self.empty_split(); + let j2 = self.insts.len(); + self.compile(x); + let jmp = self.empty_jump(); + let j3 = self.insts.len(); + + self.set_jump(jmp, j1); + if g.is_greedy() { + self.set_split(split, j2, j3); + } else { + self.set_split(split, j3, j2); + } + } + ~Rep(x, OneMore, g) => { + let j1 = self.insts.len(); + self.compile(x); + let split = self.empty_split(); + let j2 = self.insts.len(); + + if g.is_greedy() { + self.set_split(split, j1, 
j2); + } else { + self.set_split(split, j2, j1); + } + } + } + } + + /// Appends the given instruction to the program. + #[inline] + fn push(&mut self, x: Inst) { + self.insts.push(x) + } + + /// Appends an *empty* `Split` instruction to the program and returns + /// the index of that instruction. (The index can then be used to "patch" + /// the actual locations of the split in later.) + #[inline] + fn empty_split(&mut self) -> InstIdx { + self.insts.push(Split(0, 0)); + self.insts.len() - 1 + } + + /// Sets the left and right locations of a `Split` instruction at index + /// `i` to `pc1` and `pc2`, respectively. + /// If the instruction at index `i` isn't a `Split` instruction, then + /// `fail!` is called. + #[inline] + fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) { + let split = self.insts.get_mut(i); + match *split { + Split(_, _) => *split = Split(pc1, pc2), + _ => fail!("BUG: Invalid split index."), + } + } + + /// Appends an *empty* `Jump` instruction to the program and returns the + /// index of that instruction. + #[inline] + fn empty_jump(&mut self) -> InstIdx { + self.insts.push(Jump(0)); + self.insts.len() - 1 + } + + /// Sets the location of a `Jump` instruction at index `i` to `pc`. + /// If the instruction at index `i` isn't a `Jump` instruction, then + /// `fail!` is called. + #[inline] + fn set_jump(&mut self, i: InstIdx, pc: InstIdx) { + let jmp = self.insts.get_mut(i); + match *jmp { + Jump(_) => *jmp = Jump(pc), + _ => fail!("BUG: Invalid jump index."), + } + } +} diff --git a/src/libregex/lib.rs b/src/libregex/lib.rs new file mode 100644 index 0000000000000..cd5d387bfa0d6 --- /dev/null +++ b/src/libregex/lib.rs @@ -0,0 +1,426 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +//! This crate provides a native implementation of regular expressions that is +//! heavily based on RE2 both in syntax and in implementation. Notably, +//! backreferences and arbitrary lookahead/lookbehind assertions are not +//! provided. In return, regular expression searching provided by this package +//! has excellent worst case performance. The specific syntax supported is +//! documented further down. +//! +//! This crate's documentation provides some simple examples, describes Unicode +//! support and exhaustively lists the supported syntax. For more specific +//! details on the API, please see the documentation for the `Regex` type. +//! +//! # First example: find a date +//! +//! General use of regular expressions in this package involves compiling an +//! expression and then using it to search, split or replace text. For example, +//! to confirm that some text resembles a date: +//! +//! ```rust +//! use regex::Regex; +//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { +//! Ok(re) => re, +//! Err(err) => fail!("{}", err), +//! }; +//! assert_eq!(re.is_match("2014-01-01"), true); +//! ``` +//! +//! Notice the use of the `^` and `$` anchors. In this crate, every expression +//! is executed with an implicit `.*?` at the beginning and end, which allows +//! it to match anywhere in the text. Anchors can be used to ensure that the +//! full text matches an expression. +//! +//! This example also demonstrates the utility of raw strings in Rust, which +//! are just like regular strings except they are prefixed with an `r` and do +//! not process any escape sequences. For example, `"\\d"` is the same +//! expression as `r"\d"`. +//! +//! # The `regex!` macro +//! +//! Rust's compile time meta-programming facilities provide a way to write a +//! `regex!` macro which compiles regular expressions *when your program +//! compiles*. 
Said differently, if you only use `regex!` to build regular +//! expressions in your program, then your program cannot compile with an +//! invalid regular expression. Moreover, the `regex!` macro compiles the +//! given expression to native Rust code, which makes it much faster for +//! searching text. +//! +//! Since `regex!` provides compiled regular expressions that are both safer +//! and faster to use, you should use them whenever possible. The only +//! requirement for using them is that you have a string literal corresponding +//! to your expression. Otherwise, it is indistinguishable from an expression +//! compiled at runtime with `Regex::new`. +//! +//! To use the `regex!` macro, you must enable the `phase` feature and import +//! the `regex_macros` crate as a syntax extension: +//! +//! ```rust +//! #![feature(phase)] +//! #[phase(syntax)] +//! extern crate regex_macros; +//! extern crate regex; +//! +//! fn main() { +//! let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); +//! assert_eq!(re.is_match("2014-01-01"), true); +//! } +//! ``` +//! +//! There are a few things worth mentioning about using the `regex!` macro. +//! Firstly, the `regex!` macro *only* accepts string *literals*. +//! Secondly, the `regex` crate *must* be linked with the name `regex` since +//! the generated code depends on finding symbols in the `regex` crate. +//! +//! The only downside of using the `regex!` macro is that it can increase the +//! size of your program's binary since it generates specialized Rust code. +//! The extra size probably won't be significant for a small number of +//! expressions, but 100+ calls to `regex!` will probably result in a +//! noticeably bigger binary. +//! +//! # Example: iterating over capture groups +//! +//! This crate provides convenient iterators for matching an expression +//! repeatedly against a search string to find successive non-overlapping +//! matches. For example, to find all dates in a string and be able to access +//! 
them by their component pieces: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); +//! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! for cap in re.captures_iter(text) { +//! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1)); +//! } +//! // Output: +//! // Month: 03 Day: 14 Year: 2012 +//! // Month: 01 Day: 01 Year: 2013 +//! // Month: 07 Day: 05 Year: 2014 +//! # } +//! ``` +//! +//! Notice that the year is in the capture group indexed at `1`. This is +//! because the *entire match* is stored in the capture group at index `0`. +//! +//! # Example: replacement with named capture groups +//! +//! Building on the previous example, perhaps we'd like to rearrange the date +//! formats. This can be done with text replacement. But to make the code +//! clearer, we can *name* our capture groups and use those names as variables +//! in our replacement text: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?P\d{4})-(?P\d{2})-(?P\d{2})"); +//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! let after = re.replace_all(before, "$m/$d/$y"); +//! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014"); +//! # } +//! ``` +//! +//! The `replace` methods are actually polymorphic in the replacement, which +//! provides more flexibility than is seen here. (See the documentation for +//! `Regex::replace` for more details.) +//! +//! # Pay for what you use +//! +//! With respect to searching text with a regular expression, there are three +//! questions that can be asked: +//! +//! 1. Does the text match this expression? +//! 2. If so, where does it match? +//! 3. Where are the submatches? +//! +//! Generally speaking, this crate could provide a function to answer only #3, +//! 
which would subsume #1 and #2 automatically. However, it can be +//! significantly more expensive to compute the location of submatches, so it's +//! best not to do it if you don't need to. +//! +//! Therefore, only use what you need. For example, don't use `find` if you +//! only need to test if an expression matches a string. (Use `is_match` +//! instead.) +//! +//! # Unicode +//! +//! This implementation executes regular expressions **only** on sequences of +//! UTF8 codepoints while exposing match locations as byte indices. +//! +//! Currently, only naive case folding is supported. Namely, when matching +//! case insensitively, the characters are first converted to their uppercase +//! forms and then compared. +//! +//! Regular expressions themselves are also **only** interpreted as a sequence +//! of UTF8 codepoints. This means you can embed Unicode characters directly +//! into your expression: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?i)Δ+"); +//! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); +//! # } +//! ``` +//! +//! Finally, Unicode general categories and scripts are available as character +//! classes. For example, you can match a sequence of numerals, Greek or +//! Cherokee letters: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+"); +//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); +//! # } +//! ``` +//! +//! # Syntax +//! +//! The syntax supported in this crate is almost in an exact correspondence +//! with the syntax supported by RE2. +//! +//! ## Matching one character +//! +//!
+//! .           any character except new line (includes new line with s flag)
+//! [xyz]       A character class matching either x, y or z.
+//! [^xyz]      A character class matching any character except x, y and z.
+//! [a-z]       A character class matching any character in range a-z.
+//! \d          Perl character class ([0-9] + \p{Nd})
+//! \D          Negated Perl character class (anything not matched by \d)
+//! [:alpha:]   ASCII character class ([A-Za-z])
+//! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
+//! \pN         One letter name Unicode character class
+//! \p{Greek}   Unicode character class (general category or script)
+//! \PN         Negated one letter name Unicode character class
+//! \P{Greek}   Negated Unicode character class (general category or script)
+//! 
+//! +//! Any named character class may appear inside a bracketed `[...]` character +//! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral +//! character. +//! +//! ## Composites +//! +//!
+//! xy    concatenation (x followed by y)
+//! x|y   alternation (x or y, prefer x)
+//! 
+//! +//! ## Repetitions +//! +//!
+//! x*        zero or more of x (greedy)
+//! x+        one or more of x (greedy)
+//! x?        zero or one of x (greedy)
+//! x*?       zero or more of x (ungreedy)
+//! x+?       one or more of x (ungreedy)
+//! x??       zero or one of x (ungreedy)
+//! x{n,m}    at least n x and at most m x (greedy)
+//! x{n,}     at least n x (greedy)
+//! x{n}      exactly n x
+//! x{n,m}?   at least n x and at most m x (ungreedy)
+//! x{n,}?    at least n x (ungreedy)
+//! x{n}?     exactly n x
+//! 
+//! +//! ## Empty matches +//! +//!
+//! ^     the beginning of text (or start-of-line with multi-line mode)
+//! $     the end of text (or end-of-line with multi-line mode)
+//! \A    only the beginning of text (even with multi-line mode enabled)
+//! \z    only the end of text (even with multi-line mode enabled)
+//! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on the other)
+//! \B    not a Unicode word boundary
+//! 
+//! +//! ## Grouping and flags +//! +//!
+//! (exp)          numbered capture group (indexed by opening parenthesis)
+//! (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+//! (?:exp)        non-capturing group
+//! (?flags)       set flags within current group
+//! (?flags:exp)   set flags for exp (non-capturing)
+//! 
+//! +//! Flags are each a single character. For example, `(?x)` sets the flag `x` +//! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +//! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +//! the `x` flag and clears the `y` flag. +//! +//! All flags are by default disabled. They are: +//! +//!
+//! i     case insensitive
+//! m     multi-line mode: ^ and $ match begin/end of line
+//! s     allow . to match \n
+//! U     swap the meaning of x* and x*?
+//! 
+//! +//! Here's an example that matches case insensitively for only part of the +//! expression: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?i)a+(?-i)b+"); +//! let cap = re.captures("AaAaAbbBBBb").unwrap(); +//! assert_eq!(cap.at(0), "AaAaAbb"); +//! # } +//! ``` +//! +//! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +//! `b`. +//! +//! ## Escape sequences +//! +//!
+//! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
+//! \a         bell (\x07)
+//! \f         form feed (\x0C)
+//! \t         horizontal tab
+//! \n         new line
+//! \r         carriage return
+//! \v         vertical tab (\x0B)
+//! \123       octal character code (up to three digits)
+//! \x7F       hex character code (exactly two digits)
+//! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint
+//! 
+//! +//! ## Perl character classes (Unicode friendly) +//! +//!
+//! \d     digit ([0-9] + \p{Nd})
+//! \D     not digit
+//! \s     whitespace ([\t\n\f\r ] + \p{Z})
+//! \S     not whitespace
+//! \w     word character ([0-9A-Za-z_] + \p{L})
+//! \W     not word character
+//! 
+//! +//! ## ASCII character classes +//! +//!
+//! [:alnum:]    alphanumeric ([0-9A-Za-z])
+//! [:alpha:]    alphabetic ([A-Za-z])
+//! [:ascii:]    ASCII ([\x00-\x7F])
+//! [:blank:]    blank ([\t ])
+//! [:cntrl:]    control ([\x00-\x1F\x7F])
+//! [:digit:]    digits ([0-9])
+//! [:graph:]    graphical ([!-~])
+//! [:lower:]    lower case ([a-z])
+//! [:print:]    printable ([ -~])
+//! [:punct:]    punctuation ([!-/:-@[-`{-~])
+//! [:space:]    whitespace ([\t\n\v\f\r ])
+//! [:upper:]    upper case ([A-Z])
+//! [:word:]     word characters ([0-9A-Za-z_])
+//! [:xdigit:]   hex digit ([0-9A-Fa-f])
+//! 
+//! +//! # Untrusted input +//! +//! There are two factors to consider here: untrusted regular expressions and +//! untrusted search text. +//! +//! Currently, there are no counter-measures in place to prevent a malicious +//! user from writing an expression that may use a lot of resources. One such +//! example is to repeat counted repetitions: `((a{100}){100}){100}` will try +//! to repeat the `a` instruction `100^3` times. Essentially, this means it's +//! very easy for an attacker to exhaust your system's memory if they are +//! allowed to execute arbitrary regular expressions. A possible solution to +//! this is to impose a hard limit on the size of a compiled expression, but it +//! does not yet exist. +//! +//! The story is a bit better with untrusted search text, since this crate's +//! implementation provides `O(nm)` search where `n` is the number of +//! characters in the search text and `m` is the number of instructions in a +//! compiled expression. + +#![crate_id = "regex#0.11-pre"] +#![crate_type = "rlib"] +#![crate_type = "dylib"] +#![experimental] +#![license = "MIT/ASL2"] +#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", + html_favicon_url = "http://www.rust-lang.org/favicon.ico", + html_root_url = "http://static.rust-lang.org/doc/master")] + +#![feature(macro_rules, phase)] +#![deny(missing_doc)] + +extern crate collections; +#[cfg(test)] +extern crate stdtest = "test"; +#[cfg(test)] +extern crate rand; + +// During tests, this links with the `regex` crate so that the `regex!` macro +// can be tested. +#[cfg(test)] +extern crate regex; + +pub use parse::Error; +pub use re::{Regex, Captures, SubCaptures, SubCapturesPos}; +pub use re::{FindCaptures, FindMatches}; +pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; +pub use re::{quote, is_match}; + +mod compile; +mod parse; +mod re; +mod vm; + +// FIXME(#13725) windows needs fixing. 
+#[cfg(test, not(windows))] +mod test; + +/// The `native` module exists to support the `regex!` macro. Do not use. +#[doc(hidden)] +pub mod native { + // Exporting this stuff is bad form, but it's necessary for two reasons. + // Firstly, the `regex!` syntax extension is in a different crate and + // requires access to the representation of a regex (particularly the + // instruction set) in order to compile to native Rust. This could be + // mitigated if `regex!` was defined in the same crate, but this has + // undesirable consequences (such as requiring a dependency on + // `libsyntax`). + // + // Secondly, the code generated by `regex!` must *also* be able + // to access various functions in this crate to reduce code duplication + // and to provide a value with precisely the same `Regex` type in this + // crate. This, AFAIK, is impossible to mitigate. + // + // On the bright side, `rustdoc` lets us hide this from the public API + // documentation. + pub use compile::{ + Program, + OneChar, CharClass, Any, Save, Jump, Split, + Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, + }; + pub use parse::{ + FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, + FLAG_SWAP_GREED, FLAG_NEGATED, + }; + pub use re::{Dynamic, Native}; + pub use vm::{ + MatchKind, Exists, Location, Submatches, + StepState, StepMatchEarlyReturn, StepMatch, StepContinue, + CharReader, find_prefix, + }; +} diff --git a/src/libregex/parse.rs b/src/libregex/parse.rs new file mode 100644 index 0000000000000..27510f01bd676 --- /dev/null +++ b/src/libregex/parse.rs @@ -0,0 +1,1028 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use std::char; +use std::cmp; +use std::fmt; +use std::iter; +use std::num; +use std::str; + +/// Static data containing Unicode ranges for general categories and scripts. +use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; +#[allow(visible_private_types)] +pub mod unicode; + +/// The maximum number of repetitions allowed with the `{n,m}` syntax. +static MAX_REPEAT: uint = 1000; + +/// Error corresponds to something that can go wrong while parsing +/// a regular expression. +/// +/// (Once an expression is compiled, it is not possible to produce an error +/// via searching, splitting or replacing.) +pub struct Error { + /// The *approximate* character index of where the error occurred. + pub pos: uint, + /// A message describing the error. + pub msg: ~str, +} + +impl fmt::Show for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f.buf, "Regex syntax error near position {}: {}", + self.pos, self.msg) + } +} + +/// Represents the abstract syntax of a regular expression. +/// It is showable so that error messages resulting from a bug can provide +/// useful information. +/// It is cloneable so that expressions can be repeated for the counted +/// repetition feature. (No other copying is done.) +/// +/// Note that this representation prevents one from reproducing the regex as +/// it was typed. (But it could be used to reproduce an equivalent regex.) +#[deriving(Show, Clone)] +pub enum Ast { + Nothing, + Literal(char, Flags), + Dot(Flags), + Class(Vec<(char, char)>, Flags), + Begin(Flags), + End(Flags), + WordBoundary(Flags), + Capture(uint, Option<~str>, ~Ast), + // Represent concatenation as a flat vector to avoid blowing the + // stack in the compiler. 
+ Cat(Vec<~Ast>), + Alt(~Ast, ~Ast), + Rep(~Ast, Repeater, Greed), +} + +#[deriving(Show, Eq, Clone)] +pub enum Repeater { + ZeroOne, + ZeroMore, + OneMore, +} + +#[deriving(Show, Clone)] +pub enum Greed { + Greedy, + Ungreedy, +} + +impl Greed { + pub fn is_greedy(&self) -> bool { + match *self { + Greedy => true, + _ => false, + } + } + + fn swap(self, swapped: bool) -> Greed { + if !swapped { return self } + match self { + Greedy => Ungreedy, + Ungreedy => Greedy, + } + } +} + +/// BuildAst is a regrettable type that represents intermediate state for +/// constructing an abstract syntax tree. Its central purpose is to facilitate +/// parsing groups and alternations while also maintaining a stack of flag +/// state. +#[deriving(Show)] +enum BuildAst { + Ast(~Ast), + Paren(Flags, uint, ~str), // '(' + Bar, // '|' +} + +impl BuildAst { + fn paren(&self) -> bool { + match *self { + Paren(_, _, _) => true, + _ => false, + } + } + + fn flags(&self) -> Flags { + match *self { + Paren(flags, _, _) => flags, + _ => fail!("Cannot get flags from {}", self), + } + } + + fn capture(&self) -> Option { + match *self { + Paren(_, 0, _) => None, + Paren(_, c, _) => Some(c), + _ => fail!("Cannot get capture group from {}", self), + } + } + + fn capture_name(&self) -> Option<~str> { + match *self { + Paren(_, 0, _) => None, + Paren(_, _, ref name) => { + if name.len() == 0 { + None + } else { + Some(name.clone()) + } + } + _ => fail!("Cannot get capture name from {}", self), + } + } + + fn bar(&self) -> bool { + match *self { + Bar => true, + _ => false, + } + } + + fn unwrap(self) -> Result<~Ast, Error> { + match self { + Ast(x) => Ok(x), + _ => fail!("Tried to unwrap non-AST item: {}", self), + } + } +} + +/// Flags represents all options that can be twiddled by a user in an +/// expression. 
+pub type Flags = u8; + +pub static FLAG_EMPTY: u8 = 0; +pub static FLAG_NOCASE: u8 = 1 << 0; // i +pub static FLAG_MULTI: u8 = 1 << 1; // m +pub static FLAG_DOTNL: u8 = 1 << 2; // s +pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U +pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary + +struct Parser<'a> { + // The input, parsed only as a sequence of UTF8 code points. + chars: Vec, + // The index of the current character in the input. + chari: uint, + // The intermediate state representing the AST. + stack: Vec, + // The current set of flags. + flags: Flags, + // The total number of capture groups. + // Incremented each time an opening left paren is seen (assuming it is + // opening a capture group). + caps: uint, + // A set of all capture group names used only to detect duplicates. + names: Vec<~str>, +} + +pub fn parse(s: &str) -> Result<~Ast, Error> { + Parser { + chars: s.chars().collect(), + chari: 0, + stack: vec!(), + flags: FLAG_EMPTY, + caps: 0, + names: vec!(), + }.parse() +} + +impl<'a> Parser<'a> { + fn parse(&mut self) -> Result<~Ast, Error> { + loop { + let c = self.cur(); + match c { + '?' | '*' | '+' => try!(self.push_repeater(c)), + '\\' => { + let ast = try!(self.parse_escape()); + self.push(ast) + } + '{' => try!(self.parse_counted()), + '[' => match self.try_parse_ascii() { + None => try!(self.parse_class()), + Some(class) => self.push(class), + }, + '(' => { + if self.peek_is(1, '?') { + try!(self.expect('?')) + try!(self.parse_group_opts()) + } else { + self.caps += 1; + self.stack.push(Paren(self.flags, self.caps, ~"")) + } + } + ')' => { + let catfrom = try!( + self.pos_last(false, |x| x.paren() || x.bar())); + try!(self.concat(catfrom)); + + let altfrom = try!(self.pos_last(false, |x| x.paren())); + // Before we smush the alternates together and pop off the + // left paren, let's grab the old flags and see if we + // need a capture. 
+ let (cap, cap_name, oldflags) = { + let paren = self.stack.get(altfrom-1); + (paren.capture(), paren.capture_name(), paren.flags()) + }; + try!(self.alternate(altfrom)); + self.flags = oldflags; + + // If this was a capture, pop what we just pushed in + // alternate and make it a capture. + if cap.is_some() { + let ast = try!(self.pop_ast()); + self.push(~Capture(cap.unwrap(), cap_name, ast)); + } + } + '|' => { + let catfrom = try!( + self.pos_last(true, |x| x.paren() || x.bar())); + try!(self.concat(catfrom)); + + self.stack.push(Bar); + } + _ => try!(self.push_literal(c)), + } + if !self.next_char() { + break + } + } + + // Try to improve error handling. At this point, there should be + // no remaining open parens. + if self.stack.iter().any(|x| x.paren()) { + return self.err("Unclosed parenthesis.") + } + let catfrom = try!(self.pos_last(true, |x| x.bar())); + try!(self.concat(catfrom)); + try!(self.alternate(0)); + + assert!(self.stack.len() == 1); + self.pop_ast() + } + + fn noteof(&mut self, expected: &str) -> Result<(), Error> { + match self.next_char() { + true => Ok(()), + false => self.err(format!("Expected {} but got EOF.", expected)), + } + } + + fn expect(&mut self, expected: char) -> Result<(), Error> { + match self.next_char() { + true if self.cur() == expected => Ok(()), + true => self.err(format!("Expected '{}' but got '{}'.", + expected, self.cur())), + false => self.err(format!("Expected '{}' but got EOF.", expected)), + } + } + + fn next_char(&mut self) -> bool { + self.chari += 1; + self.chari < self.chars.len() + } + + fn pop_ast(&mut self) -> Result<~Ast, Error> { + match self.stack.pop().unwrap().unwrap() { + Err(e) => Err(e), + Ok(ast) => Ok(ast), + } + } + + fn push(&mut self, ast: ~Ast) { + self.stack.push(Ast(ast)) + } + + fn push_repeater(&mut self, c: char) -> Result<(), Error> { + if self.stack.len() == 0 { + return self.err( + "A repeat operator must be preceded by a valid expression.") + } + let rep: Repeater = match c { + '?' 
=> ZeroOne, '*' => ZeroMore, '+' => OneMore, + _ => fail!("Not a valid repeater operator."), + }; + + match self.peek(1) { + Some('*') | Some('+') => + return self.err( + "Double repeat operators are not supported."), + _ => {}, + } + let ast = try!(self.pop_ast()); + match ast { + ~Begin(_) | ~End(_) | ~WordBoundary(_) => + return self.err( + "Repeat arguments cannot be empty width assertions."), + _ => {} + } + let greed = try!(self.get_next_greedy()); + self.push(~Rep(ast, rep, greed)); + Ok(()) + } + + fn push_literal(&mut self, c: char) -> Result<(), Error> { + match c { + '.' => { + self.push(~Dot(self.flags)) + } + '^' => { + self.push(~Begin(self.flags)) + } + '$' => { + self.push(~End(self.flags)) + } + _ => { + self.push(~Literal(c, self.flags)) + } + } + Ok(()) + } + + // Parses all forms of character classes. + // Assumes that '[' is the current character. + fn parse_class(&mut self) -> Result<(), Error> { + let negated = + if self.peek_is(1, '^') { + try!(self.expect('^')) + FLAG_NEGATED + } else { + FLAG_EMPTY + }; + let mut ranges: Vec<(char, char)> = vec!(); + let mut alts: Vec<~Ast> = vec!(); + + if self.peek_is(1, ']') { + try!(self.expect(']')) + ranges.push((']', ']')) + } + while self.peek_is(1, '-') { + try!(self.expect('-')) + ranges.push(('-', '-')) + } + loop { + try!(self.noteof("a closing ']' or a non-empty character class)")) + let mut c = self.cur(); + match c { + '[' => + match self.try_parse_ascii() { + Some(~Class(asciis, flags)) => { + alts.push(~Class(asciis, flags ^ negated)); + continue + } + Some(ast) => + fail!("Expected Class AST but got '{}'", ast), + // Just drop down and try to add as a regular character. 
+ None => {}, + }, + '\\' => { + match try!(self.parse_escape()) { + ~Class(asciis, flags) => { + alts.push(~Class(asciis, flags ^ negated)); + continue + } + ~Literal(c2, _) => c = c2, // process below + ~Begin(_) | ~End(_) | ~WordBoundary(_) => + return self.err( + "\\A, \\z, \\b and \\B are not valid escape \ + sequences inside a character class."), + ast => fail!("Unexpected AST item '{}'", ast), + } + } + _ => {}, + } + match c { + ']' => { + if ranges.len() > 0 { + let flags = negated | (self.flags & FLAG_NOCASE); + let mut ast = ~Class(combine_ranges(ranges), flags); + for alt in alts.move_iter() { + ast = ~Alt(alt, ast) + } + self.push(ast); + } else if alts.len() > 0 { + let mut ast = alts.pop().unwrap(); + for alt in alts.move_iter() { + ast = ~Alt(alt, ast) + } + self.push(ast); + } + return Ok(()) + } + c => { + if self.peek_is(1, '-') && !self.peek_is(2, ']') { + try!(self.expect('-')) + try!(self.noteof("not a ']'")) + let c2 = self.cur(); + if c2 < c { + return self.err(format!( + "Invalid character class range '{}-{}'", c, c2)) + } + ranges.push((c, self.cur())) + } else { + ranges.push((c, c)) + } + } + } + } + } + + // Tries to parse an ASCII character class of the form [:name:]. + // If successful, returns an AST character class corresponding to name + // and moves the parser to the final ']' character. + // If unsuccessful, no state is changed and None is returned. + // Assumes that '[' is the current character. 
+ fn try_parse_ascii(&mut self) -> Option<~Ast> { + if !self.peek_is(1, ':') { + return None + } + let closer = + match self.pos(']') { + Some(i) => i, + None => return None, + }; + if *self.chars.get(closer-1) != ':' { + return None + } + if closer - self.chari <= 3 { + return None + } + let mut name_start = self.chari + 2; + let negated = + if self.peek_is(2, '^') { + name_start += 1; + FLAG_NEGATED + } else { + FLAG_EMPTY + }; + let name = self.slice(name_start, closer - 1); + match find_class(ASCII_CLASSES, name) { + None => None, + Some(ranges) => { + self.chari = closer; + let flags = negated | (self.flags & FLAG_NOCASE); + Some(~Class(combine_ranges(ranges), flags)) + } + } + } + + // Parses counted repetition. Supports: + // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}? + // Assumes that '{' is the current character. + // Returns either an error or moves the parser to the final '}' character. + // (Or the '?' character if not greedy.) + fn parse_counted(&mut self) -> Result<(), Error> { + // Scan until the closing '}' and grab the stuff in {}. + let start = self.chari; + let closer = + match self.pos('}') { + Some(i) => i, + None => return self.err(format!( + "No closing brace for counted repetition starting at \ + position {}.", start)), + }; + self.chari = closer; + let greed = try!(self.get_next_greedy()); + let inner = str::from_chars( + self.chars.as_slice().slice(start + 1, closer)); + + // Parse the min and max values from the regex. 
+ let (mut min, mut max): (uint, Option); + if !inner.contains(",") { + min = try!(self.parse_uint(inner)); + max = Some(min); + } else { + let pieces: Vec<&str> = inner.splitn(',', 1).collect(); + let (smin, smax) = (*pieces.get(0), *pieces.get(1)); + if smin.len() == 0 { + return self.err("Max repetitions cannot be specified \ + without min repetitions.") + } + min = try!(self.parse_uint(smin)); + max = + if smax.len() == 0 { + None + } else { + Some(try!(self.parse_uint(smax))) + }; + } + + // Do some bounds checking and make sure max >= min. + if min > MAX_REPEAT { + return self.err(format!( + "{} exceeds maximum allowed repetitions ({})", + min, MAX_REPEAT)); + } + if max.is_some() { + let m = max.unwrap(); + if m > MAX_REPEAT { + return self.err(format!( + "{} exceeds maximum allowed repetitions ({})", + m, MAX_REPEAT)); + } + if m < min { + return self.err(format!( + "Max repetitions ({}) cannot be smaller than min \ + repetitions ({}).", m, min)); + } + } + + // Now manipulate the AST be repeating elements. + if max.is_none() { + // Require N copies of what's on the stack and then repeat it. + let ast = try!(self.pop_ast()); + for _ in iter::range(0, min) { + self.push(ast.clone()) + } + self.push(~Rep(ast, ZeroMore, greed)); + } else { + // Require N copies of what's on the stack and then repeat it + // up to M times optionally. + let ast = try!(self.pop_ast()); + for _ in iter::range(0, min) { + self.push(ast.clone()) + } + if max.is_some() { + for _ in iter::range(min, max.unwrap()) { + self.push(~Rep(ast.clone(), ZeroOne, greed)) + } + } + // It's possible that we popped something off the stack but + // never put anything back on it. To keep things simple, add + // a no-op expression. + if min == 0 && (max.is_none() || max == Some(0)) { + self.push(~Nothing) + } + } + Ok(()) + } + + // Parses all escape sequences. + // Assumes that '\' is the current character. 
+ fn parse_escape(&mut self) -> Result<~Ast, Error> { + try!(self.noteof("an escape sequence following a '\\'")) + + let c = self.cur(); + if is_punct(c) { + return Ok(~Literal(c, FLAG_EMPTY)) + } + match c { + 'a' => Ok(~Literal('\x07', FLAG_EMPTY)), + 'f' => Ok(~Literal('\x0C', FLAG_EMPTY)), + 't' => Ok(~Literal('\t', FLAG_EMPTY)), + 'n' => Ok(~Literal('\n', FLAG_EMPTY)), + 'r' => Ok(~Literal('\r', FLAG_EMPTY)), + 'v' => Ok(~Literal('\x0B', FLAG_EMPTY)), + 'A' => Ok(~Begin(FLAG_EMPTY)), + 'z' => Ok(~End(FLAG_EMPTY)), + 'b' => Ok(~WordBoundary(FLAG_EMPTY)), + 'B' => Ok(~WordBoundary(FLAG_NEGATED)), + '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())), + 'x' => Ok(try!(self.parse_hex())), + 'p' | 'P' => Ok(try!(self.parse_unicode_name())), + 'd' | 'D' | 's' | 'S' | 'w' | 'W' => { + let ranges = perl_unicode_class(c); + let mut flags = self.flags & FLAG_NOCASE; + if c.is_uppercase() { flags |= FLAG_NEGATED } + Ok(~Class(ranges, flags)) + } + _ => self.err(format!("Invalid escape sequence '\\\\{}'", c)), + } + } + + // Parses a unicode character class name, either of the form \pF where + // F is a one letter unicode class name or of the form \p{name} where + // name is the unicode class name. + // Assumes that \p or \P has been read (and 'p' or 'P' is the current + // character). 
+ fn parse_unicode_name(&mut self) -> Result<~Ast, Error> { + let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY }; + let mut name: ~str; + if self.peek_is(1, '{') { + try!(self.expect('{')) + let closer = + match self.pos('}') { + Some(i) => i, + None => return self.err(format!( + "Missing '\\}' for unclosed '\\{' at position {}", + self.chari)), + }; + if closer - self.chari + 1 == 0 { + return self.err("No Unicode class name found.") + } + name = self.slice(self.chari + 1, closer); + self.chari = closer; + } else { + if self.chari + 1 >= self.chars.len() { + return self.err("No single letter Unicode class name found.") + } + name = self.slice(self.chari + 1, self.chari + 2); + self.chari += 1; + } + match find_class(UNICODE_CLASSES, name) { + None => return self.err(format!( + "Could not find Unicode class '{}'", name)), + Some(ranges) => { + Ok(~Class(ranges, negated | (self.flags & FLAG_NOCASE))) + } + } + } + + // Parses an octal number, up to 3 digits. + // Assumes that \n has been read, where n is the first digit. + fn parse_octal(&mut self) -> Result<~Ast, Error> { + let start = self.chari; + let mut end = start + 1; + let (d2, d3) = (self.peek(1), self.peek(2)); + if d2 >= Some('0') && d2 <= Some('7') { + try!(self.noteof("expected octal character in [0-7]")) + end += 1; + if d3 >= Some('0') && d3 <= Some('7') { + try!(self.noteof("expected octal character in [0-7]")) + end += 1; + } + } + let s = self.slice(start, end); + match num::from_str_radix::(s, 8) { + Some(n) => Ok(~Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), + None => self.err(format!( + "Could not parse '{}' as octal number.", s)), + } + } + + // Parse a hex number. Either exactly two digits or anything in {}. + // Assumes that \x has been read. 
+ fn parse_hex(&mut self) -> Result<~Ast, Error> { + if !self.peek_is(1, '{') { + try!(self.expect('{')) + return self.parse_hex_two() + } + let start = self.chari + 2; + let closer = + match self.pos('}') { + None => return self.err(format!( + "Missing '\\}' for unclosed '\\{' at position {}", start)), + Some(i) => i, + }; + self.chari = closer; + self.parse_hex_digits(self.slice(start, closer)) + } + + // Parses a two-digit hex number. + // Assumes that \xn has been read, where n is the first digit and is the + // current character. + // After return, parser will point at the second digit. + fn parse_hex_two(&mut self) -> Result<~Ast, Error> { + let (start, end) = (self.chari, self.chari + 2); + let bad = self.slice(start - 2, self.chars.len()); + try!(self.noteof(format!("Invalid hex escape sequence '{}'", bad))) + self.parse_hex_digits(self.slice(start, end)) + } + + // Parses `s` as a hexadecimal number. + fn parse_hex_digits(&self, s: &str) -> Result<~Ast, Error> { + match num::from_str_radix::(s, 16) { + Some(n) => Ok(~Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), + None => self.err(format!( + "Could not parse '{}' as hex number.", s)), + } + } + + // Parses a named capture. + // Assumes that '(?P<' has been consumed and that the current character + // is '<'. + // When done, parser will be at the closing '>' character. 
+ fn parse_named_capture(&mut self) -> Result<(), Error> { + try!(self.noteof("a capture name")) + let closer = + match self.pos('>') { + Some(i) => i, + None => return self.err("Capture name must end with '>'."), + }; + if closer - self.chari == 0 { + return self.err("Capture names must have at least 1 character.") + } + let name = self.slice(self.chari, closer); + if !name.chars().all(is_valid_cap) { + return self.err( + "Capture names can only have underscores, letters and digits.") + } + if self.names.contains(&name) { + return self.err(format!("Duplicate capture group name '{}'.", name)) + } + self.names.push(name.clone()); + self.chari = closer; + self.caps += 1; + self.stack.push(Paren(self.flags, self.caps, name)); + Ok(()) + } + + // Parses non-capture groups and options. + // Assumes that '(?' has already been consumed and '?' is the current + // character. + fn parse_group_opts(&mut self) -> Result<(), Error> { + if self.peek_is(1, 'P') && self.peek_is(2, '<') { + try!(self.expect('P')) try!(self.expect('<')) + return self.parse_named_capture() + } + let start = self.chari; + let mut flags = self.flags; + let mut sign = 1; + let mut saw_flag = false; + loop { + try!(self.noteof("expected non-empty set of flags or closing ')'")) + match self.cur() { + 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true}, + 'm' => { flags = flags | FLAG_MULTI; saw_flag = true}, + 's' => { flags = flags | FLAG_DOTNL; saw_flag = true}, + 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true}, + '-' => { + if sign < 0 { + return self.err(format!( + "Cannot negate flags twice in '{}'.", + self.slice(start, self.chari + 1))) + } + sign = -1; + saw_flag = false; + flags = flags ^ flags; + } + ':' | ')' => { + if sign < 0 { + if !saw_flag { + return self.err(format!( + "A valid flag does not follow negation in '{}'", + self.slice(start, self.chari + 1))) + } + flags = flags ^ flags; + } + if self.cur() == ':' { + // Save the old flags with the opening paren. 
+ self.stack.push(Paren(self.flags, 0, ~"")); + } + self.flags = flags; + return Ok(()) + } + _ => return self.err(format!( + "Unrecognized flag '{}'.", self.cur())), + } + } + } + + // Peeks at the next character and returns whether it's ungreedy or not. + // If it is, then the next character is consumed. + fn get_next_greedy(&mut self) -> Result { + Ok(if self.peek_is(1, '?') { + try!(self.expect('?')) + Ungreedy + } else { + Greedy + }.swap(self.flags & FLAG_SWAP_GREED > 0)) + } + + // Searches the stack (starting at the top) until it finds an expression + // for which `pred` returns true. The index of that expression in the + // stack is returned. + // If there's no match, then one of two things happens depending on the + // values of `allow_start`. When it's true, then `0` will be returned. + // Otherwise, an error will be returned. + // Generally, `allow_start` is only true when you're *not* expecting an + // opening parenthesis. + fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool) + -> Result { + let from = match self.stack.iter().rev().position(pred) { + Some(i) => i, + None => { + if allow_start { + self.stack.len() + } else { + return self.err("No matching opening parenthesis.") + } + } + }; + // Adjust index since 'from' is for the reversed stack. + // Also, don't include the '(' or '|'. + Ok(self.stack.len() - from) + } + + // concat starts at `from` in the parser's stack and concatenates all + // expressions up to the top of the stack. The resulting concatenation is + // then pushed on to the stack. + // Usually `from` corresponds to the position of an opening parenthesis, + // a '|' (alternation) or the start of the entire expression. + fn concat(&mut self, from: uint) -> Result<(), Error> { + let ast = try!(self.build_from(from, concat_flatten)); + self.push(ast); + Ok(()) + } + + // concat starts at `from` in the parser's stack and alternates all + // expressions up to the top of the stack. 
The resulting alternation is + // then pushed on to the stack. + // Usually `from` corresponds to the position of an opening parenthesis + // or the start of the entire expression. + // This will also drop any opening parens or alternation bars found in + // the intermediate AST. + fn alternate(&mut self, mut from: uint) -> Result<(), Error> { + // Unlike in the concatenation case, we want 'build_from' to continue + // all the way to the opening left paren (so it will be popped off and + // thrown away). But be careful with overflow---we can't count on the + // open paren to be there. + if from > 0 { from = from - 1} + let ast = try!(self.build_from(from, Alt)); + self.push(ast); + Ok(()) + } + + // build_from combines all AST elements starting at 'from' in the + // parser's stack using 'mk' to combine them. If any such element is not an + // AST then it is popped off the stack and ignored. + fn build_from(&mut self, from: uint, mk: |~Ast, ~Ast| -> Ast) + -> Result<~Ast, Error> { + if from >= self.stack.len() { + return self.err("Empty group or alternate not allowed.") + } + + let mut combined = try!(self.pop_ast()); + let mut i = self.stack.len(); + while i > from { + i = i - 1; + match self.stack.pop().unwrap() { + Ast(x) => combined = ~mk(x, combined), + _ => {}, + } + } + Ok(combined) + } + + fn parse_uint(&self, s: &str) -> Result { + match from_str::(s) { + Some(i) => Ok(i), + None => self.err(format!( + "Expected an unsigned integer but got '{}'.", s)), + } + } + + fn char_from_u32(&self, n: u32) -> Result { + match char::from_u32(n) { + Some(c) => Ok(c), + None => self.err(format!( + "Could not decode '{}' to unicode character.", n)), + } + } + + fn pos(&self, c: char) -> Option { + self.chars.iter() + .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i) + } + + fn err(&self, msg: &str) -> Result { + Err(Error { + pos: self.chari, + msg: msg.to_owned(), + }) + } + + fn peek(&self, offset: uint) -> Option { + if self.chari + offset >= 
self.chars.len() { + return None + } + Some(*self.chars.get(self.chari + offset)) + } + + fn peek_is(&self, offset: uint, is: char) -> bool { + self.peek(offset) == Some(is) + } + + fn cur(&self) -> char { + *self.chars.get(self.chari) + } + + fn slice(&self, start: uint, end: uint) -> ~str { + str::from_chars(self.chars.as_slice().slice(start, end)) + } +} + +// Given an unordered collection of character ranges, combine_ranges returns +// an ordered sequence of character ranges where no two ranges overlap. They +// are ordered from least to greatest (using start position). +fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> { + // Returns true iff the two character classes overlap or share a boundary. + // e.g., ('a', 'g') and ('h', 'm') would return true. + fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool { + cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1 + } + + // This is currently O(n^2), but I think with sufficient cleverness, + // it can be reduced to O(n) **if necessary**. + let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len()); + for (us, ue) in unordered.move_iter() { + let (mut us, mut ue) = (us, ue); + assert!(us <= ue); + let mut which: Option = None; + for (i, &(os, oe)) in ordered.iter().enumerate() { + if should_merge((us, ue), (os, oe)) { + us = cmp::min(us, os); + ue = cmp::max(ue, oe); + which = Some(i); + break + } + } + match which { + None => ordered.push((us, ue)), + Some(i) => *ordered.get_mut(i) = (us, ue), + } + } + ordered.sort(); + ordered +} + +// Constructs a Unicode friendly Perl character class from \d, \s or \w +// (or any of their negated forms). Note that this does not handle negation. +fn perl_unicode_class(which: char) -> Vec<(char, char)> { + match which.to_lowercase() { + 'd' => Vec::from_slice(PERLD), + 's' => Vec::from_slice(PERLS), + 'w' => Vec::from_slice(PERLW), + _ => unreachable!(), + } +} + +// Returns a concatenation of two expressions. 
This also guarantees that a +// `Cat` expression will never be a direct child of another `Cat` expression. +fn concat_flatten(x: ~Ast, y: ~Ast) -> Ast { + match (x, y) { + (~Cat(mut xs), ~Cat(ys)) => { xs.push_all_move(ys); Cat(xs) } + (~Cat(mut xs), ast) => { xs.push(ast); Cat(xs) } + (ast, ~Cat(mut xs)) => { xs.unshift(ast); Cat(xs) } + (ast1, ast2) => Cat(vec!(ast1, ast2)), + } +} + +pub fn is_punct(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' => true, + _ => false, + } +} + +fn is_valid_cap(c: char) -> bool { + c == '_' || (c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +fn find_class(classes: NamedClasses, name: &str) -> Option> { + match classes.bsearch(|&(s, _)| s.cmp(&name)) { + Some(i) => Some(Vec::from_slice(classes[i].val1())), + None => None, + } +} + +type Class = &'static [(char, char)]; +type NamedClasses = &'static [(&'static str, Class)]; + +static ASCII_CLASSES: NamedClasses = &[ + // Classes must be in alphabetical order so that bsearch works. 
+ // [:alnum:] alphanumeric (== [0-9A-Za-z]) + // [:alpha:] alphabetic (== [A-Za-z]) + // [:ascii:] ASCII (== [\x00-\x7F]) + // [:blank:] blank (== [\t ]) + // [:cntrl:] control (== [\x00-\x1F\x7F]) + // [:digit:] digits (== [0-9]) + // [:graph:] graphical (== [!-~]) + // [:lower:] lower case (== [a-z]) + // [:print:] printable (== [ -~] == [ [:graph:]]) + // [:punct:] punctuation (== [!-/:-@[-`{-~]) + // [:space:] whitespace (== [\t\n\v\f\r ]) + // [:upper:] upper case (== [A-Z]) + // [:word:] word characters (== [0-9A-Za-z_]) + // [:xdigit:] hex digit (== [0-9A-Fa-f]) + // Taken from: http://golang.org/pkg/regex/syntax/ + ("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]), + ("alpha", &[('A', 'Z'), ('a', 'z')]), + ("ascii", &[('\x00', '\x7F')]), + ("blank", &[(' ', ' '), ('\t', '\t')]), + ("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]), + ("digit", &[('0', '9')]), + ("graph", &[('!', '~')]), + ("lower", &[('a', 'z')]), + ("print", &[(' ', '~')]), + ("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]), + ("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'), + ('\r', '\r'), (' ', ' ')]), + ("upper", &[('A', 'Z')]), + ("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]), + ("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]), +]; diff --git a/src/libregex/re.rs b/src/libregex/re.rs new file mode 100644 index 0000000000000..da3ebaee6dba1 --- /dev/null +++ b/src/libregex/re.rs @@ -0,0 +1,870 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use collections::HashMap; +use std::fmt; +use std::from_str::from_str; +use std::str::{MaybeOwned, Owned, Slice}; + +use compile::Program; +use parse; +use vm; +use vm::{CaptureLocs, MatchKind, Exists, Location, Submatches}; + +/// Escapes all regular expression meta characters in `text` so that it may be +/// safely used in a regular expression as a literal string. +pub fn quote(text: &str) -> ~str { + let mut quoted = StrBuf::with_capacity(text.len()); + for c in text.chars() { + if parse::is_punct(c) { + quoted.push_char('\\') + } + quoted.push_char(c); + } + quoted.into_owned() +} + +/// Tests if the given regular expression matches somewhere in the text given. +/// +/// If there was a problem compiling the regular expression, an error is +/// returned. +/// +/// To find submatches, split or replace text, you'll need to compile an +/// expression first. +/// +/// Note that you should prefer the `regex!` macro when possible. For example, +/// `regex!("...").is_match("...")`. +pub fn is_match(regex: &str, text: &str) -> Result<bool, parse::Error> { + Regex::new(regex).map(|r| r.is_match(text)) +} + +/// Regex is a compiled regular expression, represented as either a sequence +/// of bytecode instructions (dynamic) or as a specialized Rust function +/// (native). It can be used to search, split +/// or replace text. All searching is done with an implicit `.*?` at the +/// beginning and end of an expression. To force an expression to match the +/// whole string (or a prefix or a suffix), you must use an anchor like `^` or +/// `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the search text), all positions returned are **byte +/// indices**. Every byte index is guaranteed to be at a UTF8 codepoint +/// boundary. +/// +/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a +/// compiled regular expression and text to search, respectively. 
+/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// pointers into the string given. +/// +/// # Examples +/// +/// Find the location of a US phone number: +/// +/// ```rust +/// # use regex::Regex; +/// let re = match Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}") { +/// Ok(re) => re, +/// Err(err) => fail!("{}", err), +/// }; +/// assert_eq!(re.find("phone: 111-222-3333"), Some((7, 19))); +/// ``` +/// +/// You can also use the `regex!` macro to compile a regular expression when +/// you compile your program: +/// +/// ```rust +/// #![feature(phase)] +/// extern crate regex; +/// #[phase(syntax)] extern crate regex_macros; +/// +/// fn main() { +/// let re = regex!(r"\d+"); +/// assert_eq!(re.find("123 abc"), Some((0, 3))); +/// } +/// ``` +/// +/// Given an incorrect regular expression, `regex!` will cause the Rust +/// compiler to produce a compile time error. +/// Note that `regex!` will compile the expression to native Rust code, which +/// makes it much faster when searching text. +/// More details about the `regex!` macro can be found in the `regex` crate +/// documentation. +#[deriving(Clone)] +#[allow(visible_private_types)] +pub struct Regex { + /// The representation of `Regex` is exported to support the `regex!` + /// syntax extension. Do not rely on it. + /// + /// See the comments for the `program` module in `lib.rs` for a more + /// detailed explanation for what `regex!` requires. + #[doc(hidden)] + pub original: ~str, + #[doc(hidden)] + pub names: ~[Option<~str>], + #[doc(hidden)] + pub p: MaybeNative, +} + +impl fmt::Show for Regex { + /// Shows the original regular expression. 
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f.buf, "{}", self.original) + } +} + +/// The executable form of a `Regex`: either a `Program` (`Dynamic`) or a +/// plain function pointer (`Native`) that takes the match kind, the input +/// text and a start/end offset and returns the capture locations. +pub enum MaybeNative { + Dynamic(Program), + Native(fn(MatchKind, &str, uint, uint) -> Vec<Option<uint>>), +} + +impl Clone for MaybeNative { + fn clone(&self) -> MaybeNative { + match *self { + Dynamic(ref p) => Dynamic(p.clone()), + Native(fp) => Native(fp), + } + } +} + +impl Regex { + /// Compiles a dynamic regular expression. Once compiled, it can be + /// used repeatedly to search, split or replace text in a string. + /// + /// When possible, you should prefer the `regex!` macro since it is + /// safer and always faster. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result<Regex, parse::Error> { + let ast = try!(parse::parse(re)); + let (prog, names) = Program::new(ast); + Ok(Regex { original: re.to_owned(), names: names, p: Dynamic(prog) }) + } + + /// Returns true if and only if the regex matches the string given. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 + /// characters: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// let matched = regex!(r"\b\w{13}\b").is_match(text); + /// assert!(matched); + /// # } + /// ``` + pub fn is_match(&self, text: &str) -> bool { + has_match(&exec(self, Exists, text)) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. 
+ /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// characters: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// let pos = regex!(r"\b\w{13}\b").find(text); + /// assert_eq!(pos, Some((2, 15))); + /// # } + /// ``` + pub fn find(&self, text: &str) -> Option<(uint, uint)> { + let caps = exec(self, Location, text); + if has_match(&caps) { + Some((caps.get(0).unwrap(), caps.get(1).unwrap())) + } else { + None + } + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 + /// characters: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let text = "Retroactively relinquishing remunerations is reprehensible."; + /// for pos in regex!(r"\b\w{13}\b").find_iter(text) { + /// println!("{}", pos); + /// } + /// // Output: + /// // (0, 13) + /// // (14, 27) + /// // (28, 41) + /// // (45, 58) + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { + FindMatches { + re: self, + search: text, + last_end: 0, + last_match: None, + } + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to submatches. + /// Otherwise, `find` is faster for discovering the location of the overall + /// match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". 
It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"'([^']+)'\s+\((\d{4})\)"); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.at(1), "Citizen Kane"); + /// assert_eq!(caps.at(2), "1941"); + /// assert_eq!(caps.at(0), "'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title"), "Citizen Kane"); + /// assert_eq!(caps.name("year"), "1941"); + /// assert_eq!(caps.at(0), "'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method. Note that the named capture groups are still accessible with + /// `at`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `at(0)`. + pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { + let caps = exec(self, Submatches, text); + Captures::new(self, text, caps) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter` (except it + /// yields information about submatches). 
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// println!("Movie: {}, Released: {}", caps.name("title"), caps.name("year")); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>(&'r self, text: &'t str) + -> FindCaptures<'r, 't> { + FindCaptures { + re: self, + search: text, + last_match: None, + last_end: 0, + } + } + + /// Returns an iterator of substrings of `text` delimited by a match + /// of the regular expression. + /// Namely, each element of the iterator corresponds to text that *isn't* + /// matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"[ \t]+"); + /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); + /// assert_eq!(fields, vec!("a", "b", "c", "d", "e")); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { + RegexSplits { + finder: self.find_iter(text), + last: 0, + } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) 
+ /// Namely, each element of the iterator corresponds to text that *isn't* + /// matched by the regular expression. + /// The remainder of the string that is not split will be the last element + /// in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"\W+"); + /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec!("Hey", "How", "are you?")); + /// # } + /// ``` + pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: uint) + -> RegexSplitsN<'r, 't> { + RegexSplitsN { + splits: self.split(text), + cur: 0, + limit: limit, + } + } + + /// Replaces the leftmost-first match with the replacement provided. + /// The replacement can be a regular string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced string. + /// + /// If no match is found, then a copy of the string is returned unchanged. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!("[^01]+"); + /// assert_eq!(re.replace("1078910", "").as_slice(), "1010"); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, + /// a closure of type `|&Captures| -> ~str` provides direct access to the + /// captures corresponding to a match. 
This allows one to access + /// submatches easily: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # use regex::Captures; fn main() { + /// let re = regex!(r"([^,\s]+),\s+(\S+)"); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", caps.at(2), caps.at(1)) + /// }); + /// assert_eq!(result.as_slice(), "Bruce Springsteen"); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// let re = regex!(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)"); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result.as_slice(), "Bruce Springsteen"); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// submatch expansion. This can be done by wrapping a string with + /// `NoExpand`: + /// + /// ```rust + /// # #![feature(phase)] + /// # extern crate regex; #[phase(syntax)] extern crate regex_macros; + /// # fn main() { + /// use regex::NoExpand; + /// + /// let re = regex!(r"(?P<last>[^,\s]+),\s+(\S+)"); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result.as_slice(), "$2 $last"); + /// # } + /// ``` + pub fn replace<R: Replacer>(&self, text: &str, rep: R) -> StrBuf { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the + /// replacement provided. 
This is the same as calling `replacen` with + /// `limit` set to `0`. + /// + /// See the documentation for `replace` for details on how to access + /// submatches in the replacement string. + pub fn replace_all<R: Replacer>(&self, text: &str, rep: R) -> StrBuf { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// submatches in the replacement string. + pub fn replacen<R: Replacer> + (&self, text: &str, limit: uint, mut rep: R) -> StrBuf { + let mut new = StrBuf::with_capacity(text.len()); + let mut last_match = 0u; + let mut i = 0; + for cap in self.captures_iter(text) { + // It'd be nicer to use the 'take' iterator instead, but it seemed + // awkward given that '0' => no limit. + if limit > 0 && i >= limit { + break + } + i += 1; + + let (s, e) = cap.pos(0).unwrap(); // captures only reports matches + new.push_str(text.slice(last_match, s)); + new.push_str(rep.reg_replace(&cap).as_slice()); + last_match = e; + } + new.append(text.slice(last_match, text.len())) + } +} + +/// NoExpand indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal +/// string replacement without expanding `$name` to their corresponding +/// capture groups. +/// +/// `'t` is the lifetime of the literal text. +pub struct NoExpand<'t>(pub &'t str); + +/// Replacer describes types that can be used to replace matches in a string. +pub trait Replacer { + /// Returns a possibly owned string that is used to replace the match + /// corresponding to the `caps` capture group. + /// + /// The `'a` lifetime refers to the lifetime of a borrowed string when + /// a new owned string isn't needed (e.g., for `NoExpand`). 
+ fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a>; +} + +impl<'t> Replacer for NoExpand<'t> { + fn reg_replace<'a>(&'a mut self, _: &Captures) -> MaybeOwned<'a> { + let NoExpand(s) = *self; + Slice(s) + } +} + +impl<'t> Replacer for &'t str { + fn reg_replace<'a>(&'a mut self, caps: &Captures) -> MaybeOwned<'a> { + Owned(caps.expand(*self).into_owned()) + } +} + +impl<'a> Replacer for |&Captures|: 'a -> ~str { + fn reg_replace<'r>(&'r mut self, caps: &Captures) -> MaybeOwned<'r> { + Owned((*self)(caps).into_owned()) + } +} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime +/// of the string being split. +pub struct RegexSplits<'r, 't> { + finder: FindMatches<'r, 't>, + last: uint, +} + +impl<'r, 't> Iterator<&'t str> for RegexSplits<'r, 't> { + fn next(&mut self) -> Option<&'t str> { + let text = self.finder.search; + match self.finder.next() { + None => { + if self.last >= text.len() { + None + } else { + let s = text.slice(self.last, text.len()); + self.last = text.len(); + Some(s) + } + } + Some((s, e)) => { + let matched = text.slice(self.last, s); + self.last = e; + Some(matched) + } + } + } +} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime +/// of the string being split. 
+pub struct RegexSplitsN<'r, 't> { + splits: RegexSplits<'r, 't>, + cur: uint, + limit: uint, +} + +impl<'r, 't> Iterator<&'t str> for RegexSplitsN<'r, 't> { + fn next(&mut self) -> Option<&'t str> { + let text = self.splits.finder.search; + if self.cur >= self.limit { + None + } else { + self.cur += 1; + if self.cur >= self.limit { + Some(text.slice(self.splits.last, text.len())) + } else { + self.splits.next() + } + } + } +} + +/// Captures represents a group of captured strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. +/// If a capture group is named, then the matched string is *also* available +/// via the `name` method. (Note that the 0th capture is always unnamed and so +/// must be accessed with the `at` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t str, + locs: CaptureLocs, + named: Option<HashMap<~str, uint>>, +} + +impl<'t> Captures<'t> { + fn new(re: &Regex, search: &'t str, locs: CaptureLocs) + -> Option<Captures<'t>> { + if !has_match(&locs) { + return None + } + + let named = + if re.names.len() == 0 { + None + } else { + let mut named = HashMap::new(); + for (i, name) in re.names.iter().enumerate() { + match name { + &None => {}, + &Some(ref name) => { + named.insert(name.to_owned(), i); + } + } + } + Some(named) + }; + Some(Captures { + text: search, + locs: locs, + named: named, + }) + } + + /// Returns the start and end positions of the Nth capture group. + /// Returns `None` if `i` is not a valid capture group or if the capture + /// group did not match anything. + /// The positions returned are *always* byte indices with respect to the + /// original string matched. 
+ pub fn pos(&self, i: uint) -> Option<(uint, uint)> { + let (s, e) = (i * 2, i * 2 + 1); + if e >= self.locs.len() || self.locs.get(s).is_none() { + // VM guarantees that each pair of locations are both Some or None. + return None + } + Some((self.locs.get(s).unwrap(), self.locs.get(e).unwrap())) + } + + /// Returns the matched string for the capture group `i`. + /// If `i` isn't a valid capture group or didn't match anything, then the + /// empty string is returned. + pub fn at(&self, i: uint) -> &'t str { + match self.pos(i) { + None => "", + Some((s, e)) => { + self.text.slice(s, e) + } + } + } + + /// Returns the matched string for the capture group named `name`. + /// If `name` isn't a valid capture group or didn't match anything, then + /// the empty string is returned. + pub fn name(&self, name: &str) -> &'t str { + match self.named { + None => "", + Some(ref h) => { + match h.find_equiv(&name) { + None => "", + Some(i) => self.at(*i), + } + } + } + } + + /// Creates an iterator of all the capture groups in order of appearance + /// in the regular expression. + pub fn iter(&'t self) -> SubCaptures<'t> { + SubCaptures { idx: 0, caps: self, } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { + SubCapturesPos { idx: 0, caps: self, } + } + + /// Expands all instances of `$name` in `text` to the corresponding capture + /// group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist or + /// isn't a valid index), then it is replaced with the empty string. 
+ /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, text: &str) -> StrBuf { + // How evil can you get? + // FIXME: Don't use regexes for this. It's completely unnecessary. + let re = Regex::new(r"(^|[^$]|\b)\$(\w+)").unwrap(); + let text = re.replace_all(text, |refs: &Captures| -> ~str { + let (pre, name) = (refs.at(1), refs.at(2)); + pre + match from_str::<uint>(name) { + None => self.name(name).to_owned(), + Some(i) => self.at(i).to_owned(), + } + }); + let re = Regex::new(r"\$\$").unwrap(); + re.replace_all(text.as_slice(), NoExpand("$")) + } +} + +impl<'t> Container for Captures<'t> { + /// Returns the number of captured groups. + #[inline] + fn len(&self) -> uint { + self.locs.len() / 2 + } +} + +/// An iterator over capture groups for a particular match of a regular +/// expression. +/// +/// `'t` is the lifetime of the matched text. +pub struct SubCaptures<'t> { + idx: uint, + caps: &'t Captures<'t>, +} + +impl<'t> Iterator<&'t str> for SubCaptures<'t> { + fn next(&mut self) -> Option<&'t str> { + if self.idx < self.caps.len() { + self.idx += 1; + Some(self.caps.at(self.idx - 1)) + } else { + None + } + } +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'t` is the lifetime of the matched text. +pub struct SubCapturesPos<'t> { + idx: uint, + caps: &'t Captures<'t>, +} + +impl<'t> Iterator<Option<(uint, uint)>> for SubCapturesPos<'t> { + fn next(&mut self) -> Option<Option<(uint, uint)>> { + if self.idx < self.caps.len() { + self.idx += 1; + Some(self.caps.pos(self.idx - 1)) + } else { + None + } + } +} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. The iterator stops when no more matches can +/// be found. +/// +/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime +/// of the matched string. 
+pub struct FindCaptures<'r, 't> { + re: &'r Regex, + search: &'t str, + last_match: Option<uint>, + last_end: uint, +} + +impl<'r, 't> Iterator<Captures<'t>> for FindCaptures<'r, 't> { + fn next(&mut self) -> Option<Captures<'t>> { + if self.last_end > self.search.len() { + return None + } + + let caps = exec_slice(self.re, Submatches, self.search, + self.last_end, self.search.len()); + let (s, e) = + if !has_match(&caps) { + return None + } else { + (caps.get(0).unwrap(), caps.get(1).unwrap()) + }; + + // Don't accept empty matches immediately following a match. + // i.e., no infinite loops please. + if e - s == 0 && Some(self.last_end) == self.last_match { + self.last_end += 1; + return self.next() + } + self.last_end = e; + self.last_match = Some(self.last_end); + Captures::new(self.re, self.search, caps) + } +} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a tuple of integers corresponding to the start and end +/// of the match. The indices are byte offsets. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime +/// of the matched string. +pub struct FindMatches<'r, 't> { + re: &'r Regex, + search: &'t str, + last_match: Option<uint>, + last_end: uint, +} + +impl<'r, 't> Iterator<(uint, uint)> for FindMatches<'r, 't> { + fn next(&mut self) -> Option<(uint, uint)> { + if self.last_end > self.search.len() { + return None + } + + let caps = exec_slice(self.re, Location, self.search, + self.last_end, self.search.len()); + let (s, e) = + if !has_match(&caps) { + return None + } else { + (caps.get(0).unwrap(), caps.get(1).unwrap()) + }; + + // Don't accept empty matches immediately following a match. + // i.e., no infinite loops please. 
+ if e - s == 0 && Some(self.last_end) == self.last_match { + self.last_end += 1; + return self.next() + } + self.last_end = e; + self.last_match = Some(self.last_end); + Some((s, e)) + } +} + +fn exec(re: &Regex, which: MatchKind, input: &str) -> CaptureLocs { + exec_slice(re, which, input, 0, input.len()) +} + +fn exec_slice(re: &Regex, which: MatchKind, + input: &str, s: uint, e: uint) -> CaptureLocs { + match re.p { + Dynamic(ref prog) => vm::run(which, prog, input, s, e), + Native(exec) => exec(which, input, s, e), + } +} + +#[inline] +fn has_match(caps: &CaptureLocs) -> bool { + caps.len() >= 2 && caps.get(0).is_some() && caps.get(1).is_some() +} diff --git a/src/libregex/test/bench.rs b/src/libregex/test/bench.rs new file mode 100644 index 0000000000000..a5667ab088e75 --- /dev/null +++ b/src/libregex/test/bench.rs @@ -0,0 +1,179 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use rand::{Rng, task_rng}; +use stdtest::Bencher; +use std::str; +use regex::{Regex, NoExpand}; + +fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) { + b.iter(|| if !re.is_match(text) { fail!("no match") }); +} + +#[bench] +fn no_exponential(b: &mut Bencher) { + let n = 100; + let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap(); + let text = "a".repeat(n); + bench_assert_match(b, re, text); +} + +#[bench] +fn literal(b: &mut Bencher) { + let re = regex!("y"); + let text = "x".repeat(50) + "y"; + bench_assert_match(b, re, text); +} + +#[bench] +fn not_literal(b: &mut Bencher) { + let re = regex!(".y"); + let text = "x".repeat(50) + "y"; + bench_assert_match(b, re, text); +} + +#[bench] +fn match_class(b: &mut Bencher) { + let re = regex!("[abcdw]"); + let text = "xxxx".repeat(20) + "w"; + bench_assert_match(b, re, text); +} + +#[bench] +fn match_class_in_range(b: &mut Bencher) { + // 'b' is between 'a' and 'c', so the class range checking doesn't help. + let re = regex!("[ac]"); + let text = "bbbb".repeat(20) + "c"; + bench_assert_match(b, re, text); +} + +#[bench] +fn replace_all(b: &mut Bencher) { + let re = regex!("[cjrw]"); + let text = "abcdefghijklmnopqrstuvwxyz"; + // FIXME: This isn't using the $name expand stuff. + // It's possible RE2/Go is using it, but currently, the expand in this + // crate is actually compiling a regex, so it's incredibly slow. 
+ b.iter(|| re.replace_all(text, NoExpand(""))); +} + +#[bench] +fn anchored_literal_short_non_match(b: &mut Bencher) { + let re = regex!("^zbc(d|e)"); + let text = "abcdefghijklmnopqrstuvwxyz"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn anchored_literal_long_non_match(b: &mut Bencher) { + let re = regex!("^zbc(d|e)"); + let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); + b.iter(|| re.is_match(text)); +} + +#[bench] +fn anchored_literal_short_match(b: &mut Bencher) { + let re = regex!("^.bc(d|e)"); + let text = "abcdefghijklmnopqrstuvwxyz"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn anchored_literal_long_match(b: &mut Bencher) { + let re = regex!("^.bc(d|e)"); + let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_short_a(b: &mut Bencher) { + let re = regex!("^.bc(d|e)*$"); + let text = "abcddddddeeeededd"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_short_a_not(b: &mut Bencher) { + let re = regex!(".bc(d|e)*$"); + let text = "abcddddddeeeededd"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_short_b(b: &mut Bencher) { + let re = regex!("^.bc(?:d|e)*$"); + let text = "abcddddddeeeededd"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_short_b_not(b: &mut Bencher) { + let re = regex!(".bc(?:d|e)*$"); + let text = "abcddddddeeeededd"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_long_prefix(b: &mut Bencher) { + let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); + let text = "abcdefghijklmnopqrstuvwxyz"; + b.iter(|| re.is_match(text)); +} + +#[bench] +fn one_pass_long_prefix_not(b: &mut Bencher) { + let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); + let text = "abcdefghijklmnopqrstuvwxyz"; + b.iter(|| re.is_match(text)); +} + +macro_rules! 
throughput( + ($name:ident, $regex:expr, $size:expr) => ( + #[bench] + fn $name(b: &mut Bencher) { + let text = gen_text($size); + b.bytes = $size; + b.iter(|| if $regex.is_match(text) { fail!("match") }); + } + ); +) + +fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } +fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } +fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } +fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } + +fn gen_text(n: uint) -> ~str { + let mut rng = task_rng(); + let mut bytes = rng.gen_ascii_str(n).into_bytes(); + for (i, b) in bytes.mut_iter().enumerate() { + if i % 20 == 0 { + *b = '\n' as u8 + } + } + str::from_utf8(bytes).unwrap().to_owned() +} + +throughput!(easy0_32, easy0(), 32) +throughput!(easy0_1K, easy0(), 1<<10) +throughput!(easy0_32K, easy0(), 32<<10) + +throughput!(easy1_32, easy1(), 32) +throughput!(easy1_1K, easy1(), 1<<10) +throughput!(easy1_32K, easy1(), 32<<10) + +throughput!(medium_32, medium(), 32) +throughput!(medium_1K, medium(), 1<<10) +throughput!(medium_32K,medium(), 32<<10) + +throughput!(hard_32, hard(), 32) +throughput!(hard_1K, hard(), 1<<10) +throughput!(hard_32K,hard(), 32<<10) + diff --git a/src/libregex/test/matches.rs b/src/libregex/test/matches.rs new file mode 100644 index 0000000000000..fb938513cb10b --- /dev/null +++ b/src/libregex/test/matches.rs @@ -0,0 +1,373 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// ignore-tidy-linelength + +// DO NOT EDIT. 
Automatically generated by 'src/etc/regex-match-tests' +// on 2014-04-23 01:33:36.539280. + +// Tests from basic.dat +mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))) +mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7))) +mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))) +mat!(match_basic_6, r"\)", r"()", Some((1, 2))) +mat!(match_basic_7, r"a]", r"a]a", Some((0, 2))) +mat!(match_basic_9, r"\}", r"}", Some((0, 1))) +mat!(match_basic_10, r"\]", r"]", Some((0, 1))) +mat!(match_basic_12, r"]", r"]", Some((0, 1))) +mat!(match_basic_15, r"^a", r"ax", Some((0, 1))) +mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3))) +mat!(match_basic_17, r"a\^", r"a^", Some((0, 2))) +mat!(match_basic_18, r"a$", r"aa", Some((1, 2))) +mat!(match_basic_19, r"a\$", r"a$", Some((0, 2))) +mat!(match_basic_20, r"^$", r"", Some((0, 0))) +mat!(match_basic_21, r"$^", r"", Some((0, 0))) +mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))) +mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))) +mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))) +mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))) +mat!(match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3))) +mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))) +mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2))) +mat!(match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) +mat!(match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) +mat!(match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1))) +mat!(match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4))) +mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))) +mat!(match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2))) +mat!(match_basic_36, r"(a|b)?.*", r"b", 
Some((0, 1)), Some((0, 1))) +mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))) +mat!(match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2))) +mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))) +mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))) +mat!(match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2))) +mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) +mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) +mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))) +mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))) +mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))) +mat!(match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2))) +mat!(match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2))) +mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))) +mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))) +mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4))) +mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))) +mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))) +mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))) +mat!(match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17))) +mat!(match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17))) +mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))) +mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))) +mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))) +mat!(match_basic_65, r" +", r" +", Some((0, 1))) +mat!(match_basic_66, r" +", r" +", Some((0, 1))) +mat!(match_basic_67, r"[^a]", r" +", Some((0, 1))) +mat!(match_basic_68, r" +a", r" +a", Some((0, 2))) +mat!(match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3))) +mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3))) +mat!(match_basic_71, 
r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6))) +mat!(match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3))) +mat!(match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11))) +mat!(match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1))) +mat!(match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2))) +mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81))) +mat!(match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25))) +mat!(match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22))) +mat!(match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11))) +mat!(match_basic_80, r".*", r"", Some((0, 2))) +mat!(match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57))) +mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))) +mat!(match_basic_84, r"^", r"", Some((0, 0))) +mat!(match_basic_85, r"$", r"", Some((0, 0))) +mat!(match_basic_86, r"^$", r"", Some((0, 0))) +mat!(match_basic_87, r"^a$", r"a", Some((0, 1))) +mat!(match_basic_88, r"abc", r"abc", Some((0, 3))) +mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4))) +mat!(match_basic_90, r"abc", r"ababc", Some((2, 5))) +mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3))) +mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3))) +mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4))) +mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))) +mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4))) +mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 
6))) +mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4))) +mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3))) +mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3))) +mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3))) +mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3))) +mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4))) +mat!(match_basic_103, r"^", r"abc", Some((0, 0))) +mat!(match_basic_104, r"$", r"abc", Some((3, 3))) +mat!(match_basic_105, r"a.c", r"abc", Some((0, 3))) +mat!(match_basic_106, r"a.c", r"axc", Some((0, 3))) +mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5))) +mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3))) +mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))) +mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3))) +mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2))) +mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2))) +mat!(match_basic_113, r"a]", r"a]", Some((0, 2))) +mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3))) +mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))) +mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))) +mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))) +mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2))) +mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2))) +mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3))) +mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2))) +mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4))) +mat!(match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1))) +mat!(match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3))) +mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))) +mat!(match_basic_126, r"a*", r"aaa", Some((0, 3))) +mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None) +mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))) +mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None) +mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))) +mat!(match_basic_133, r"(a+|b)+", 
r"ab", Some((0, 2)), Some((1, 2))) +mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))) +mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3))) +mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None) +mat!(match_basic_138, r"a*", r"", Some((0, 0))) +mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))) +mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))) +mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))) +mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))) +mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None) +mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))) +mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))) +mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))) +mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))) +mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))) +mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))) +mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))) +mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))) +mat!(match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) +mat!(match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) +mat!(match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4))) +mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))) +mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))) +mat!(match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4))) +mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))) +mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))) +mat!(match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5))) +mat!(match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), 
Some((0, 2)), Some((1, 2))) +mat!(match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", r"reffgz", Some((1, 6)), Some((1, 6))) +mat!(match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1))) +mat!(match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14))) +mat!(match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5))) +mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4))) +mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))) +mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3))) +mat!(match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12))) +mat!(match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13))) +mat!(match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12))) +mat!(match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12))) +mat!(match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11))) +mat!(match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12))) +mat!(match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11))) +mat!(match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15))) +mat!(match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, 
Some((11, 13))) +mat!(match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13))) +mat!(match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12))) +mat!(match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13))) +mat!(match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11))) +mat!(match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13))) +mat!(match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13))) +mat!(match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12))) +mat!(match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12))) +mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))) +mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4))) +mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))) +mat!(match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", Some((0, 19)), Some((0, 7)), Some((16, 19))) +mat!(match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) +mat!(match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11))) +mat!(match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", 
Some((0, 7)), Some((0, 3)), None, Some((0, 3))) +mat!(match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7))) +mat!(match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) +mat!(match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) +mat!(match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) +mat!(match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) +mat!(match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) +mat!(match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7))) +mat!(match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) +mat!(match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) +mat!(match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) +mat!(match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) +mat!(match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) +mat!(match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) +mat!(match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) +mat!(match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3))) +mat!(match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_214, 
r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) +mat!(match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) +mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))) +mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))) +mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))) +mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))) +mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))) +mat!(match_basic_221, r"\\000", r"\000", Some((0, 4))) + +// Tests from nullsubexpr.dat +mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None) +mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))) +mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))) +mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_17, r"(a+)+", r"x", None) +mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None) 
+mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))) +mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None) +mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5))) +mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))) +mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None) +mat!(match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6))) +mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None) +mat!(match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2))) +mat!(match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1))) +mat!(match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) +mat!(match_nullsubexpr_71, r"(a*)*(x)", 
r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) +mat!(match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) +mat!(match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) +mat!(match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) +mat!(match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) +mat!(match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2))) +mat!(match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2))) + +// Tests from repetition.dat +mat!(match_repetition_10, r"((..)|(.))", r"", None) +mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None) +mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None) +mat!(match_repetition_14, r"((..)|(.)){1}", r"", None) +mat!(match_repetition_15, r"((..)|(.)){2}", r"", None) +mat!(match_repetition_16, r"((..)|(.)){3}", r"", None) +mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))) +mat!(match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) +mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None) +mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None) +mat!(match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) +mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None) +mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None) +mat!(match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) +mat!(match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2))) +mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None) +mat!(match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), 
None) +mat!(match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2))) +mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None) +mat!(match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3))) +mat!(match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3))) +mat!(match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) +mat!(match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3))) +mat!(match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) +mat!(match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4))) +mat!(match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4))) +mat!(match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_63, r"((..)|(.))", 
r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5))) +mat!(match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) +mat!(match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) +mat!(match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None) +mat!(match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) +mat!(match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) +mat!(match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) +mat!(match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) +mat!(match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 
8))) +mat!(match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) +mat!(match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) +mat!(match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) +mat!(match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) +mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None) +mat!(match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 
6)), Some((6, 6))) +mat!(match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) +mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None) +mat!(match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) +mat!(match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None) +mat!(match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None) +mat!(match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) +mat!(match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) + diff --git a/src/libregex/test/mod.rs b/src/libregex/test/mod.rs new file mode 100644 index 0000000000000..9386e17e92088 --- /dev/null +++ b/src/libregex/test/mod.rs @@ -0,0 +1,29 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#[cfg(not(stage1))]
+#[phase(syntax)]
+extern crate regex_macros;
+
+// Dirty hack: During stage1, test dynamic regexes. For stage2, we test
+// native regexes.
+//
+// Under `cfg(stage1)` the `regex_macros` crate above is not loaded, so
+// `regex!` is defined here instead: it calls `Regex::new` at run time and
+// fails the task with the error if the pattern does not compile.
+#[cfg(stage1)]
+macro_rules! regex(
+    ($re:expr) => (
+        match ::regex::Regex::new($re) {
+            Ok(re) => re,
+            Err(err) => fail!("{}", err),
+        }
+    );
+)
+
+mod bench;
+mod tests;
+
diff --git a/src/libregex/test/tests.rs b/src/libregex/test/tests.rs
new file mode 100644
index 0000000000000..ce8996c681d85
--- /dev/null
+++ b/src/libregex/test/tests.rs
@@ -0,0 +1,199 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// ignore-tidy-linelength
+
+use regex::{Regex, NoExpand};
+
+// With a limit of 2, `splitn` yields the text before the first match and
+// then the remainder of the input, unsplit.
+#[test]
+fn splitn() {
+    let re = regex!(r"\d+");
+    let text = "cauchy123plato456tyler789binx";
+    let subs: Vec<&str> = re.splitn(text, 2).collect();
+    assert_eq!(subs, vec!("cauchy", "plato456tyler789binx"));
+}
+
+// `split` yields every piece of text between matches.
+#[test]
+fn split() {
+    let re = regex!(r"\d+");
+    let text = "cauchy123plato456tyler789binx";
+    let subs: Vec<&str> = re.split(text).collect();
+    assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx"));
+}
+
+// Generates a test `$name` asserting that calling method `$which`
+// (`replace` or `replace_all` in the invocations below) on the regex `$re`
+// with arguments `$search` and `$replace` produces `$result`.
+macro_rules! replace(
+    ($name:ident, $which:ident, $re:expr,
+     $search:expr, $replace:expr, $result:expr) => (
+        #[test]
+        fn $name() {
+            let re = regex!($re);
+            assert_eq!(re.$which($search, $replace), StrBuf::from_str($result));
+        }
+    );
+)
+
+replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6")
+replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z")
+replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ")
+replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1")
+replace!(rep_double_dollar, replace,
+         r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1")
+replace!(rep_no_expand, replace,
+         r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1")
+replace!(rep_named, replace_all,
+         r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
+         "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3")
+replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
+         "", "trim me")
+
+// Generates a test `$name` asserting that `Regex::new($re)` returns an
+// error; the test fails if the pattern compiles.
+macro_rules! noparse(
+    ($name:ident, $re:expr) => (
+        #[test]
+        fn $name() {
+            let re = $re;
+            match Regex::new(re) {
+                Err(_) => {},
+                Ok(_) => fail!("Regex '{}' should cause a parse error.", re),
+            }
+        }
+    );
+)
+
+noparse!(fail_double_repeat, "a**")
+noparse!(fail_no_repeat_arg, "*")
+noparse!(fail_no_repeat_arg_begin, "^*")
+noparse!(fail_incomplete_escape, "\\")
+noparse!(fail_class_incomplete, "[A-")
+noparse!(fail_class_not_closed, "[A")
+noparse!(fail_class_no_begin, r"[\A]")
+noparse!(fail_class_no_end, r"[\z]")
+noparse!(fail_class_no_boundary, r"[\b]")
+noparse!(fail_open_paren, "(")
+noparse!(fail_close_paren, ")")
+noparse!(fail_invalid_range, "[a-Z]")
+noparse!(fail_empty_capture_name, "(?P<>a)")
+noparse!(fail_empty_capture_exp, "(?P<name>)")
+noparse!(fail_bad_capture_name, "(?P<na-me>)")
+noparse!(fail_bad_flag, "(?a)a")
+noparse!(fail_empty_alt_before, "|a")
+noparse!(fail_empty_alt_after, "a|")
+noparse!(fail_counted_big_exact, "a{1001}")
+noparse!(fail_counted_big_min, "a{1001,}")
+noparse!(fail_counted_no_close, "a{1001")
+noparse!(fail_unfinished_cap, "(?") +noparse!(fail_unfinished_escape, "\\") +noparse!(fail_octal_digit, r"\8") +noparse!(fail_hex_digit, r"\xG0") +noparse!(fail_hex_short, r"\xF") +noparse!(fail_hex_long_digits, r"\x{fffg}") +noparse!(fail_flag_bad, "(?a)") +noparse!(fail_flag_empty, "(?)") +noparse!(fail_double_neg, "(?-i-i)") +noparse!(fail_neg_empty, "(?i-)") +noparse!(fail_empty_group, "()") +noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)") + +macro_rules! mat( + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = $text; + let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+); + let r = regex!($re); + let got = match r.captures(text) { + Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(), + None => vec!(None), + }; + // The test set sometimes leaves out capture groups, so truncate + // actual capture groups to match test set. + let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice()); + if sgot.len() > sexpect.len() { + sgot = sgot.slice(0, sexpect.len()) + } + if sexpect != sgot { + fail!("For RE '{}' against '{}', expected '{}' but got '{}'", + $re, text, sexpect, sgot); + } + } + ); +) + +// Some crazy expressions from regular-expressions.info. 
+mat!(match_ranges, + r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 255", Some((5, 8))) +mat!(match_ranges_not, + r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 256", None) +mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))) +mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))) +mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))) +mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None) +mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail.com ", Some((8, 26))) +mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail ", None) +mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", + "mine is jam.slam@gmail.com ", Some((8, 26))) +mat!(match_date1, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-01-01", Some((0, 10))) +mat!(match_date2, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-00-01", None) +mat!(match_date3, + r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-13-01", None) + +// Exercise the flags. 
+mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))) +mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))) +mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None) +mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))) +mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))) +mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None) +mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))) +mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))) +mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))) +mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))) +mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))) + +// Some Unicode tests. +mat!(uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3))) +mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))) +mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))) +mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))) +mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))) +mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))) +mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))) +mat!(uni_case_not, r"Δ", "δ", None) +mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))) +mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))) +mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))) +mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))) + +// Test the Unicode friendliness of Perl character classes. +mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))) +mat!(uni_perl_w_not, r"\w+", "Ⅱ", None) +mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3))) +mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))) +mat!(uni_perl_d_not, r"\d+", "Ⅱ", None) +mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))) +mat!(uni_perl_s, r"\s+", " ", Some((0, 3))) +mat!(uni_perl_s_not, r"\s+", "☃", None) +mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))) + +// And do the same for word boundaries. 
+mat!(uni_boundary_none, r"\d\b", "6δ", None) +mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))) + +// A whole mess of tests from Glenn Fowler's regex test suite. +// Generated by the 'src/etc/regex-match-tests' program. +mod matches; diff --git a/src/libregex/testdata/LICENSE b/src/libregex/testdata/LICENSE new file mode 100644 index 0000000000000..f47dbf4c449bc --- /dev/null +++ b/src/libregex/testdata/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/libregex/testdata/README b/src/libregex/testdata/README new file mode 100644 index 0000000000000..33b0ba17ed7f6 --- /dev/null +++ b/src/libregex/testdata/README @@ -0,0 +1,17 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +The LICENSE in this directory corresponds to the LICENSE that the data was +released under. + +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. + +Note that these files are read by 'src/etc/regex-match-tests.py' and turned into +Rust tests found in 'src/libregex/test/matches.rs'. + diff --git a/src/libregex/testdata/basic.dat b/src/libregex/testdata/basic.dat new file mode 100644 index 0000000000000..e55efaeec0624 --- /dev/null +++ b/src/libregex/testdata/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E 
a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll 
XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) 
RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi 
(0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 
+E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/src/libregex/testdata/nullsubexpr.dat b/src/libregex/testdata/nullsubexpr.dat new file mode 100644 index 0000000000000..2e18fbb917070 --- /dev/null +++ b/src/libregex/testdata/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? 
aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/src/libregex/testdata/repetition.dat b/src/libregex/testdata/repetition.dat new file mode 100644 index 0000000000000..3bb2121180005 --- /dev/null +++ b/src/libregex/testdata/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 
+ +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). 
+ +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E 
(ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/src/libregex/unicode.rs b/src/libregex/unicode.rs new file mode 100644 index 0000000000000..c263827dab847 --- /dev/null +++ b/src/libregex/unicode.rs @@ -0,0 +1,5537 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// DO NOT EDIT. Automatically generated by 'src/etc/regex-unicode-tables' +// on 2014-04-23 00:13:04.445491. + +use parse::{Class, NamedClasses}; + +pub static UNICODE_CLASSES: NamedClasses = &[ + +("Arabic", &[ + ('\U00000600', '\U00000604'), + ('\U00000606', '\U0000060b'), + ('\U0000060d', '\U0000061a'), + ('\U0000061c', '\U0000061c'), + ('\U0000061e', '\U0000061e'), + ('\U00000620', '\U0000063f'), + ('\U00000641', '\U0000064a'), + ('\U00000656', '\U0000065f'), + ('\U0000066a', '\U0000066f'), + ('\U00000671', '\U000006dc'), + ('\U000006de', '\U000006ff'), + ('\U00000750', '\U0000077f'), + ('\U000008a0', '\U000008a0'), + ('\U000008a2', '\U000008ac'), + ('\U000008e4', '\U000008fe'), + ('\U0000fb50', '\U0000fbc1'), + ('\U0000fbd3', '\U0000fd3d'), + ('\U0000fd50', '\U0000fd8f'), + ('\U0000fd92', '\U0000fdc7'), + ('\U0000fdf0', '\U0000fdfc'), + ('\U0000fe70', '\U0000fe74'), + ('\U0000fe76', '\U0000fefc'), + ('\U00010e60', '\U00010e7e'), + ('\U0001ee00', '\U0001ee03'), + ('\U0001ee05', '\U0001ee1f'), + ('\U0001ee21', '\U0001ee22'), + ('\U0001ee24', '\U0001ee24'), + ('\U0001ee27', '\U0001ee27'), + ('\U0001ee29', '\U0001ee32'), + ('\U0001ee34', '\U0001ee37'), + ('\U0001ee39', '\U0001ee39'), + ('\U0001ee3b', '\U0001ee3b'), + ('\U0001ee42', '\U0001ee42'), + ('\U0001ee47', 
'\U0001ee47'), + ('\U0001ee49', '\U0001ee49'), + ('\U0001ee4b', '\U0001ee4b'), + ('\U0001ee4d', '\U0001ee4f'), + ('\U0001ee51', '\U0001ee52'), + ('\U0001ee54', '\U0001ee54'), + ('\U0001ee57', '\U0001ee57'), + ('\U0001ee59', '\U0001ee59'), + ('\U0001ee5b', '\U0001ee5b'), + ('\U0001ee5d', '\U0001ee5d'), + ('\U0001ee5f', '\U0001ee5f'), + ('\U0001ee61', '\U0001ee62'), + ('\U0001ee64', '\U0001ee64'), + ('\U0001ee67', '\U0001ee6a'), + ('\U0001ee6c', '\U0001ee72'), + ('\U0001ee74', '\U0001ee77'), + ('\U0001ee79', '\U0001ee7c'), + ('\U0001ee7e', '\U0001ee7e'), + ('\U0001ee80', '\U0001ee89'), + ('\U0001ee8b', '\U0001ee9b'), + ('\U0001eea1', '\U0001eea3'), + ('\U0001eea5', '\U0001eea9'), + ('\U0001eeab', '\U0001eebb'), + ('\U0001eef0', '\U0001eef1') + ]), +("Armenian", &[ + ('\U00000531', '\U00000556'), + ('\U00000559', '\U0000055f'), + ('\U00000561', '\U00000587'), + ('\U0000058a', '\U0000058a'), + ('\U0000058f', '\U0000058f'), + ('\U0000fb13', '\U0000fb17') + ]), +("Avestan", &[ + ('\U00010b00', '\U00010b35'), + ('\U00010b39', '\U00010b3f') + ]), +("Balinese", &[ + ('\U00001b00', '\U00001b4b'), + ('\U00001b50', '\U00001b7c') + ]), +("Bamum", &[ + ('\U0000a6a0', '\U0000a6f7'), + ('\U00016800', '\U00016a38') + ]), +("Batak", &[ + ('\U00001bc0', '\U00001bf3'), + ('\U00001bfc', '\U00001bff') + ]), +("Bengali", &[ + ('\U00000981', '\U00000983'), + ('\U00000985', '\U0000098c'), + ('\U0000098f', '\U00000990'), + ('\U00000993', '\U000009a8'), + ('\U000009aa', '\U000009b0'), + ('\U000009b2', '\U000009b2'), + ('\U000009b6', '\U000009b9'), + ('\U000009bc', '\U000009c4'), + ('\U000009c7', '\U000009c8'), + ('\U000009cb', '\U000009ce'), + ('\U000009d7', '\U000009d7'), + ('\U000009dc', '\U000009dd'), + ('\U000009df', '\U000009e3'), + ('\U000009e6', '\U000009fb') + ]), +("Bopomofo", &[ + ('\U000002ea', '\U000002eb'), + ('\U00003105', '\U0000312d'), + ('\U000031a0', '\U000031ba') + ]), +("Brahmi", &[ + ('\U00011000', '\U0001104d'), + ('\U00011052', '\U0001106f') + ]), +("Braille", &[ + 
('\U00002800', '\U000028ff') + ]), +("Buginese", &[ + ('\U00001a00', '\U00001a1b'), + ('\U00001a1e', '\U00001a1f') + ]), +("Buhid", &[ + ('\U00001740', '\U00001753') + ]), +("C", &[ + ('\U00000000', '\U0000001f'), + ('\U0000007f', '\U0000009f'), + ('\U000000ad', '\U000000ad'), + ('\U00000600', '\U00000604'), + ('\U0000061c', '\U0000061c'), + ('\U000006dd', '\U000006dd'), + ('\U0000070f', '\U0000070f'), + ('\U0000180e', '\U0000180e'), + ('\U0000200b', '\U0000200f'), + ('\U0000202a', '\U0000202e'), + ('\U00002060', '\U00002064'), + ('\U00002066', '\U0000206f'), + ('\U0000e000', '\U0000e000'), + ('\U0000f8ff', '\U0000f8ff'), + ('\U0000feff', '\U0000feff'), + ('\U0000fff9', '\U0000fffb'), + ('\U000110bd', '\U000110bd'), + ('\U0001d173', '\U0001d17a'), + ('\U000e0001', '\U000e0001'), + ('\U000e0020', '\U000e007f'), + ('\U000f0000', '\U000f0000'), + ('\U000ffffd', '\U000ffffd'), + ('\U00100000', '\U00100000'), + ('\U0010fffd', '\U0010fffd') + ]), +("Canadian_Aboriginal", &[ + ('\U00001400', '\U0000167f'), + ('\U000018b0', '\U000018f5') + ]), +("Carian", &[ + ('\U000102a0', '\U000102d0') + ]), +("Cc", &[ + ('\U00000000', '\U0000001f'), + ('\U0000007f', '\U0000009f') + ]), +("Cf", &[ + ('\U000000ad', '\U000000ad'), + ('\U00000600', '\U00000604'), + ('\U0000061c', '\U0000061c'), + ('\U000006dd', '\U000006dd'), + ('\U0000070f', '\U0000070f'), + ('\U0000180e', '\U0000180e'), + ('\U0000200b', '\U0000200f'), + ('\U0000202a', '\U0000202e'), + ('\U00002060', '\U00002064'), + ('\U00002066', '\U0000206f'), + ('\U0000feff', '\U0000feff'), + ('\U0000fff9', '\U0000fffb'), + ('\U000110bd', '\U000110bd'), + ('\U0001d173', '\U0001d17a'), + ('\U000e0001', '\U000e0001'), + ('\U000e0020', '\U000e007f') + ]), +("Chakma", &[ + ('\U00011100', '\U00011134'), + ('\U00011136', '\U00011143') + ]), +("Cham", &[ + ('\U0000aa00', '\U0000aa36'), + ('\U0000aa40', '\U0000aa4d'), + ('\U0000aa50', '\U0000aa59'), + ('\U0000aa5c', '\U0000aa5f') + ]), +("Cherokee", &[ + ('\U000013a0', '\U000013f4') + ]), 
+("Co", &[ + ('\U0000e000', '\U0000e000'), + ('\U0000f8ff', '\U0000f8ff'), + ('\U000f0000', '\U000f0000'), + ('\U000ffffd', '\U000ffffd'), + ('\U00100000', '\U00100000'), + ('\U0010fffd', '\U0010fffd') + ]), +("Common", &[ + ('\U00000000', '\U00000040'), + ('\U0000005b', '\U00000060'), + ('\U0000007b', '\U000000a9'), + ('\U000000ab', '\U000000b9'), + ('\U000000bb', '\U000000bf'), + ('\U000000d7', '\U000000d7'), + ('\U000000f7', '\U000000f7'), + ('\U000002b9', '\U000002df'), + ('\U000002e5', '\U000002e9'), + ('\U000002ec', '\U000002ff'), + ('\U00000374', '\U00000374'), + ('\U0000037e', '\U0000037e'), + ('\U00000385', '\U00000385'), + ('\U00000387', '\U00000387'), + ('\U00000589', '\U00000589'), + ('\U0000060c', '\U0000060c'), + ('\U0000061b', '\U0000061b'), + ('\U0000061f', '\U0000061f'), + ('\U00000640', '\U00000640'), + ('\U00000660', '\U00000669'), + ('\U000006dd', '\U000006dd'), + ('\U00000964', '\U00000965'), + ('\U00000e3f', '\U00000e3f'), + ('\U00000fd5', '\U00000fd8'), + ('\U000010fb', '\U000010fb'), + ('\U000016eb', '\U000016ed'), + ('\U00001735', '\U00001736'), + ('\U00001802', '\U00001803'), + ('\U00001805', '\U00001805'), + ('\U00001cd3', '\U00001cd3'), + ('\U00001ce1', '\U00001ce1'), + ('\U00001ce9', '\U00001cec'), + ('\U00001cee', '\U00001cf3'), + ('\U00001cf5', '\U00001cf6'), + ('\U00002000', '\U0000200b'), + ('\U0000200e', '\U00002064'), + ('\U00002066', '\U00002070'), + ('\U00002074', '\U0000207e'), + ('\U00002080', '\U0000208e'), + ('\U000020a0', '\U000020ba'), + ('\U00002100', '\U00002125'), + ('\U00002127', '\U00002129'), + ('\U0000212c', '\U00002131'), + ('\U00002133', '\U0000214d'), + ('\U0000214f', '\U0000215f'), + ('\U00002189', '\U00002189'), + ('\U00002190', '\U000023f3'), + ('\U00002400', '\U00002426'), + ('\U00002440', '\U0000244a'), + ('\U00002460', '\U000026ff'), + ('\U00002701', '\U000027ff'), + ('\U00002900', '\U00002b4c'), + ('\U00002b50', '\U00002b59'), + ('\U00002e00', '\U00002e3b'), + ('\U00002ff0', '\U00002ffb'), + ('\U00003000', 
'\U00003004'), + ('\U00003006', '\U00003006'), + ('\U00003008', '\U00003020'), + ('\U00003030', '\U00003037'), + ('\U0000303c', '\U0000303f'), + ('\U0000309b', '\U0000309c'), + ('\U000030a0', '\U000030a0'), + ('\U000030fb', '\U000030fc'), + ('\U00003190', '\U0000319f'), + ('\U000031c0', '\U000031e3'), + ('\U00003220', '\U0000325f'), + ('\U0000327f', '\U000032cf'), + ('\U00003358', '\U000033ff'), + ('\U00004dc0', '\U00004dff'), + ('\U0000a700', '\U0000a721'), + ('\U0000a788', '\U0000a78a'), + ('\U0000a830', '\U0000a839'), + ('\U0000a9cf', '\U0000a9cf'), + ('\U0000fd3e', '\U0000fd3f'), + ('\U0000fdfd', '\U0000fdfd'), + ('\U0000fe10', '\U0000fe19'), + ('\U0000fe30', '\U0000fe52'), + ('\U0000fe54', '\U0000fe66'), + ('\U0000fe68', '\U0000fe6b'), + ('\U0000feff', '\U0000feff'), + ('\U0000ff01', '\U0000ff20'), + ('\U0000ff3b', '\U0000ff40'), + ('\U0000ff5b', '\U0000ff65'), + ('\U0000ff70', '\U0000ff70'), + ('\U0000ff9e', '\U0000ff9f'), + ('\U0000ffe0', '\U0000ffe6'), + ('\U0000ffe8', '\U0000ffee'), + ('\U0000fff9', '\U0000fffd'), + ('\U00010100', '\U00010102'), + ('\U00010107', '\U00010133'), + ('\U00010137', '\U0001013f'), + ('\U00010190', '\U0001019b'), + ('\U000101d0', '\U000101fc'), + ('\U0001d000', '\U0001d0f5'), + ('\U0001d100', '\U0001d126'), + ('\U0001d129', '\U0001d166'), + ('\U0001d16a', '\U0001d17a'), + ('\U0001d183', '\U0001d184'), + ('\U0001d18c', '\U0001d1a9'), + ('\U0001d1ae', '\U0001d1dd'), + ('\U0001d300', '\U0001d356'), + ('\U0001d360', '\U0001d371'), + ('\U0001d400', '\U0001d454'), + ('\U0001d456', '\U0001d49c'), + ('\U0001d49e', '\U0001d49f'), + ('\U0001d4a2', '\U0001d4a2'), + ('\U0001d4a5', '\U0001d4a6'), + ('\U0001d4a9', '\U0001d4ac'), + ('\U0001d4ae', '\U0001d4b9'), + ('\U0001d4bb', '\U0001d4bb'), + ('\U0001d4bd', '\U0001d4c3'), + ('\U0001d4c5', '\U0001d505'), + ('\U0001d507', '\U0001d50a'), + ('\U0001d50d', '\U0001d514'), + ('\U0001d516', '\U0001d51c'), + ('\U0001d51e', '\U0001d539'), + ('\U0001d53b', '\U0001d53e'), + ('\U0001d540', '\U0001d544'), 
+ ('\U0001d546', '\U0001d546'), + ('\U0001d54a', '\U0001d550'), + ('\U0001d552', '\U0001d6a5'), + ('\U0001d6a8', '\U0001d7cb'), + ('\U0001d7ce', '\U0001d7ff'), + ('\U0001f000', '\U0001f02b'), + ('\U0001f030', '\U0001f093'), + ('\U0001f0a0', '\U0001f0ae'), + ('\U0001f0b1', '\U0001f0be'), + ('\U0001f0c1', '\U0001f0cf'), + ('\U0001f0d1', '\U0001f0df'), + ('\U0001f100', '\U0001f10a'), + ('\U0001f110', '\U0001f12e'), + ('\U0001f130', '\U0001f16b'), + ('\U0001f170', '\U0001f19a'), + ('\U0001f1e6', '\U0001f1ff'), + ('\U0001f201', '\U0001f202'), + ('\U0001f210', '\U0001f23a'), + ('\U0001f240', '\U0001f248'), + ('\U0001f250', '\U0001f251'), + ('\U0001f300', '\U0001f320'), + ('\U0001f330', '\U0001f335'), + ('\U0001f337', '\U0001f37c'), + ('\U0001f380', '\U0001f393'), + ('\U0001f3a0', '\U0001f3c4'), + ('\U0001f3c6', '\U0001f3ca'), + ('\U0001f3e0', '\U0001f3f0'), + ('\U0001f400', '\U0001f43e'), + ('\U0001f440', '\U0001f440'), + ('\U0001f442', '\U0001f4f7'), + ('\U0001f4f9', '\U0001f4fc'), + ('\U0001f500', '\U0001f53d'), + ('\U0001f540', '\U0001f543'), + ('\U0001f550', '\U0001f567'), + ('\U0001f5fb', '\U0001f640'), + ('\U0001f645', '\U0001f64f'), + ('\U0001f680', '\U0001f6c5'), + ('\U0001f700', '\U0001f773'), + ('\U000e0001', '\U000e0001'), + ('\U000e0020', '\U000e007f') + ]), +("Coptic", &[ + ('\U000003e2', '\U000003ef'), + ('\U00002c80', '\U00002cf3'), + ('\U00002cf9', '\U00002cff') + ]), +("Cuneiform", &[ + ('\U00012000', '\U0001236e'), + ('\U00012400', '\U00012462'), + ('\U00012470', '\U00012473') + ]), +("Cypriot", &[ + ('\U00010800', '\U00010805'), + ('\U00010808', '\U00010808'), + ('\U0001080a', '\U00010835'), + ('\U00010837', '\U00010838'), + ('\U0001083c', '\U0001083c'), + ('\U0001083f', '\U0001083f') + ]), +("Cyrillic", &[ + ('\U00000400', '\U00000484'), + ('\U00000487', '\U00000527'), + ('\U00001d2b', '\U00001d2b'), + ('\U00001d78', '\U00001d78'), + ('\U00002de0', '\U00002dff'), + ('\U0000a640', '\U0000a697'), + ('\U0000a69f', '\U0000a69f') + ]), +("Deseret", &[ + 
('\U00010400', '\U0001044f') + ]), +("Devanagari", &[ + ('\U00000900', '\U00000950'), + ('\U00000953', '\U00000963'), + ('\U00000966', '\U00000977'), + ('\U00000979', '\U0000097f'), + ('\U0000a8e0', '\U0000a8fb') + ]), +("Egyptian_Hieroglyphs", &[ + ('\U00013000', '\U0001342e') + ]), +("Ethiopic", &[ + ('\U00001200', '\U00001248'), + ('\U0000124a', '\U0000124d'), + ('\U00001250', '\U00001256'), + ('\U00001258', '\U00001258'), + ('\U0000125a', '\U0000125d'), + ('\U00001260', '\U00001288'), + ('\U0000128a', '\U0000128d'), + ('\U00001290', '\U000012b0'), + ('\U000012b2', '\U000012b5'), + ('\U000012b8', '\U000012be'), + ('\U000012c0', '\U000012c0'), + ('\U000012c2', '\U000012c5'), + ('\U000012c8', '\U000012d6'), + ('\U000012d8', '\U00001310'), + ('\U00001312', '\U00001315'), + ('\U00001318', '\U0000135a'), + ('\U0000135d', '\U0000137c'), + ('\U00001380', '\U00001399'), + ('\U00002d80', '\U00002d96'), + ('\U00002da0', '\U00002da6'), + ('\U00002da8', '\U00002dae'), + ('\U00002db0', '\U00002db6'), + ('\U00002db8', '\U00002dbe'), + ('\U00002dc0', '\U00002dc6'), + ('\U00002dc8', '\U00002dce'), + ('\U00002dd0', '\U00002dd6'), + ('\U00002dd8', '\U00002dde'), + ('\U0000ab01', '\U0000ab06'), + ('\U0000ab09', '\U0000ab0e'), + ('\U0000ab11', '\U0000ab16'), + ('\U0000ab20', '\U0000ab26'), + ('\U0000ab28', '\U0000ab2e') + ]), +("Georgian", &[ + ('\U000010a0', '\U000010c5'), + ('\U000010c7', '\U000010c7'), + ('\U000010cd', '\U000010cd'), + ('\U000010d0', '\U000010fa'), + ('\U000010fc', '\U000010ff'), + ('\U00002d00', '\U00002d25'), + ('\U00002d27', '\U00002d27'), + ('\U00002d2d', '\U00002d2d') + ]), +("Glagolitic", &[ + ('\U00002c00', '\U00002c2e'), + ('\U00002c30', '\U00002c5e') + ]), +("Gothic", &[ + ('\U00010330', '\U0001034a') + ]), +("Greek", &[ + ('\U00000370', '\U00000373'), + ('\U00000375', '\U00000377'), + ('\U0000037a', '\U0000037d'), + ('\U00000384', '\U00000384'), + ('\U00000386', '\U00000386'), + ('\U00000388', '\U0000038a'), + ('\U0000038c', '\U0000038c'), + 
('\U0000038e', '\U000003a1'), + ('\U000003a3', '\U000003e1'), + ('\U000003f0', '\U000003ff'), + ('\U00001d26', '\U00001d2a'), + ('\U00001d5d', '\U00001d61'), + ('\U00001d66', '\U00001d6a'), + ('\U00001dbf', '\U00001dbf'), + ('\U00001f00', '\U00001f15'), + ('\U00001f18', '\U00001f1d'), + ('\U00001f20', '\U00001f45'), + ('\U00001f48', '\U00001f4d'), + ('\U00001f50', '\U00001f57'), + ('\U00001f59', '\U00001f59'), + ('\U00001f5b', '\U00001f5b'), + ('\U00001f5d', '\U00001f5d'), + ('\U00001f5f', '\U00001f7d'), + ('\U00001f80', '\U00001fb4'), + ('\U00001fb6', '\U00001fc4'), + ('\U00001fc6', '\U00001fd3'), + ('\U00001fd6', '\U00001fdb'), + ('\U00001fdd', '\U00001fef'), + ('\U00001ff2', '\U00001ff4'), + ('\U00001ff6', '\U00001ffe'), + ('\U00002126', '\U00002126'), + ('\U00010140', '\U0001018a'), + ('\U0001d200', '\U0001d245') + ]), +("Gujarati", &[ + ('\U00000a81', '\U00000a83'), + ('\U00000a85', '\U00000a8d'), + ('\U00000a8f', '\U00000a91'), + ('\U00000a93', '\U00000aa8'), + ('\U00000aaa', '\U00000ab0'), + ('\U00000ab2', '\U00000ab3'), + ('\U00000ab5', '\U00000ab9'), + ('\U00000abc', '\U00000ac5'), + ('\U00000ac7', '\U00000ac9'), + ('\U00000acb', '\U00000acd'), + ('\U00000ad0', '\U00000ad0'), + ('\U00000ae0', '\U00000ae3'), + ('\U00000ae6', '\U00000af1') + ]), +("Gurmukhi", &[ + ('\U00000a01', '\U00000a03'), + ('\U00000a05', '\U00000a0a'), + ('\U00000a0f', '\U00000a10'), + ('\U00000a13', '\U00000a28'), + ('\U00000a2a', '\U00000a30'), + ('\U00000a32', '\U00000a33'), + ('\U00000a35', '\U00000a36'), + ('\U00000a38', '\U00000a39'), + ('\U00000a3c', '\U00000a3c'), + ('\U00000a3e', '\U00000a42'), + ('\U00000a47', '\U00000a48'), + ('\U00000a4b', '\U00000a4d'), + ('\U00000a51', '\U00000a51'), + ('\U00000a59', '\U00000a5c'), + ('\U00000a5e', '\U00000a5e'), + ('\U00000a66', '\U00000a75') + ]), +("Han", &[ + ('\U00002e80', '\U00002e99'), + ('\U00002e9b', '\U00002ef3'), + ('\U00002f00', '\U00002fd5'), + ('\U00003005', '\U00003005'), + ('\U00003007', '\U00003007'), + ('\U00003021', 
'\U00003029'), + ('\U00003038', '\U0000303b'), + ('\U00003400', '\U00004db5'), + ('\U00004e00', '\U00009fcc'), + ('\U0000f900', '\U0000fa6d'), + ('\U0000fa70', '\U0000fad9'), + ('\U00020000', '\U0002a6d6'), + ('\U0002a700', '\U0002b734'), + ('\U0002b740', '\U0002b81d'), + ('\U0002f800', '\U0002fa1d') + ]), +("Hangul", &[ + ('\U00001100', '\U000011ff'), + ('\U0000302e', '\U0000302f'), + ('\U00003131', '\U0000318e'), + ('\U00003200', '\U0000321e'), + ('\U00003260', '\U0000327e'), + ('\U0000a960', '\U0000a97c'), + ('\U0000ac00', '\U0000d7a3'), + ('\U0000d7b0', '\U0000d7c6'), + ('\U0000d7cb', '\U0000d7fb'), + ('\U0000ffa0', '\U0000ffbe'), + ('\U0000ffc2', '\U0000ffc7'), + ('\U0000ffca', '\U0000ffcf'), + ('\U0000ffd2', '\U0000ffd7'), + ('\U0000ffda', '\U0000ffdc') + ]), +("Hanunoo", &[ + ('\U00001720', '\U00001734') + ]), +("Hebrew", &[ + ('\U00000591', '\U000005c7'), + ('\U000005d0', '\U000005ea'), + ('\U000005f0', '\U000005f4'), + ('\U0000fb1d', '\U0000fb36'), + ('\U0000fb38', '\U0000fb3c'), + ('\U0000fb3e', '\U0000fb3e'), + ('\U0000fb40', '\U0000fb41'), + ('\U0000fb43', '\U0000fb44'), + ('\U0000fb46', '\U0000fb4f') + ]), +("Hiragana", &[ + ('\U00003041', '\U00003096'), + ('\U0000309d', '\U0000309f'), + ('\U0001b001', '\U0001b001'), + ('\U0001f200', '\U0001f200') + ]), +("Imperial_Aramaic", &[ + ('\U00010840', '\U00010855'), + ('\U00010857', '\U0001085f') + ]), +("Inherited", &[ + ('\U00000300', '\U0000036f'), + ('\U00000485', '\U00000486'), + ('\U0000064b', '\U00000655'), + ('\U00000670', '\U00000670'), + ('\U00000951', '\U00000952'), + ('\U00001cd0', '\U00001cd2'), + ('\U00001cd4', '\U00001ce0'), + ('\U00001ce2', '\U00001ce8'), + ('\U00001ced', '\U00001ced'), + ('\U00001cf4', '\U00001cf4'), + ('\U00001dc0', '\U00001de6'), + ('\U00001dfc', '\U00001dff'), + ('\U0000200c', '\U0000200d'), + ('\U000020d0', '\U000020f0'), + ('\U0000302a', '\U0000302d'), + ('\U00003099', '\U0000309a'), + ('\U0000fe00', '\U0000fe0f'), + ('\U0000fe20', '\U0000fe26'), + ('\U000101fd', 
'\U000101fd'), + ('\U0001d167', '\U0001d169'), + ('\U0001d17b', '\U0001d182'), + ('\U0001d185', '\U0001d18b'), + ('\U0001d1aa', '\U0001d1ad'), + ('\U000e0100', '\U000e01ef') + ]), +("Inscriptional_Pahlavi", &[ + ('\U00010b60', '\U00010b72'), + ('\U00010b78', '\U00010b7f') + ]), +("Inscriptional_Parthian", &[ + ('\U00010b40', '\U00010b55'), + ('\U00010b58', '\U00010b5f') + ]), +("Javanese", &[ + ('\U0000a980', '\U0000a9cd'), + ('\U0000a9d0', '\U0000a9d9'), + ('\U0000a9de', '\U0000a9df') + ]), +("Kaithi", &[ + ('\U00011080', '\U000110c1') + ]), +("Kannada", &[ + ('\U00000c82', '\U00000c83'), + ('\U00000c85', '\U00000c8c'), + ('\U00000c8e', '\U00000c90'), + ('\U00000c92', '\U00000ca8'), + ('\U00000caa', '\U00000cb3'), + ('\U00000cb5', '\U00000cb9'), + ('\U00000cbc', '\U00000cc4'), + ('\U00000cc6', '\U00000cc8'), + ('\U00000cca', '\U00000ccd'), + ('\U00000cd5', '\U00000cd6'), + ('\U00000cde', '\U00000cde'), + ('\U00000ce0', '\U00000ce3'), + ('\U00000ce6', '\U00000cef'), + ('\U00000cf1', '\U00000cf2') + ]), +("Katakana", &[ + ('\U000030a1', '\U000030fa'), + ('\U000030fd', '\U000030ff'), + ('\U000031f0', '\U000031ff'), + ('\U000032d0', '\U000032fe'), + ('\U00003300', '\U00003357'), + ('\U0000ff66', '\U0000ff6f'), + ('\U0000ff71', '\U0000ff9d'), + ('\U0001b000', '\U0001b000') + ]), +("Kayah_Li", &[ + ('\U0000a900', '\U0000a92f') + ]), +("Kharoshthi", &[ + ('\U00010a00', '\U00010a03'), + ('\U00010a05', '\U00010a06'), + ('\U00010a0c', '\U00010a13'), + ('\U00010a15', '\U00010a17'), + ('\U00010a19', '\U00010a33'), + ('\U00010a38', '\U00010a3a'), + ('\U00010a3f', '\U00010a47'), + ('\U00010a50', '\U00010a58') + ]), +("Khmer", &[ + ('\U00001780', '\U000017dd'), + ('\U000017e0', '\U000017e9'), + ('\U000017f0', '\U000017f9'), + ('\U000019e0', '\U000019ff') + ]), +("L", &[ + ('\U00000041', '\U0000005a'), + ('\U00000061', '\U0000007a'), + ('\U000000aa', '\U000000aa'), + ('\U000000b5', '\U000000b5'), + ('\U000000ba', '\U000000ba'), + ('\U000000c0', '\U000000d6'), + ('\U000000d8', 
'\U000000f6'), + ('\U000000f8', '\U000002c1'), + ('\U000002c6', '\U000002d1'), + ('\U000002e0', '\U000002e4'), + ('\U000002ec', '\U000002ec'), + ('\U000002ee', '\U000002ee'), + ('\U00000370', '\U00000374'), + ('\U00000376', '\U00000377'), + ('\U0000037a', '\U0000037d'), + ('\U00000386', '\U00000386'), + ('\U00000388', '\U0000038a'), + ('\U0000038c', '\U0000038c'), + ('\U0000038e', '\U000003a1'), + ('\U000003a3', '\U000003f5'), + ('\U000003f7', '\U00000481'), + ('\U0000048a', '\U00000527'), + ('\U00000531', '\U00000556'), + ('\U00000559', '\U00000559'), + ('\U00000561', '\U00000587'), + ('\U000005d0', '\U000005ea'), + ('\U000005f0', '\U000005f2'), + ('\U00000620', '\U0000064a'), + ('\U0000066e', '\U0000066f'), + ('\U00000671', '\U000006d3'), + ('\U000006d5', '\U000006d5'), + ('\U000006e5', '\U000006e6'), + ('\U000006ee', '\U000006ef'), + ('\U000006fa', '\U000006fc'), + ('\U000006ff', '\U000006ff'), + ('\U00000710', '\U00000710'), + ('\U00000712', '\U0000072f'), + ('\U0000074d', '\U000007a5'), + ('\U000007b1', '\U000007b1'), + ('\U000007ca', '\U000007ea'), + ('\U000007f4', '\U000007f5'), + ('\U000007fa', '\U000007fa'), + ('\U00000800', '\U00000815'), + ('\U0000081a', '\U0000081a'), + ('\U00000824', '\U00000824'), + ('\U00000828', '\U00000828'), + ('\U00000840', '\U00000858'), + ('\U000008a0', '\U000008a0'), + ('\U000008a2', '\U000008ac'), + ('\U00000904', '\U00000939'), + ('\U0000093d', '\U0000093d'), + ('\U00000950', '\U00000950'), + ('\U00000958', '\U00000961'), + ('\U00000971', '\U00000977'), + ('\U00000979', '\U0000097f'), + ('\U00000985', '\U0000098c'), + ('\U0000098f', '\U00000990'), + ('\U00000993', '\U000009a8'), + ('\U000009aa', '\U000009b0'), + ('\U000009b2', '\U000009b2'), + ('\U000009b6', '\U000009b9'), + ('\U000009bd', '\U000009bd'), + ('\U000009ce', '\U000009ce'), + ('\U000009dc', '\U000009dd'), + ('\U000009df', '\U000009e1'), + ('\U000009f0', '\U000009f1'), + ('\U00000a05', '\U00000a0a'), + ('\U00000a0f', '\U00000a10'), + ('\U00000a13', '\U00000a28'), 
+ ('\U00000a2a', '\U00000a30'), + ('\U00000a32', '\U00000a33'), + ('\U00000a35', '\U00000a36'), + ('\U00000a38', '\U00000a39'), + ('\U00000a59', '\U00000a5c'), + ('\U00000a5e', '\U00000a5e'), + ('\U00000a72', '\U00000a74'), + ('\U00000a85', '\U00000a8d'), + ('\U00000a8f', '\U00000a91'), + ('\U00000a93', '\U00000aa8'), + ('\U00000aaa', '\U00000ab0'), + ('\U00000ab2', '\U00000ab3'), + ('\U00000ab5', '\U00000ab9'), + ('\U00000abd', '\U00000abd'), + ('\U00000ad0', '\U00000ad0'), + ('\U00000ae0', '\U00000ae1'), + ('\U00000b05', '\U00000b0c'), + ('\U00000b0f', '\U00000b10'), + ('\U00000b13', '\U00000b28'), + ('\U00000b2a', '\U00000b30'), + ('\U00000b32', '\U00000b33'), + ('\U00000b35', '\U00000b39'), + ('\U00000b3d', '\U00000b3d'), + ('\U00000b5c', '\U00000b5d'), + ('\U00000b5f', '\U00000b61'), + ('\U00000b71', '\U00000b71'), + ('\U00000b83', '\U00000b83'), + ('\U00000b85', '\U00000b8a'), + ('\U00000b8e', '\U00000b90'), + ('\U00000b92', '\U00000b95'), + ('\U00000b99', '\U00000b9a'), + ('\U00000b9c', '\U00000b9c'), + ('\U00000b9e', '\U00000b9f'), + ('\U00000ba3', '\U00000ba4'), + ('\U00000ba8', '\U00000baa'), + ('\U00000bae', '\U00000bb9'), + ('\U00000bd0', '\U00000bd0'), + ('\U00000c05', '\U00000c0c'), + ('\U00000c0e', '\U00000c10'), + ('\U00000c12', '\U00000c28'), + ('\U00000c2a', '\U00000c33'), + ('\U00000c35', '\U00000c39'), + ('\U00000c3d', '\U00000c3d'), + ('\U00000c58', '\U00000c59'), + ('\U00000c60', '\U00000c61'), + ('\U00000c85', '\U00000c8c'), + ('\U00000c8e', '\U00000c90'), + ('\U00000c92', '\U00000ca8'), + ('\U00000caa', '\U00000cb3'), + ('\U00000cb5', '\U00000cb9'), + ('\U00000cbd', '\U00000cbd'), + ('\U00000cde', '\U00000cde'), + ('\U00000ce0', '\U00000ce1'), + ('\U00000cf1', '\U00000cf2'), + ('\U00000d05', '\U00000d0c'), + ('\U00000d0e', '\U00000d10'), + ('\U00000d12', '\U00000d3a'), + ('\U00000d3d', '\U00000d3d'), + ('\U00000d4e', '\U00000d4e'), + ('\U00000d60', '\U00000d61'), + ('\U00000d7a', '\U00000d7f'), + ('\U00000d85', '\U00000d96'), + 
('\U00000d9a', '\U00000db1'), + ('\U00000db3', '\U00000dbb'), + ('\U00000dbd', '\U00000dbd'), + ('\U00000dc0', '\U00000dc6'), + ('\U00000e01', '\U00000e30'), + ('\U00000e32', '\U00000e33'), + ('\U00000e40', '\U00000e46'), + ('\U00000e81', '\U00000e82'), + ('\U00000e84', '\U00000e84'), + ('\U00000e87', '\U00000e88'), + ('\U00000e8a', '\U00000e8a'), + ('\U00000e8d', '\U00000e8d'), + ('\U00000e94', '\U00000e97'), + ('\U00000e99', '\U00000e9f'), + ('\U00000ea1', '\U00000ea3'), + ('\U00000ea5', '\U00000ea5'), + ('\U00000ea7', '\U00000ea7'), + ('\U00000eaa', '\U00000eab'), + ('\U00000ead', '\U00000eb0'), + ('\U00000eb2', '\U00000eb3'), + ('\U00000ebd', '\U00000ebd'), + ('\U00000ec0', '\U00000ec4'), + ('\U00000ec6', '\U00000ec6'), + ('\U00000edc', '\U00000edf'), + ('\U00000f00', '\U00000f00'), + ('\U00000f40', '\U00000f47'), + ('\U00000f49', '\U00000f6c'), + ('\U00000f88', '\U00000f8c'), + ('\U00001000', '\U0000102a'), + ('\U0000103f', '\U0000103f'), + ('\U00001050', '\U00001055'), + ('\U0000105a', '\U0000105d'), + ('\U00001061', '\U00001061'), + ('\U00001065', '\U00001066'), + ('\U0000106e', '\U00001070'), + ('\U00001075', '\U00001081'), + ('\U0000108e', '\U0000108e'), + ('\U000010a0', '\U000010c5'), + ('\U000010c7', '\U000010c7'), + ('\U000010cd', '\U000010cd'), + ('\U000010d0', '\U000010fa'), + ('\U000010fc', '\U00001248'), + ('\U0000124a', '\U0000124d'), + ('\U00001250', '\U00001256'), + ('\U00001258', '\U00001258'), + ('\U0000125a', '\U0000125d'), + ('\U00001260', '\U00001288'), + ('\U0000128a', '\U0000128d'), + ('\U00001290', '\U000012b0'), + ('\U000012b2', '\U000012b5'), + ('\U000012b8', '\U000012be'), + ('\U000012c0', '\U000012c0'), + ('\U000012c2', '\U000012c5'), + ('\U000012c8', '\U000012d6'), + ('\U000012d8', '\U00001310'), + ('\U00001312', '\U00001315'), + ('\U00001318', '\U0000135a'), + ('\U00001380', '\U0000138f'), + ('\U000013a0', '\U000013f4'), + ('\U00001401', '\U0000166c'), + ('\U0000166f', '\U0000167f'), + ('\U00001681', '\U0000169a'), + ('\U000016a0', 
'\U000016ea'), + ('\U00001700', '\U0000170c'), + ('\U0000170e', '\U00001711'), + ('\U00001720', '\U00001731'), + ('\U00001740', '\U00001751'), + ('\U00001760', '\U0000176c'), + ('\U0000176e', '\U00001770'), + ('\U00001780', '\U000017b3'), + ('\U000017d7', '\U000017d7'), + ('\U000017dc', '\U000017dc'), + ('\U00001820', '\U00001877'), + ('\U00001880', '\U000018a8'), + ('\U000018aa', '\U000018aa'), + ('\U000018b0', '\U000018f5'), + ('\U00001900', '\U0000191c'), + ('\U00001950', '\U0000196d'), + ('\U00001970', '\U00001974'), + ('\U00001980', '\U000019ab'), + ('\U000019c1', '\U000019c7'), + ('\U00001a00', '\U00001a16'), + ('\U00001a20', '\U00001a54'), + ('\U00001aa7', '\U00001aa7'), + ('\U00001b05', '\U00001b33'), + ('\U00001b45', '\U00001b4b'), + ('\U00001b83', '\U00001ba0'), + ('\U00001bae', '\U00001baf'), + ('\U00001bba', '\U00001be5'), + ('\U00001c00', '\U00001c23'), + ('\U00001c4d', '\U00001c4f'), + ('\U00001c5a', '\U00001c7d'), + ('\U00001ce9', '\U00001cec'), + ('\U00001cee', '\U00001cf1'), + ('\U00001cf5', '\U00001cf6'), + ('\U00001d00', '\U00001dbf'), + ('\U00001e00', '\U00001f15'), + ('\U00001f18', '\U00001f1d'), + ('\U00001f20', '\U00001f45'), + ('\U00001f48', '\U00001f4d'), + ('\U00001f50', '\U00001f57'), + ('\U00001f59', '\U00001f59'), + ('\U00001f5b', '\U00001f5b'), + ('\U00001f5d', '\U00001f5d'), + ('\U00001f5f', '\U00001f7d'), + ('\U00001f80', '\U00001fb4'), + ('\U00001fb6', '\U00001fbc'), + ('\U00001fbe', '\U00001fbe'), + ('\U00001fc2', '\U00001fc4'), + ('\U00001fc6', '\U00001fcc'), + ('\U00001fd0', '\U00001fd3'), + ('\U00001fd6', '\U00001fdb'), + ('\U00001fe0', '\U00001fec'), + ('\U00001ff2', '\U00001ff4'), + ('\U00001ff6', '\U00001ffc'), + ('\U00002071', '\U00002071'), + ('\U0000207f', '\U0000207f'), + ('\U00002090', '\U0000209c'), + ('\U00002102', '\U00002102'), + ('\U00002107', '\U00002107'), + ('\U0000210a', '\U00002113'), + ('\U00002115', '\U00002115'), + ('\U00002119', '\U0000211d'), + ('\U00002124', '\U00002124'), + ('\U00002126', '\U00002126'), 
+ ('\U00002128', '\U00002128'), + ('\U0000212a', '\U0000212d'), + ('\U0000212f', '\U00002139'), + ('\U0000213c', '\U0000213f'), + ('\U00002145', '\U00002149'), + ('\U0000214e', '\U0000214e'), + ('\U00002183', '\U00002184'), + ('\U00002c00', '\U00002c2e'), + ('\U00002c30', '\U00002c5e'), + ('\U00002c60', '\U00002ce4'), + ('\U00002ceb', '\U00002cee'), + ('\U00002cf2', '\U00002cf3'), + ('\U00002d00', '\U00002d25'), + ('\U00002d27', '\U00002d27'), + ('\U00002d2d', '\U00002d2d'), + ('\U00002d30', '\U00002d67'), + ('\U00002d6f', '\U00002d6f'), + ('\U00002d80', '\U00002d96'), + ('\U00002da0', '\U00002da6'), + ('\U00002da8', '\U00002dae'), + ('\U00002db0', '\U00002db6'), + ('\U00002db8', '\U00002dbe'), + ('\U00002dc0', '\U00002dc6'), + ('\U00002dc8', '\U00002dce'), + ('\U00002dd0', '\U00002dd6'), + ('\U00002dd8', '\U00002dde'), + ('\U00002e2f', '\U00002e2f'), + ('\U00003005', '\U00003006'), + ('\U00003031', '\U00003035'), + ('\U0000303b', '\U0000303c'), + ('\U00003041', '\U00003096'), + ('\U0000309d', '\U0000309f'), + ('\U000030a1', '\U000030fa'), + ('\U000030fc', '\U000030ff'), + ('\U00003105', '\U0000312d'), + ('\U00003131', '\U0000318e'), + ('\U000031a0', '\U000031ba'), + ('\U000031f0', '\U000031ff'), + ('\U00003400', '\U00003400'), + ('\U00004db5', '\U00004db5'), + ('\U00004e00', '\U00004e00'), + ('\U00009fcc', '\U00009fcc'), + ('\U0000a000', '\U0000a48c'), + ('\U0000a4d0', '\U0000a4fd'), + ('\U0000a500', '\U0000a60c'), + ('\U0000a610', '\U0000a61f'), + ('\U0000a62a', '\U0000a62b'), + ('\U0000a640', '\U0000a66e'), + ('\U0000a67f', '\U0000a697'), + ('\U0000a6a0', '\U0000a6e5'), + ('\U0000a717', '\U0000a71f'), + ('\U0000a722', '\U0000a788'), + ('\U0000a78b', '\U0000a78e'), + ('\U0000a790', '\U0000a793'), + ('\U0000a7a0', '\U0000a7aa'), + ('\U0000a7f8', '\U0000a801'), + ('\U0000a803', '\U0000a805'), + ('\U0000a807', '\U0000a80a'), + ('\U0000a80c', '\U0000a822'), + ('\U0000a840', '\U0000a873'), + ('\U0000a882', '\U0000a8b3'), + ('\U0000a8f2', '\U0000a8f7'), + 
('\U0000a8fb', '\U0000a8fb'), + ('\U0000a90a', '\U0000a925'), + ('\U0000a930', '\U0000a946'), + ('\U0000a960', '\U0000a97c'), + ('\U0000a984', '\U0000a9b2'), + ('\U0000a9cf', '\U0000a9cf'), + ('\U0000aa00', '\U0000aa28'), + ('\U0000aa40', '\U0000aa42'), + ('\U0000aa44', '\U0000aa4b'), + ('\U0000aa60', '\U0000aa76'), + ('\U0000aa7a', '\U0000aa7a'), + ('\U0000aa80', '\U0000aaaf'), + ('\U0000aab1', '\U0000aab1'), + ('\U0000aab5', '\U0000aab6'), + ('\U0000aab9', '\U0000aabd'), + ('\U0000aac0', '\U0000aac0'), + ('\U0000aac2', '\U0000aac2'), + ('\U0000aadb', '\U0000aadd'), + ('\U0000aae0', '\U0000aaea'), + ('\U0000aaf2', '\U0000aaf4'), + ('\U0000ab01', '\U0000ab06'), + ('\U0000ab09', '\U0000ab0e'), + ('\U0000ab11', '\U0000ab16'), + ('\U0000ab20', '\U0000ab26'), + ('\U0000ab28', '\U0000ab2e'), + ('\U0000abc0', '\U0000abe2'), + ('\U0000ac00', '\U0000ac00'), + ('\U0000d7a3', '\U0000d7a3'), + ('\U0000d7b0', '\U0000d7c6'), + ('\U0000d7cb', '\U0000d7fb'), + ('\U0000f900', '\U0000fa6d'), + ('\U0000fa70', '\U0000fad9'), + ('\U0000fb00', '\U0000fb06'), + ('\U0000fb13', '\U0000fb17'), + ('\U0000fb1d', '\U0000fb1d'), + ('\U0000fb1f', '\U0000fb28'), + ('\U0000fb2a', '\U0000fb36'), + ('\U0000fb38', '\U0000fb3c'), + ('\U0000fb3e', '\U0000fb3e'), + ('\U0000fb40', '\U0000fb41'), + ('\U0000fb43', '\U0000fb44'), + ('\U0000fb46', '\U0000fbb1'), + ('\U0000fbd3', '\U0000fd3d'), + ('\U0000fd50', '\U0000fd8f'), + ('\U0000fd92', '\U0000fdc7'), + ('\U0000fdf0', '\U0000fdfb'), + ('\U0000fe70', '\U0000fe74'), + ('\U0000fe76', '\U0000fefc'), + ('\U0000ff21', '\U0000ff3a'), + ('\U0000ff41', '\U0000ff5a'), + ('\U0000ff66', '\U0000ffbe'), + ('\U0000ffc2', '\U0000ffc7'), + ('\U0000ffca', '\U0000ffcf'), + ('\U0000ffd2', '\U0000ffd7'), + ('\U0000ffda', '\U0000ffdc'), + ('\U00010000', '\U0001000b'), + ('\U0001000d', '\U00010026'), + ('\U00010028', '\U0001003a'), + ('\U0001003c', '\U0001003d'), + ('\U0001003f', '\U0001004d'), + ('\U00010050', '\U0001005d'), + ('\U00010080', '\U000100fa'), + ('\U00010280', 
'\U0001029c'), + ('\U000102a0', '\U000102d0'), + ('\U00010300', '\U0001031e'), + ('\U00010330', '\U00010340'), + ('\U00010342', '\U00010349'), + ('\U00010380', '\U0001039d'), + ('\U000103a0', '\U000103c3'), + ('\U000103c8', '\U000103cf'), + ('\U00010400', '\U0001049d'), + ('\U00010800', '\U00010805'), + ('\U00010808', '\U00010808'), + ('\U0001080a', '\U00010835'), + ('\U00010837', '\U00010838'), + ('\U0001083c', '\U0001083c'), + ('\U0001083f', '\U00010855'), + ('\U00010900', '\U00010915'), + ('\U00010920', '\U00010939'), + ('\U00010980', '\U000109b7'), + ('\U000109be', '\U000109bf'), + ('\U00010a00', '\U00010a00'), + ('\U00010a10', '\U00010a13'), + ('\U00010a15', '\U00010a17'), + ('\U00010a19', '\U00010a33'), + ('\U00010a60', '\U00010a7c'), + ('\U00010b00', '\U00010b35'), + ('\U00010b40', '\U00010b55'), + ('\U00010b60', '\U00010b72'), + ('\U00010c00', '\U00010c48'), + ('\U00011003', '\U00011037'), + ('\U00011083', '\U000110af'), + ('\U000110d0', '\U000110e8'), + ('\U00011103', '\U00011126'), + ('\U00011183', '\U000111b2'), + ('\U000111c1', '\U000111c4'), + ('\U00011680', '\U000116aa'), + ('\U00012000', '\U0001236e'), + ('\U00013000', '\U0001342e'), + ('\U00016800', '\U00016a38'), + ('\U00016f00', '\U00016f44'), + ('\U00016f50', '\U00016f50'), + ('\U00016f93', '\U00016f9f'), + ('\U0001b000', '\U0001b001'), + ('\U0001d400', '\U0001d454'), + ('\U0001d456', '\U0001d49c'), + ('\U0001d49e', '\U0001d49f'), + ('\U0001d4a2', '\U0001d4a2'), + ('\U0001d4a5', '\U0001d4a6'), + ('\U0001d4a9', '\U0001d4ac'), + ('\U0001d4ae', '\U0001d4b9'), + ('\U0001d4bb', '\U0001d4bb'), + ('\U0001d4bd', '\U0001d4c3'), + ('\U0001d4c5', '\U0001d505'), + ('\U0001d507', '\U0001d50a'), + ('\U0001d50d', '\U0001d514'), + ('\U0001d516', '\U0001d51c'), + ('\U0001d51e', '\U0001d539'), + ('\U0001d53b', '\U0001d53e'), + ('\U0001d540', '\U0001d544'), + ('\U0001d546', '\U0001d546'), + ('\U0001d54a', '\U0001d550'), + ('\U0001d552', '\U0001d6a5'), + ('\U0001d6a8', '\U0001d6c0'), + ('\U0001d6c2', '\U0001d6da'), 
+ ('\U0001d6dc', '\U0001d6fa'), + ('\U0001d6fc', '\U0001d714'), + ('\U0001d716', '\U0001d734'), + ('\U0001d736', '\U0001d74e'), + ('\U0001d750', '\U0001d76e'), + ('\U0001d770', '\U0001d788'), + ('\U0001d78a', '\U0001d7a8'), + ('\U0001d7aa', '\U0001d7c2'), + ('\U0001d7c4', '\U0001d7cb'), + ('\U0001ee00', '\U0001ee03'), + ('\U0001ee05', '\U0001ee1f'), + ('\U0001ee21', '\U0001ee22'), + ('\U0001ee24', '\U0001ee24'), + ('\U0001ee27', '\U0001ee27'), + ('\U0001ee29', '\U0001ee32'), + ('\U0001ee34', '\U0001ee37'), + ('\U0001ee39', '\U0001ee39'), + ('\U0001ee3b', '\U0001ee3b'), + ('\U0001ee42', '\U0001ee42'), + ('\U0001ee47', '\U0001ee47'), + ('\U0001ee49', '\U0001ee49'), + ('\U0001ee4b', '\U0001ee4b'), + ('\U0001ee4d', '\U0001ee4f'), + ('\U0001ee51', '\U0001ee52'), + ('\U0001ee54', '\U0001ee54'), + ('\U0001ee57', '\U0001ee57'), + ('\U0001ee59', '\U0001ee59'), + ('\U0001ee5b', '\U0001ee5b'), + ('\U0001ee5d', '\U0001ee5d'), + ('\U0001ee5f', '\U0001ee5f'), + ('\U0001ee61', '\U0001ee62'), + ('\U0001ee64', '\U0001ee64'), + ('\U0001ee67', '\U0001ee6a'), + ('\U0001ee6c', '\U0001ee72'), + ('\U0001ee74', '\U0001ee77'), + ('\U0001ee79', '\U0001ee7c'), + ('\U0001ee7e', '\U0001ee7e'), + ('\U0001ee80', '\U0001ee89'), + ('\U0001ee8b', '\U0001ee9b'), + ('\U0001eea1', '\U0001eea3'), + ('\U0001eea5', '\U0001eea9'), + ('\U0001eeab', '\U0001eebb'), + ('\U00020000', '\U00020000'), + ('\U0002a6d6', '\U0002a6d6'), + ('\U0002a700', '\U0002a700'), + ('\U0002b734', '\U0002b734'), + ('\U0002b740', '\U0002b740'), + ('\U0002b81d', '\U0002b81d'), + ('\U0002f800', '\U0002fa1d') + ]), +("LC", &[ + ('\U00000041', '\U0000005a'), + ('\U00000061', '\U0000007a'), + ('\U000000b5', '\U000000b5'), + ('\U000000c0', '\U000000d6'), + ('\U000000d8', '\U000000f6'), + ('\U000000f8', '\U000001ba'), + ('\U000001bc', '\U000001bf'), + ('\U000001c4', '\U00000293'), + ('\U00000295', '\U000002af'), + ('\U00000370', '\U00000373'), + ('\U00000376', '\U00000377'), + ('\U0000037b', '\U0000037d'), + ('\U00000386', '\U00000386'), 
+ ('\U00000388', '\U0000038a'), + ('\U0000038c', '\U0000038c'), + ('\U0000038e', '\U000003a1'), + ('\U000003a3', '\U000003f5'), + ('\U000003f7', '\U00000481'), + ('\U0000048a', '\U00000527'), + ('\U00000531', '\U00000556'), + ('\U00000561', '\U00000587'), + ('\U000010a0', '\U000010c5'), + ('\U000010c7', '\U000010c7'), + ('\U000010cd', '\U000010cd'), + ('\U00001d00', '\U00001d2b'), + ('\U00001d6b', '\U00001d77'), + ('\U00001d79', '\U00001d9a'), + ('\U00001e00', '\U00001f15'), + ('\U00001f18', '\U00001f1d'), + ('\U00001f20', '\U00001f45'), + ('\U00001f48', '\U00001f4d'), + ('\U00001f50', '\U00001f57'), + ('\U00001f59', '\U00001f59'), + ('\U00001f5b', '\U00001f5b'), + ('\U00001f5d', '\U00001f5d'), + ('\U00001f5f', '\U00001f7d'), + ('\U00001f80', '\U00001fb4'), + ('\U00001fb6', '\U00001fbc'), + ('\U00001fbe', '\U00001fbe'), + ('\U00001fc2', '\U00001fc4'), + ('\U00001fc6', '\U00001fcc'), + ('\U00001fd0', '\U00001fd3'), + ('\U00001fd6', '\U00001fdb'), + ('\U00001fe0', '\U00001fec'), + ('\U00001ff2', '\U00001ff4'), + ('\U00001ff6', '\U00001ffc'), + ('\U00002102', '\U00002102'), + ('\U00002107', '\U00002107'), + ('\U0000210a', '\U00002113'), + ('\U00002115', '\U00002115'), + ('\U00002119', '\U0000211d'), + ('\U00002124', '\U00002124'), + ('\U00002126', '\U00002126'), + ('\U00002128', '\U00002128'), + ('\U0000212a', '\U0000212d'), + ('\U0000212f', '\U00002134'), + ('\U00002139', '\U00002139'), + ('\U0000213c', '\U0000213f'), + ('\U00002145', '\U00002149'), + ('\U0000214e', '\U0000214e'), + ('\U00002183', '\U00002184'), + ('\U00002c00', '\U00002c2e'), + ('\U00002c30', '\U00002c5e'), + ('\U00002c60', '\U00002c7b'), + ('\U00002c7e', '\U00002ce4'), + ('\U00002ceb', '\U00002cee'), + ('\U00002cf2', '\U00002cf3'), + ('\U00002d00', '\U00002d25'), + ('\U00002d27', '\U00002d27'), + ('\U00002d2d', '\U00002d2d'), + ('\U0000a640', '\U0000a66d'), + ('\U0000a680', '\U0000a697'), + ('\U0000a722', '\U0000a76f'), + ('\U0000a771', '\U0000a787'), + ('\U0000a78b', '\U0000a78e'), + 
('\U0000a790', '\U0000a793'), + ('\U0000a7a0', '\U0000a7aa'), + ('\U0000a7fa', '\U0000a7fa'), + ('\U0000fb00', '\U0000fb06'), + ('\U0000fb13', '\U0000fb17'), + ('\U0000ff21', '\U0000ff3a'), + ('\U0000ff41', '\U0000ff5a'), + ('\U00010400', '\U0001044f'), + ('\U0001d400', '\U0001d454'), + ('\U0001d456', '\U0001d49c'), + ('\U0001d49e', '\U0001d49f'), + ('\U0001d4a2', '\U0001d4a2'), + ('\U0001d4a5', '\U0001d4a6'), + ('\U0001d4a9', '\U0001d4ac'), + ('\U0001d4ae', '\U0001d4b9'), + ('\U0001d4bb', '\U0001d4bb'), + ('\U0001d4bd', '\U0001d4c3'), + ('\U0001d4c5', '\U0001d505'), + ('\U0001d507', '\U0001d50a'), + ('\U0001d50d', '\U0001d514'), + ('\U0001d516', '\U0001d51c'), + ('\U0001d51e', '\U0001d539'), + ('\U0001d53b', '\U0001d53e'), + ('\U0001d540', '\U0001d544'), + ('\U0001d546', '\U0001d546'), + ('\U0001d54a', '\U0001d550'), + ('\U0001d552', '\U0001d6a5'), + ('\U0001d6a8', '\U0001d6c0'), + ('\U0001d6c2', '\U0001d6da'), + ('\U0001d6dc', '\U0001d6fa'), + ('\U0001d6fc', '\U0001d714'), + ('\U0001d716', '\U0001d734'), + ('\U0001d736', '\U0001d74e'), + ('\U0001d750', '\U0001d76e'), + ('\U0001d770', '\U0001d788'), + ('\U0001d78a', '\U0001d7a8'), + ('\U0001d7aa', '\U0001d7c2'), + ('\U0001d7c4', '\U0001d7cb') + ]), +("Lao", &[ + ('\U00000e81', '\U00000e82'), + ('\U00000e84', '\U00000e84'), + ('\U00000e87', '\U00000e88'), + ('\U00000e8a', '\U00000e8a'), + ('\U00000e8d', '\U00000e8d'), + ('\U00000e94', '\U00000e97'), + ('\U00000e99', '\U00000e9f'), + ('\U00000ea1', '\U00000ea3'), + ('\U00000ea5', '\U00000ea5'), + ('\U00000ea7', '\U00000ea7'), + ('\U00000eaa', '\U00000eab'), + ('\U00000ead', '\U00000eb9'), + ('\U00000ebb', '\U00000ebd'), + ('\U00000ec0', '\U00000ec4'), + ('\U00000ec6', '\U00000ec6'), + ('\U00000ec8', '\U00000ecd'), + ('\U00000ed0', '\U00000ed9'), + ('\U00000edc', '\U00000edf') + ]), +("Latin", &[ + ('\U00000041', '\U0000005a'), + ('\U00000061', '\U0000007a'), + ('\U000000aa', '\U000000aa'), + ('\U000000ba', '\U000000ba'), + ('\U000000c0', '\U000000d6'), + 
('\U000000d8', '\U000000f6'), + ('\U000000f8', '\U000002b8'), + ('\U000002e0', '\U000002e4'), + ('\U00001d00', '\U00001d25'), + ('\U00001d2c', '\U00001d5c'), + ('\U00001d62', '\U00001d65'), + ('\U00001d6b', '\U00001d77'), + ('\U00001d79', '\U00001dbe'), + ('\U00001e00', '\U00001eff'), + ('\U00002071', '\U00002071'), + ('\U0000207f', '\U0000207f'), + ('\U00002090', '\U0000209c'), + ('\U0000212a', '\U0000212b'), + ('\U00002132', '\U00002132'), + ('\U0000214e', '\U0000214e'), + ('\U00002160', '\U00002188'), + ('\U00002c60', '\U00002c7f'), + ('\U0000a722', '\U0000a787'), + ('\U0000a78b', '\U0000a78e'), + ('\U0000a790', '\U0000a793'), + ('\U0000a7a0', '\U0000a7aa'), + ('\U0000a7f8', '\U0000a7ff'), + ('\U0000fb00', '\U0000fb06'), + ('\U0000ff21', '\U0000ff3a'), + ('\U0000ff41', '\U0000ff5a') + ]), +("Lepcha", &[ + ('\U00001c00', '\U00001c37'), + ('\U00001c3b', '\U00001c49'), + ('\U00001c4d', '\U00001c4f') + ]), +("Limbu", &[ + ('\U00001900', '\U0000191c'), + ('\U00001920', '\U0000192b'), + ('\U00001930', '\U0000193b'), + ('\U00001940', '\U00001940'), + ('\U00001944', '\U0000194f') + ]), +("Linear_B", &[ + ('\U00010000', '\U0001000b'), + ('\U0001000d', '\U00010026'), + ('\U00010028', '\U0001003a'), + ('\U0001003c', '\U0001003d'), + ('\U0001003f', '\U0001004d'), + ('\U00010050', '\U0001005d'), + ('\U00010080', '\U000100fa') + ]), +("Lisu", &[ + ('\U0000a4d0', '\U0000a4ff') + ]), +("Ll", &[ + ('\U00000061', '\U0000007a'), + ('\U000000b5', '\U000000b5'), + ('\U000000df', '\U000000f6'), + ('\U000000f8', '\U000000ff'), + ('\U00000101', '\U00000101'), + ('\U00000103', '\U00000103'), + ('\U00000105', '\U00000105'), + ('\U00000107', '\U00000107'), + ('\U00000109', '\U00000109'), + ('\U0000010b', '\U0000010b'), + ('\U0000010d', '\U0000010d'), + ('\U0000010f', '\U0000010f'), + ('\U00000111', '\U00000111'), + ('\U00000113', '\U00000113'), + ('\U00000115', '\U00000115'), + ('\U00000117', '\U00000117'), + ('\U00000119', '\U00000119'), + ('\U0000011b', '\U0000011b'), + ('\U0000011d', 
'\U0000011d'), + ('\U0000011f', '\U0000011f'), + ('\U00000121', '\U00000121'), + ('\U00000123', '\U00000123'), + ('\U00000125', '\U00000125'), + ('\U00000127', '\U00000127'), + ('\U00000129', '\U00000129'), + ('\U0000012b', '\U0000012b'), + ('\U0000012d', '\U0000012d'), + ('\U0000012f', '\U0000012f'), + ('\U00000131', '\U00000131'), + ('\U00000133', '\U00000133'), + ('\U00000135', '\U00000135'), + ('\U00000137', '\U00000138'), + ('\U0000013a', '\U0000013a'), + ('\U0000013c', '\U0000013c'), + ('\U0000013e', '\U0000013e'), + ('\U00000140', '\U00000140'), + ('\U00000142', '\U00000142'), + ('\U00000144', '\U00000144'), + ('\U00000146', '\U00000146'), + ('\U00000148', '\U00000149'), + ('\U0000014b', '\U0000014b'), + ('\U0000014d', '\U0000014d'), + ('\U0000014f', '\U0000014f'), + ('\U00000151', '\U00000151'), + ('\U00000153', '\U00000153'), + ('\U00000155', '\U00000155'), + ('\U00000157', '\U00000157'), + ('\U00000159', '\U00000159'), + ('\U0000015b', '\U0000015b'), + ('\U0000015d', '\U0000015d'), + ('\U0000015f', '\U0000015f'), + ('\U00000161', '\U00000161'), + ('\U00000163', '\U00000163'), + ('\U00000165', '\U00000165'), + ('\U00000167', '\U00000167'), + ('\U00000169', '\U00000169'), + ('\U0000016b', '\U0000016b'), + ('\U0000016d', '\U0000016d'), + ('\U0000016f', '\U0000016f'), + ('\U00000171', '\U00000171'), + ('\U00000173', '\U00000173'), + ('\U00000175', '\U00000175'), + ('\U00000177', '\U00000177'), + ('\U0000017a', '\U0000017a'), + ('\U0000017c', '\U0000017c'), + ('\U0000017e', '\U00000180'), + ('\U00000183', '\U00000183'), + ('\U00000185', '\U00000185'), + ('\U00000188', '\U00000188'), + ('\U0000018c', '\U0000018d'), + ('\U00000192', '\U00000192'), + ('\U00000195', '\U00000195'), + ('\U00000199', '\U0000019b'), + ('\U0000019e', '\U0000019e'), + ('\U000001a1', '\U000001a1'), + ('\U000001a3', '\U000001a3'), + ('\U000001a5', '\U000001a5'), + ('\U000001a8', '\U000001a8'), + ('\U000001aa', '\U000001ab'), + ('\U000001ad', '\U000001ad'), + ('\U000001b0', '\U000001b0'), 
+ ('\U000001b4', '\U000001b4'), + ('\U000001b6', '\U000001b6'), + ('\U000001b9', '\U000001ba'), + ('\U000001bd', '\U000001bf'), + ('\U000001c6', '\U000001c6'), + ('\U000001c9', '\U000001c9'), + ('\U000001cc', '\U000001cc'), + ('\U000001ce', '\U000001ce'), + ('\U000001d0', '\U000001d0'), + ('\U000001d2', '\U000001d2'), + ('\U000001d4', '\U000001d4'), + ('\U000001d6', '\U000001d6'), + ('\U000001d8', '\U000001d8'), + ('\U000001da', '\U000001da'), + ('\U000001dc', '\U000001dd'), + ('\U000001df', '\U000001df'), + ('\U000001e1', '\U000001e1'), + ('\U000001e3', '\U000001e3'), + ('\U000001e5', '\U000001e5'), + ('\U000001e7', '\U000001e7'), + ('\U000001e9', '\U000001e9'), + ('\U000001eb', '\U000001eb'), + ('\U000001ed', '\U000001ed'), + ('\U000001ef', '\U000001f0'), + ('\U000001f3', '\U000001f3'), + ('\U000001f5', '\U000001f5'), + ('\U000001f9', '\U000001f9'), + ('\U000001fb', '\U000001fb'), + ('\U000001fd', '\U000001fd'), + ('\U000001ff', '\U000001ff'), + ('\U00000201', '\U00000201'), + ('\U00000203', '\U00000203'), + ('\U00000205', '\U00000205'), + ('\U00000207', '\U00000207'), + ('\U00000209', '\U00000209'), + ('\U0000020b', '\U0000020b'), + ('\U0000020d', '\U0000020d'), + ('\U0000020f', '\U0000020f'), + ('\U00000211', '\U00000211'), + ('\U00000213', '\U00000213'), + ('\U00000215', '\U00000215'), + ('\U00000217', '\U00000217'), + ('\U00000219', '\U00000219'), + ('\U0000021b', '\U0000021b'), + ('\U0000021d', '\U0000021d'), + ('\U0000021f', '\U0000021f'), + ('\U00000221', '\U00000221'), + ('\U00000223', '\U00000223'), + ('\U00000225', '\U00000225'), + ('\U00000227', '\U00000227'), + ('\U00000229', '\U00000229'), + ('\U0000022b', '\U0000022b'), + ('\U0000022d', '\U0000022d'), + ('\U0000022f', '\U0000022f'), + ('\U00000231', '\U00000231'), + ('\U00000233', '\U00000239'), + ('\U0000023c', '\U0000023c'), + ('\U0000023f', '\U00000240'), + ('\U00000242', '\U00000242'), + ('\U00000247', '\U00000247'), + ('\U00000249', '\U00000249'), + ('\U0000024b', '\U0000024b'), + 
('\U0000024d', '\U0000024d'), + ('\U0000024f', '\U00000293'), + ('\U00000295', '\U000002af'), + ('\U00000371', '\U00000371'), + ('\U00000373', '\U00000373'), + ('\U00000377', '\U00000377'), + ('\U0000037b', '\U0000037d'), + ('\U00000390', '\U00000390'), + ('\U000003ac', '\U000003ce'), + ('\U000003d0', '\U000003d1'), + ('\U000003d5', '\U000003d7'), + ('\U000003d9', '\U000003d9'), + ('\U000003db', '\U000003db'), + ('\U000003dd', '\U000003dd'), + ('\U000003df', '\U000003df'), + ('\U000003e1', '\U000003e1'), + ('\U000003e3', '\U000003e3'), + ('\U000003e5', '\U000003e5'), + ('\U000003e7', '\U000003e7'), + ('\U000003e9', '\U000003e9'), + ('\U000003eb', '\U000003eb'), + ('\U000003ed', '\U000003ed'), + ('\U000003ef', '\U000003f3'), + ('\U000003f5', '\U000003f5'), + ('\U000003f8', '\U000003f8'), + ('\U000003fb', '\U000003fc'), + ('\U00000430', '\U0000045f'), + ('\U00000461', '\U00000461'), + ('\U00000463', '\U00000463'), + ('\U00000465', '\U00000465'), + ('\U00000467', '\U00000467'), + ('\U00000469', '\U00000469'), + ('\U0000046b', '\U0000046b'), + ('\U0000046d', '\U0000046d'), + ('\U0000046f', '\U0000046f'), + ('\U00000471', '\U00000471'), + ('\U00000473', '\U00000473'), + ('\U00000475', '\U00000475'), + ('\U00000477', '\U00000477'), + ('\U00000479', '\U00000479'), + ('\U0000047b', '\U0000047b'), + ('\U0000047d', '\U0000047d'), + ('\U0000047f', '\U0000047f'), + ('\U00000481', '\U00000481'), + ('\U0000048b', '\U0000048b'), + ('\U0000048d', '\U0000048d'), + ('\U0000048f', '\U0000048f'), + ('\U00000491', '\U00000491'), + ('\U00000493', '\U00000493'), + ('\U00000495', '\U00000495'), + ('\U00000497', '\U00000497'), + ('\U00000499', '\U00000499'), + ('\U0000049b', '\U0000049b'), + ('\U0000049d', '\U0000049d'), + ('\U0000049f', '\U0000049f'), + ('\U000004a1', '\U000004a1'), + ('\U000004a3', '\U000004a3'), + ('\U000004a5', '\U000004a5'), + ('\U000004a7', '\U000004a7'), + ('\U000004a9', '\U000004a9'), + ('\U000004ab', '\U000004ab'), + ('\U000004ad', '\U000004ad'), + ('\U000004af', 
'\U000004af'), + ('\U000004b1', '\U000004b1'), + ('\U000004b3', '\U000004b3'), + ('\U000004b5', '\U000004b5'), + ('\U000004b7', '\U000004b7'), + ('\U000004b9', '\U000004b9'), + ('\U000004bb', '\U000004bb'), + ('\U000004bd', '\U000004bd'), + ('\U000004bf', '\U000004bf'), + ('\U000004c2', '\U000004c2'), + ('\U000004c4', '\U000004c4'), + ('\U000004c6', '\U000004c6'), + ('\U000004c8', '\U000004c8'), + ('\U000004ca', '\U000004ca'), + ('\U000004cc', '\U000004cc'), + ('\U000004ce', '\U000004cf'), + ('\U000004d1', '\U000004d1'), + ('\U000004d3', '\U000004d3'), + ('\U000004d5', '\U000004d5'), + ('\U000004d7', '\U000004d7'), + ('\U000004d9', '\U000004d9'), + ('\U000004db', '\U000004db'), + ('\U000004dd', '\U000004dd'), + ('\U000004df', '\U000004df'), + ('\U000004e1', '\U000004e1'), + ('\U000004e3', '\U000004e3'), + ('\U000004e5', '\U000004e5'), + ('\U000004e7', '\U000004e7'), + ('\U000004e9', '\U000004e9'), + ('\U000004eb', '\U000004eb'), + ('\U000004ed', '\U000004ed'), + ('\U000004ef', '\U000004ef'), + ('\U000004f1', '\U000004f1'), + ('\U000004f3', '\U000004f3'), + ('\U000004f5', '\U000004f5'), + ('\U000004f7', '\U000004f7'), + ('\U000004f9', '\U000004f9'), + ('\U000004fb', '\U000004fb'), + ('\U000004fd', '\U000004fd'), + ('\U000004ff', '\U000004ff'), + ('\U00000501', '\U00000501'), + ('\U00000503', '\U00000503'), + ('\U00000505', '\U00000505'), + ('\U00000507', '\U00000507'), + ('\U00000509', '\U00000509'), + ('\U0000050b', '\U0000050b'), + ('\U0000050d', '\U0000050d'), + ('\U0000050f', '\U0000050f'), + ('\U00000511', '\U00000511'), + ('\U00000513', '\U00000513'), + ('\U00000515', '\U00000515'), + ('\U00000517', '\U00000517'), + ('\U00000519', '\U00000519'), + ('\U0000051b', '\U0000051b'), + ('\U0000051d', '\U0000051d'), + ('\U0000051f', '\U0000051f'), + ('\U00000521', '\U00000521'), + ('\U00000523', '\U00000523'), + ('\U00000525', '\U00000525'), + ('\U00000527', '\U00000527'), + ('\U00000561', '\U00000587'), + ('\U00001d00', '\U00001d2b'), + ('\U00001d6b', '\U00001d77'), 
+ ('\U00001d79', '\U00001d9a'), + ('\U00001e01', '\U00001e01'), + ('\U00001e03', '\U00001e03'), + ('\U00001e05', '\U00001e05'), + ('\U00001e07', '\U00001e07'), + ('\U00001e09', '\U00001e09'), + ('\U00001e0b', '\U00001e0b'), + ('\U00001e0d', '\U00001e0d'), + ('\U00001e0f', '\U00001e0f'), + ('\U00001e11', '\U00001e11'), + ('\U00001e13', '\U00001e13'), + ('\U00001e15', '\U00001e15'), + ('\U00001e17', '\U00001e17'), + ('\U00001e19', '\U00001e19'), + ('\U00001e1b', '\U00001e1b'), + ('\U00001e1d', '\U00001e1d'), + ('\U00001e1f', '\U00001e1f'), + ('\U00001e21', '\U00001e21'), + ('\U00001e23', '\U00001e23'), + ('\U00001e25', '\U00001e25'), + ('\U00001e27', '\U00001e27'), + ('\U00001e29', '\U00001e29'), + ('\U00001e2b', '\U00001e2b'), + ('\U00001e2d', '\U00001e2d'), + ('\U00001e2f', '\U00001e2f'), + ('\U00001e31', '\U00001e31'), + ('\U00001e33', '\U00001e33'), + ('\U00001e35', '\U00001e35'), + ('\U00001e37', '\U00001e37'), + ('\U00001e39', '\U00001e39'), + ('\U00001e3b', '\U00001e3b'), + ('\U00001e3d', '\U00001e3d'), + ('\U00001e3f', '\U00001e3f'), + ('\U00001e41', '\U00001e41'), + ('\U00001e43', '\U00001e43'), + ('\U00001e45', '\U00001e45'), + ('\U00001e47', '\U00001e47'), + ('\U00001e49', '\U00001e49'), + ('\U00001e4b', '\U00001e4b'), + ('\U00001e4d', '\U00001e4d'), + ('\U00001e4f', '\U00001e4f'), + ('\U00001e51', '\U00001e51'), + ('\U00001e53', '\U00001e53'), + ('\U00001e55', '\U00001e55'), + ('\U00001e57', '\U00001e57'), + ('\U00001e59', '\U00001e59'), + ('\U00001e5b', '\U00001e5b'), + ('\U00001e5d', '\U00001e5d'), + ('\U00001e5f', '\U00001e5f'), + ('\U00001e61', '\U00001e61'), + ('\U00001e63', '\U00001e63'), + ('\U00001e65', '\U00001e65'), + ('\U00001e67', '\U00001e67'), + ('\U00001e69', '\U00001e69'), + ('\U00001e6b', '\U00001e6b'), + ('\U00001e6d', '\U00001e6d'), + ('\U00001e6f', '\U00001e6f'), + ('\U00001e71', '\U00001e71'), + ('\U00001e73', '\U00001e73'), + ('\U00001e75', '\U00001e75'), + ('\U00001e77', '\U00001e77'), + ('\U00001e79', '\U00001e79'), + 
('\U00001e7b', '\U00001e7b'), + ('\U00001e7d', '\U00001e7d'), + ('\U00001e7f', '\U00001e7f'), + ('\U00001e81', '\U00001e81'), + ('\U00001e83', '\U00001e83'), + ('\U00001e85', '\U00001e85'), + ('\U00001e87', '\U00001e87'), + ('\U00001e89', '\U00001e89'), + ('\U00001e8b', '\U00001e8b'), + ('\U00001e8d', '\U00001e8d'), + ('\U00001e8f', '\U00001e8f'), + ('\U00001e91', '\U00001e91'), + ('\U00001e93', '\U00001e93'), + ('\U00001e95', '\U00001e9d'), + ('\U00001e9f', '\U00001e9f'), + ('\U00001ea1', '\U00001ea1'), + ('\U00001ea3', '\U00001ea3'), + ('\U00001ea5', '\U00001ea5'), + ('\U00001ea7', '\U00001ea7'), + ('\U00001ea9', '\U00001ea9'), + ('\U00001eab', '\U00001eab'), + ('\U00001ead', '\U00001ead'), + ('\U00001eaf', '\U00001eaf'), + ('\U00001eb1', '\U00001eb1'), + ('\U00001eb3', '\U00001eb3'), + ('\U00001eb5', '\U00001eb5'), + ('\U00001eb7', '\U00001eb7'), + ('\U00001eb9', '\U00001eb9'), + ('\U00001ebb', '\U00001ebb'), + ('\U00001ebd', '\U00001ebd'), + ('\U00001ebf', '\U00001ebf'), + ('\U00001ec1', '\U00001ec1'), + ('\U00001ec3', '\U00001ec3'), + ('\U00001ec5', '\U00001ec5'), + ('\U00001ec7', '\U00001ec7'), + ('\U00001ec9', '\U00001ec9'), + ('\U00001ecb', '\U00001ecb'), + ('\U00001ecd', '\U00001ecd'), + ('\U00001ecf', '\U00001ecf'), + ('\U00001ed1', '\U00001ed1'), + ('\U00001ed3', '\U00001ed3'), + ('\U00001ed5', '\U00001ed5'), + ('\U00001ed7', '\U00001ed7'), + ('\U00001ed9', '\U00001ed9'), + ('\U00001edb', '\U00001edb'), + ('\U00001edd', '\U00001edd'), + ('\U00001edf', '\U00001edf'), + ('\U00001ee1', '\U00001ee1'), + ('\U00001ee3', '\U00001ee3'), + ('\U00001ee5', '\U00001ee5'), + ('\U00001ee7', '\U00001ee7'), + ('\U00001ee9', '\U00001ee9'), + ('\U00001eeb', '\U00001eeb'), + ('\U00001eed', '\U00001eed'), + ('\U00001eef', '\U00001eef'), + ('\U00001ef1', '\U00001ef1'), + ('\U00001ef3', '\U00001ef3'), + ('\U00001ef5', '\U00001ef5'), + ('\U00001ef7', '\U00001ef7'), + ('\U00001ef9', '\U00001ef9'), + ('\U00001efb', '\U00001efb'), + ('\U00001efd', '\U00001efd'), + ('\U00001eff', 
'\U00001f07'), + ('\U00001f10', '\U00001f15'), + ('\U00001f20', '\U00001f27'), + ('\U00001f30', '\U00001f37'), + ('\U00001f40', '\U00001f45'), + ('\U00001f50', '\U00001f57'), + ('\U00001f60', '\U00001f67'), + ('\U00001f70', '\U00001f7d'), + ('\U00001f80', '\U00001f87'), + ('\U00001f90', '\U00001f97'), + ('\U00001fa0', '\U00001fa7'), + ('\U00001fb0', '\U00001fb4'), + ('\U00001fb6', '\U00001fb7'), + ('\U00001fbe', '\U00001fbe'), + ('\U00001fc2', '\U00001fc4'), + ('\U00001fc6', '\U00001fc7'), + ('\U00001fd0', '\U00001fd3'), + ('\U00001fd6', '\U00001fd7'), + ('\U00001fe0', '\U00001fe7'), + ('\U00001ff2', '\U00001ff4'), + ('\U00001ff6', '\U00001ff7'), + ('\U0000210a', '\U0000210a'), + ('\U0000210e', '\U0000210f'), + ('\U00002113', '\U00002113'), + ('\U0000212f', '\U0000212f'), + ('\U00002134', '\U00002134'), + ('\U00002139', '\U00002139'), + ('\U0000213c', '\U0000213d'), + ('\U00002146', '\U00002149'), + ('\U0000214e', '\U0000214e'), + ('\U00002184', '\U00002184'), + ('\U00002c30', '\U00002c5e'), + ('\U00002c61', '\U00002c61'), + ('\U00002c65', '\U00002c66'), + ('\U00002c68', '\U00002c68'), + ('\U00002c6a', '\U00002c6a'), + ('\U00002c6c', '\U00002c6c'), + ('\U00002c71', '\U00002c71'), + ('\U00002c73', '\U00002c74'), + ('\U00002c76', '\U00002c7b'), + ('\U00002c81', '\U00002c81'), + ('\U00002c83', '\U00002c83'), + ('\U00002c85', '\U00002c85'), + ('\U00002c87', '\U00002c87'), + ('\U00002c89', '\U00002c89'), + ('\U00002c8b', '\U00002c8b'), + ('\U00002c8d', '\U00002c8d'), + ('\U00002c8f', '\U00002c8f'), + ('\U00002c91', '\U00002c91'), + ('\U00002c93', '\U00002c93'), + ('\U00002c95', '\U00002c95'), + ('\U00002c97', '\U00002c97'), + ('\U00002c99', '\U00002c99'), + ('\U00002c9b', '\U00002c9b'), + ('\U00002c9d', '\U00002c9d'), + ('\U00002c9f', '\U00002c9f'), + ('\U00002ca1', '\U00002ca1'), + ('\U00002ca3', '\U00002ca3'), + ('\U00002ca5', '\U00002ca5'), + ('\U00002ca7', '\U00002ca7'), + ('\U00002ca9', '\U00002ca9'), + ('\U00002cab', '\U00002cab'), + ('\U00002cad', '\U00002cad'), 
+ ('\U00002caf', '\U00002caf'), + ('\U00002cb1', '\U00002cb1'), + ('\U00002cb3', '\U00002cb3'), + ('\U00002cb5', '\U00002cb5'), + ('\U00002cb7', '\U00002cb7'), + ('\U00002cb9', '\U00002cb9'), + ('\U00002cbb', '\U00002cbb'), + ('\U00002cbd', '\U00002cbd'), + ('\U00002cbf', '\U00002cbf'), + ('\U00002cc1', '\U00002cc1'), + ('\U00002cc3', '\U00002cc3'), + ('\U00002cc5', '\U00002cc5'), + ('\U00002cc7', '\U00002cc7'), + ('\U00002cc9', '\U00002cc9'), + ('\U00002ccb', '\U00002ccb'), + ('\U00002ccd', '\U00002ccd'), + ('\U00002ccf', '\U00002ccf'), + ('\U00002cd1', '\U00002cd1'), + ('\U00002cd3', '\U00002cd3'), + ('\U00002cd5', '\U00002cd5'), + ('\U00002cd7', '\U00002cd7'), + ('\U00002cd9', '\U00002cd9'), + ('\U00002cdb', '\U00002cdb'), + ('\U00002cdd', '\U00002cdd'), + ('\U00002cdf', '\U00002cdf'), + ('\U00002ce1', '\U00002ce1'), + ('\U00002ce3', '\U00002ce4'), + ('\U00002cec', '\U00002cec'), + ('\U00002cee', '\U00002cee'), + ('\U00002cf3', '\U00002cf3'), + ('\U00002d00', '\U00002d25'), + ('\U00002d27', '\U00002d27'), + ('\U00002d2d', '\U00002d2d'), + ('\U0000a641', '\U0000a641'), + ('\U0000a643', '\U0000a643'), + ('\U0000a645', '\U0000a645'), + ('\U0000a647', '\U0000a647'), + ('\U0000a649', '\U0000a649'), + ('\U0000a64b', '\U0000a64b'), + ('\U0000a64d', '\U0000a64d'), + ('\U0000a64f', '\U0000a64f'), + ('\U0000a651', '\U0000a651'), + ('\U0000a653', '\U0000a653'), + ('\U0000a655', '\U0000a655'), + ('\U0000a657', '\U0000a657'), + ('\U0000a659', '\U0000a659'), + ('\U0000a65b', '\U0000a65b'), + ('\U0000a65d', '\U0000a65d'), + ('\U0000a65f', '\U0000a65f'), + ('\U0000a661', '\U0000a661'), + ('\U0000a663', '\U0000a663'), + ('\U0000a665', '\U0000a665'), + ('\U0000a667', '\U0000a667'), + ('\U0000a669', '\U0000a669'), + ('\U0000a66b', '\U0000a66b'), + ('\U0000a66d', '\U0000a66d'), + ('\U0000a681', '\U0000a681'), + ('\U0000a683', '\U0000a683'), + ('\U0000a685', '\U0000a685'), + ('\U0000a687', '\U0000a687'), + ('\U0000a689', '\U0000a689'), + ('\U0000a68b', '\U0000a68b'), + 
('\U0000a68d', '\U0000a68d'), + ('\U0000a68f', '\U0000a68f'), + ('\U0000a691', '\U0000a691'), + ('\U0000a693', '\U0000a693'), + ('\U0000a695', '\U0000a695'), + ('\U0000a697', '\U0000a697'), + ('\U0000a723', '\U0000a723'), + ('\U0000a725', '\U0000a725'), + ('\U0000a727', '\U0000a727'), + ('\U0000a729', '\U0000a729'), + ('\U0000a72b', '\U0000a72b'), + ('\U0000a72d', '\U0000a72d'), + ('\U0000a72f', '\U0000a731'), + ('\U0000a733', '\U0000a733'), + ('\U0000a735', '\U0000a735'), + ('\U0000a737', '\U0000a737'), + ('\U0000a739', '\U0000a739'), + ('\U0000a73b', '\U0000a73b'), + ('\U0000a73d', '\U0000a73d'), + ('\U0000a73f', '\U0000a73f'), + ('\U0000a741', '\U0000a741'), + ('\U0000a743', '\U0000a743'), + ('\U0000a745', '\U0000a745'), + ('\U0000a747', '\U0000a747'), + ('\U0000a749', '\U0000a749'), + ('\U0000a74b', '\U0000a74b'), + ('\U0000a74d', '\U0000a74d'), + ('\U0000a74f', '\U0000a74f'), + ('\U0000a751', '\U0000a751'), + ('\U0000a753', '\U0000a753'), + ('\U0000a755', '\U0000a755'), + ('\U0000a757', '\U0000a757'), + ('\U0000a759', '\U0000a759'), + ('\U0000a75b', '\U0000a75b'), + ('\U0000a75d', '\U0000a75d'), + ('\U0000a75f', '\U0000a75f'), + ('\U0000a761', '\U0000a761'), + ('\U0000a763', '\U0000a763'), + ('\U0000a765', '\U0000a765'), + ('\U0000a767', '\U0000a767'), + ('\U0000a769', '\U0000a769'), + ('\U0000a76b', '\U0000a76b'), + ('\U0000a76d', '\U0000a76d'), + ('\U0000a76f', '\U0000a76f'), + ('\U0000a771', '\U0000a778'), + ('\U0000a77a', '\U0000a77a'), + ('\U0000a77c', '\U0000a77c'), + ('\U0000a77f', '\U0000a77f'), + ('\U0000a781', '\U0000a781'), + ('\U0000a783', '\U0000a783'), + ('\U0000a785', '\U0000a785'), + ('\U0000a787', '\U0000a787'), + ('\U0000a78c', '\U0000a78c'), + ('\U0000a78e', '\U0000a78e'), + ('\U0000a791', '\U0000a791'), + ('\U0000a793', '\U0000a793'), + ('\U0000a7a1', '\U0000a7a1'), + ('\U0000a7a3', '\U0000a7a3'), + ('\U0000a7a5', '\U0000a7a5'), + ('\U0000a7a7', '\U0000a7a7'), + ('\U0000a7a9', '\U0000a7a9'), + ('\U0000a7fa', '\U0000a7fa'), + ('\U0000fb00', 
'\U0000fb06'), + ('\U0000fb13', '\U0000fb17'), + ('\U0000ff41', '\U0000ff5a'), + ('\U00010428', '\U0001044f'), + ('\U0001d41a', '\U0001d433'), + ('\U0001d44e', '\U0001d454'), + ('\U0001d456', '\U0001d467'), + ('\U0001d482', '\U0001d49b'), + ('\U0001d4b6', '\U0001d4b9'), + ('\U0001d4bb', '\U0001d4bb'), + ('\U0001d4bd', '\U0001d4c3'), + ('\U0001d4c5', '\U0001d4cf'), + ('\U0001d4ea', '\U0001d503'), + ('\U0001d51e', '\U0001d537'), + ('\U0001d552', '\U0001d56b'), + ('\U0001d586', '\U0001d59f'), + ('\U0001d5ba', '\U0001d5d3'), + ('\U0001d5ee', '\U0001d607'), + ('\U0001d622', '\U0001d63b'), + ('\U0001d656', '\U0001d66f'), + ('\U0001d68a', '\U0001d6a5'), + ('\U0001d6c2', '\U0001d6da'), + ('\U0001d6dc', '\U0001d6e1'), + ('\U0001d6fc', '\U0001d714'), + ('\U0001d716', '\U0001d71b'), + ('\U0001d736', '\U0001d74e'), + ('\U0001d750', '\U0001d755'), + ('\U0001d770', '\U0001d788'), + ('\U0001d78a', '\U0001d78f'), + ('\U0001d7aa', '\U0001d7c2'), + ('\U0001d7c4', '\U0001d7c9'), + ('\U0001d7cb', '\U0001d7cb') + ]), +("Lm", &[ + ('\U000002b0', '\U000002c1'), + ('\U000002c6', '\U000002d1'), + ('\U000002e0', '\U000002e4'), + ('\U000002ec', '\U000002ec'), + ('\U000002ee', '\U000002ee'), + ('\U00000374', '\U00000374'), + ('\U0000037a', '\U0000037a'), + ('\U00000559', '\U00000559'), + ('\U00000640', '\U00000640'), + ('\U000006e5', '\U000006e6'), + ('\U000007f4', '\U000007f5'), + ('\U000007fa', '\U000007fa'), + ('\U0000081a', '\U0000081a'), + ('\U00000824', '\U00000824'), + ('\U00000828', '\U00000828'), + ('\U00000971', '\U00000971'), + ('\U00000e46', '\U00000e46'), + ('\U00000ec6', '\U00000ec6'), + ('\U000010fc', '\U000010fc'), + ('\U000017d7', '\U000017d7'), + ('\U00001843', '\U00001843'), + ('\U00001aa7', '\U00001aa7'), + ('\U00001c78', '\U00001c7d'), + ('\U00001d2c', '\U00001d6a'), + ('\U00001d78', '\U00001d78'), + ('\U00001d9b', '\U00001dbf'), + ('\U00002071', '\U00002071'), + ('\U0000207f', '\U0000207f'), + ('\U00002090', '\U0000209c'), + ('\U00002c7c', '\U00002c7d'), + ('\U00002d6f', 
'\U00002d6f'), + ('\U00002e2f', '\U00002e2f'), + ('\U00003005', '\U00003005'), + ('\U00003031', '\U00003035'), + ('\U0000303b', '\U0000303b'), + ('\U0000309d', '\U0000309e'), + ('\U000030fc', '\U000030fe'), + ('\U0000a015', '\U0000a015'), + ('\U0000a4f8', '\U0000a4fd'), + ('\U0000a60c', '\U0000a60c'), + ('\U0000a67f', '\U0000a67f'), + ('\U0000a717', '\U0000a71f'), + ('\U0000a770', '\U0000a770'), + ('\U0000a788', '\U0000a788'), + ('\U0000a7f8', '\U0000a7f9'), + ('\U0000a9cf', '\U0000a9cf'), + ('\U0000aa70', '\U0000aa70'), + ('\U0000aadd', '\U0000aadd'), + ('\U0000aaf3', '\U0000aaf4'), + ('\U0000ff70', '\U0000ff70'), + ('\U0000ff9e', '\U0000ff9f'), + ('\U00016f93', '\U00016f9f') + ]), +("Lo", &[ + ('\U000000aa', '\U000000aa'), + ('\U000000ba', '\U000000ba'), + ('\U000001bb', '\U000001bb'), + ('\U000001c0', '\U000001c3'), + ('\U00000294', '\U00000294'), + ('\U000005d0', '\U000005ea'), + ('\U000005f0', '\U000005f2'), + ('\U00000620', '\U0000063f'), + ('\U00000641', '\U0000064a'), + ('\U0000066e', '\U0000066f'), + ('\U00000671', '\U000006d3'), + ('\U000006d5', '\U000006d5'), + ('\U000006ee', '\U000006ef'), + ('\U000006fa', '\U000006fc'), + ('\U000006ff', '\U000006ff'), + ('\U00000710', '\U00000710'), + ('\U00000712', '\U0000072f'), + ('\U0000074d', '\U000007a5'), + ('\U000007b1', '\U000007b1'), + ('\U000007ca', '\U000007ea'), + ('\U00000800', '\U00000815'), + ('\U00000840', '\U00000858'), + ('\U000008a0', '\U000008a0'), + ('\U000008a2', '\U000008ac'), + ('\U00000904', '\U00000939'), + ('\U0000093d', '\U0000093d'), + ('\U00000950', '\U00000950'), + ('\U00000958', '\U00000961'), + ('\U00000972', '\U00000977'), + ('\U00000979', '\U0000097f'), + ('\U00000985', '\U0000098c'), + ('\U0000098f', '\U00000990'), + ('\U00000993', '\U000009a8'), + ('\U000009aa', '\U000009b0'), + ('\U000009b2', '\U000009b2'), + ('\U000009b6', '\U000009b9'), + ('\U000009bd', '\U000009bd'), + ('\U000009ce', '\U000009ce'), + ('\U000009dc', '\U000009dd'), + ('\U000009df', '\U000009e1'), + ('\U000009f0', 
'\U000009f1'), + ('\U00000a05', '\U00000a0a'), + ('\U00000a0f', '\U00000a10'), + ('\U00000a13', '\U00000a28'), + ('\U00000a2a', '\U00000a30'), + ('\U00000a32', '\U00000a33'), + ('\U00000a35', '\U00000a36'), + ('\U00000a38', '\U00000a39'), + ('\U00000a59', '\U00000a5c'), + ('\U00000a5e', '\U00000a5e'), + ('\U00000a72', '\U00000a74'), + ('\U00000a85', '\U00000a8d'), + ('\U00000a8f', '\U00000a91'), + ('\U00000a93', '\U00000aa8'), + ('\U00000aaa', '\U00000ab0'), + ('\U00000ab2', '\U00000ab3'), + ('\U00000ab5', '\U00000ab9'), + ('\U00000abd', '\U00000abd'), + ('\U00000ad0', '\U00000ad0'), + ('\U00000ae0', '\U00000ae1'), + ('\U00000b05', '\U00000b0c'), + ('\U00000b0f', '\U00000b10'), + ('\U00000b13', '\U00000b28'), + ('\U00000b2a', '\U00000b30'), + ('\U00000b32', '\U00000b33'), + ('\U00000b35', '\U00000b39'), + ('\U00000b3d', '\U00000b3d'), + ('\U00000b5c', '\U00000b5d'), + ('\U00000b5f', '\U00000b61'), + ('\U00000b71', '\U00000b71'), + ('\U00000b83', '\U00000b83'), + ('\U00000b85', '\U00000b8a'), + ('\U00000b8e', '\U00000b90'), + ('\U00000b92', '\U00000b95'), + ('\U00000b99', '\U00000b9a'), + ('\U00000b9c', '\U00000b9c'), + ('\U00000b9e', '\U00000b9f'), + ('\U00000ba3', '\U00000ba4'), + ('\U00000ba8', '\U00000baa'), + ('\U00000bae', '\U00000bb9'), + ('\U00000bd0', '\U00000bd0'), + ('\U00000c05', '\U00000c0c'), + ('\U00000c0e', '\U00000c10'), + ('\U00000c12', '\U00000c28'), + ('\U00000c2a', '\U00000c33'), + ('\U00000c35', '\U00000c39'), + ('\U00000c3d', '\U00000c3d'), + ('\U00000c58', '\U00000c59'), + ('\U00000c60', '\U00000c61'), + ('\U00000c85', '\U00000c8c'), + ('\U00000c8e', '\U00000c90'), + ('\U00000c92', '\U00000ca8'), + ('\U00000caa', '\U00000cb3'), + ('\U00000cb5', '\U00000cb9'), + ('\U00000cbd', '\U00000cbd'), + ('\U00000cde', '\U00000cde'), + ('\U00000ce0', '\U00000ce1'), + ('\U00000cf1', '\U00000cf2'), + ('\U00000d05', '\U00000d0c'), + ('\U00000d0e', '\U00000d10'), + ('\U00000d12', '\U00000d3a'), + ('\U00000d3d', '\U00000d3d'), + ('\U00000d4e', '\U00000d4e'), 
+ ('\U00000d60', '\U00000d61'), + ('\U00000d7a', '\U00000d7f'), + ('\U00000d85', '\U00000d96'), + ('\U00000d9a', '\U00000db1'), + ('\U00000db3', '\U00000dbb'), + ('\U00000dbd', '\U00000dbd'), + ('\U00000dc0', '\U00000dc6'), + ('\U00000e01', '\U00000e30'), + ('\U00000e32', '\U00000e33'), + ('\U00000e40', '\U00000e45'), + ('\U00000e81', '\U00000e82'), + ('\U00000e84', '\U00000e84'), + ('\U00000e87', '\U00000e88'), + ('\U00000e8a', '\U00000e8a'), + ('\U00000e8d', '\U00000e8d'), + ('\U00000e94', '\U00000e97'), + ('\U00000e99', '\U00000e9f'), + ('\U00000ea1', '\U00000ea3'), + ('\U00000ea5', '\U00000ea5'), + ('\U00000ea7', '\U00000ea7'), + ('\U00000eaa', '\U00000eab'), + ('\U00000ead', '\U00000eb0'), + ('\U00000eb2', '\U00000eb3'), + ('\U00000ebd', '\U00000ebd'), + ('\U00000ec0', '\U00000ec4'), + ('\U00000edc', '\U00000edf'), + ('\U00000f00', '\U00000f00'), + ('\U00000f40', '\U00000f47'), + ('\U00000f49', '\U00000f6c'), + ('\U00000f88', '\U00000f8c'), + ('\U00001000', '\U0000102a'), + ('\U0000103f', '\U0000103f'), + ('\U00001050', '\U00001055'), + ('\U0000105a', '\U0000105d'), + ('\U00001061', '\U00001061'), + ('\U00001065', '\U00001066'), + ('\U0000106e', '\U00001070'), + ('\U00001075', '\U00001081'), + ('\U0000108e', '\U0000108e'), + ('\U000010d0', '\U000010fa'), + ('\U000010fd', '\U00001248'), + ('\U0000124a', '\U0000124d'), + ('\U00001250', '\U00001256'), + ('\U00001258', '\U00001258'), + ('\U0000125a', '\U0000125d'), + ('\U00001260', '\U00001288'), + ('\U0000128a', '\U0000128d'), + ('\U00001290', '\U000012b0'), + ('\U000012b2', '\U000012b5'), + ('\U000012b8', '\U000012be'), + ('\U000012c0', '\U000012c0'), + ('\U000012c2', '\U000012c5'), + ('\U000012c8', '\U000012d6'), + ('\U000012d8', '\U00001310'), + ('\U00001312', '\U00001315'), + ('\U00001318', '\U0000135a'), + ('\U00001380', '\U0000138f'), + ('\U000013a0', '\U000013f4'), + ('\U00001401', '\U0000166c'), + ('\U0000166f', '\U0000167f'), + ('\U00001681', '\U0000169a'), + ('\U000016a0', '\U000016ea'), + 
('\U00001700', '\U0000170c'), + ('\U0000170e', '\U00001711'), + ('\U00001720', '\U00001731'), + ('\U00001740', '\U00001751'), + ('\U00001760', '\U0000176c'), + ('\U0000176e', '\U00001770'), + ('\U00001780', '\U000017b3'), + ('\U000017dc', '\U000017dc'), + ('\U00001820', '\U00001842'), + ('\U00001844', '\U00001877'), + ('\U00001880', '\U000018a8'), + ('\U000018aa', '\U000018aa'), + ('\U000018b0', '\U000018f5'), + ('\U00001900', '\U0000191c'), + ('\U00001950', '\U0000196d'), + ('\U00001970', '\U00001974'), + ('\U00001980', '\U000019ab'), + ('\U000019c1', '\U000019c7'), + ('\U00001a00', '\U00001a16'), + ('\U00001a20', '\U00001a54'), + ('\U00001b05', '\U00001b33'), + ('\U00001b45', '\U00001b4b'), + ('\U00001b83', '\U00001ba0'), + ('\U00001bae', '\U00001baf'), + ('\U00001bba', '\U00001be5'), + ('\U00001c00', '\U00001c23'), + ('\U00001c4d', '\U00001c4f'), + ('\U00001c5a', '\U00001c77'), + ('\U00001ce9', '\U00001cec'), + ('\U00001cee', '\U00001cf1'), + ('\U00001cf5', '\U00001cf6'), + ('\U00002135', '\U00002138'), + ('\U00002d30', '\U00002d67'), + ('\U00002d80', '\U00002d96'), + ('\U00002da0', '\U00002da6'), + ('\U00002da8', '\U00002dae'), + ('\U00002db0', '\U00002db6'), + ('\U00002db8', '\U00002dbe'), + ('\U00002dc0', '\U00002dc6'), + ('\U00002dc8', '\U00002dce'), + ('\U00002dd0', '\U00002dd6'), + ('\U00002dd8', '\U00002dde'), + ('\U00003006', '\U00003006'), + ('\U0000303c', '\U0000303c'), + ('\U00003041', '\U00003096'), + ('\U0000309f', '\U0000309f'), + ('\U000030a1', '\U000030fa'), + ('\U000030ff', '\U000030ff'), + ('\U00003105', '\U0000312d'), + ('\U00003131', '\U0000318e'), + ('\U000031a0', '\U000031ba'), + ('\U000031f0', '\U000031ff'), + ('\U00003400', '\U00003400'), + ('\U00004db5', '\U00004db5'), + ('\U00004e00', '\U00004e00'), + ('\U00009fcc', '\U00009fcc'), + ('\U0000a000', '\U0000a014'), + ('\U0000a016', '\U0000a48c'), + ('\U0000a4d0', '\U0000a4f7'), + ('\U0000a500', '\U0000a60b'), + ('\U0000a610', '\U0000a61f'), + ('\U0000a62a', '\U0000a62b'), + ('\U0000a66e', 
'\U0000a66e'), + ('\U0000a6a0', '\U0000a6e5'), + ('\U0000a7fb', '\U0000a801'), + ('\U0000a803', '\U0000a805'), + ('\U0000a807', '\U0000a80a'), + ('\U0000a80c', '\U0000a822'), + ('\U0000a840', '\U0000a873'), + ('\U0000a882', '\U0000a8b3'), + ('\U0000a8f2', '\U0000a8f7'), + ('\U0000a8fb', '\U0000a8fb'), + ('\U0000a90a', '\U0000a925'), + ('\U0000a930', '\U0000a946'), + ('\U0000a960', '\U0000a97c'), + ('\U0000a984', '\U0000a9b2'), + ('\U0000aa00', '\U0000aa28'), + ('\U0000aa40', '\U0000aa42'), + ('\U0000aa44', '\U0000aa4b'), + ('\U0000aa60', '\U0000aa6f'), + ('\U0000aa71', '\U0000aa76'), + ('\U0000aa7a', '\U0000aa7a'), + ('\U0000aa80', '\U0000aaaf'), + ('\U0000aab1', '\U0000aab1'), + ('\U0000aab5', '\U0000aab6'), + ('\U0000aab9', '\U0000aabd'), + ('\U0000aac0', '\U0000aac0'), + ('\U0000aac2', '\U0000aac2'), + ('\U0000aadb', '\U0000aadc'), + ('\U0000aae0', '\U0000aaea'), + ('\U0000aaf2', '\U0000aaf2'), + ('\U0000ab01', '\U0000ab06'), + ('\U0000ab09', '\U0000ab0e'), + ('\U0000ab11', '\U0000ab16'), + ('\U0000ab20', '\U0000ab26'), + ('\U0000ab28', '\U0000ab2e'), + ('\U0000abc0', '\U0000abe2'), + ('\U0000ac00', '\U0000ac00'), + ('\U0000d7a3', '\U0000d7a3'), + ('\U0000d7b0', '\U0000d7c6'), + ('\U0000d7cb', '\U0000d7fb'), + ('\U0000f900', '\U0000fa6d'), + ('\U0000fa70', '\U0000fad9'), + ('\U0000fb1d', '\U0000fb1d'), + ('\U0000fb1f', '\U0000fb28'), + ('\U0000fb2a', '\U0000fb36'), + ('\U0000fb38', '\U0000fb3c'), + ('\U0000fb3e', '\U0000fb3e'), + ('\U0000fb40', '\U0000fb41'), + ('\U0000fb43', '\U0000fb44'), + ('\U0000fb46', '\U0000fbb1'), + ('\U0000fbd3', '\U0000fd3d'), + ('\U0000fd50', '\U0000fd8f'), + ('\U0000fd92', '\U0000fdc7'), + ('\U0000fdf0', '\U0000fdfb'), + ('\U0000fe70', '\U0000fe74'), + ('\U0000fe76', '\U0000fefc'), + ('\U0000ff66', '\U0000ff6f'), + ('\U0000ff71', '\U0000ff9d'), + ('\U0000ffa0', '\U0000ffbe'), + ('\U0000ffc2', '\U0000ffc7'), + ('\U0000ffca', '\U0000ffcf'), + ('\U0000ffd2', '\U0000ffd7'), + ('\U0000ffda', '\U0000ffdc'), + ('\U00010000', '\U0001000b'), 
+ ('\U0001000d', '\U00010026'), + ('\U00010028', '\U0001003a'), + ('\U0001003c', '\U0001003d'), + ('\U0001003f', '\U0001004d'), + ('\U00010050', '\U0001005d'), + ('\U00010080', '\U000100fa'), + ('\U00010280', '\U0001029c'), + ('\U000102a0', '\U000102d0'), + ('\U00010300', '\U0001031e'), + ('\U00010330', '\U00010340'), + ('\U00010342', '\U00010349'), + ('\U00010380', '\U0001039d'), + ('\U000103a0', '\U000103c3'), + ('\U000103c8', '\U000103cf'), + ('\U00010450', '\U0001049d'), + ('\U00010800', '\U00010805'), + ('\U00010808', '\U00010808'), + ('\U0001080a', '\U00010835'), + ('\U00010837', '\U00010838'), + ('\U0001083c', '\U0001083c'), + ('\U0001083f', '\U00010855'), + ('\U00010900', '\U00010915'), + ('\U00010920', '\U00010939'), + ('\U00010980', '\U000109b7'), + ('\U000109be', '\U000109bf'), + ('\U00010a00', '\U00010a00'), + ('\U00010a10', '\U00010a13'), + ('\U00010a15', '\U00010a17'), + ('\U00010a19', '\U00010a33'), + ('\U00010a60', '\U00010a7c'), + ('\U00010b00', '\U00010b35'), + ('\U00010b40', '\U00010b55'), + ('\U00010b60', '\U00010b72'), + ('\U00010c00', '\U00010c48'), + ('\U00011003', '\U00011037'), + ('\U00011083', '\U000110af'), + ('\U000110d0', '\U000110e8'), + ('\U00011103', '\U00011126'), + ('\U00011183', '\U000111b2'), + ('\U000111c1', '\U000111c4'), + ('\U00011680', '\U000116aa'), + ('\U00012000', '\U0001236e'), + ('\U00013000', '\U0001342e'), + ('\U00016800', '\U00016a38'), + ('\U00016f00', '\U00016f44'), + ('\U00016f50', '\U00016f50'), + ('\U0001b000', '\U0001b001'), + ('\U0001ee00', '\U0001ee03'), + ('\U0001ee05', '\U0001ee1f'), + ('\U0001ee21', '\U0001ee22'), + ('\U0001ee24', '\U0001ee24'), + ('\U0001ee27', '\U0001ee27'), + ('\U0001ee29', '\U0001ee32'), + ('\U0001ee34', '\U0001ee37'), + ('\U0001ee39', '\U0001ee39'), + ('\U0001ee3b', '\U0001ee3b'), + ('\U0001ee42', '\U0001ee42'), + ('\U0001ee47', '\U0001ee47'), + ('\U0001ee49', '\U0001ee49'), + ('\U0001ee4b', '\U0001ee4b'), + ('\U0001ee4d', '\U0001ee4f'), + ('\U0001ee51', '\U0001ee52'), + 
('\U0001ee54', '\U0001ee54'), + ('\U0001ee57', '\U0001ee57'), + ('\U0001ee59', '\U0001ee59'), + ('\U0001ee5b', '\U0001ee5b'), + ('\U0001ee5d', '\U0001ee5d'), + ('\U0001ee5f', '\U0001ee5f'), + ('\U0001ee61', '\U0001ee62'), + ('\U0001ee64', '\U0001ee64'), + ('\U0001ee67', '\U0001ee6a'), + ('\U0001ee6c', '\U0001ee72'), + ('\U0001ee74', '\U0001ee77'), + ('\U0001ee79', '\U0001ee7c'), + ('\U0001ee7e', '\U0001ee7e'), + ('\U0001ee80', '\U0001ee89'), + ('\U0001ee8b', '\U0001ee9b'), + ('\U0001eea1', '\U0001eea3'), + ('\U0001eea5', '\U0001eea9'), + ('\U0001eeab', '\U0001eebb'), + ('\U00020000', '\U00020000'), + ('\U0002a6d6', '\U0002a6d6'), + ('\U0002a700', '\U0002a700'), + ('\U0002b734', '\U0002b734'), + ('\U0002b740', '\U0002b740'), + ('\U0002b81d', '\U0002b81d'), + ('\U0002f800', '\U0002fa1d') + ]), +("Lt", &[ + ('\U000001c5', '\U000001c5'), + ('\U000001c8', '\U000001c8'), + ('\U000001cb', '\U000001cb'), + ('\U000001f2', '\U000001f2'), + ('\U00001f88', '\U00001f8f'), + ('\U00001f98', '\U00001f9f'), + ('\U00001fa8', '\U00001faf'), + ('\U00001fbc', '\U00001fbc'), + ('\U00001fcc', '\U00001fcc'), + ('\U00001ffc', '\U00001ffc') + ]), +("Lu", &[ + ('\U00000041', '\U0000005a'), + ('\U000000c0', '\U000000d6'), + ('\U000000d8', '\U000000de'), + ('\U00000100', '\U00000100'), + ('\U00000102', '\U00000102'), + ('\U00000104', '\U00000104'), + ('\U00000106', '\U00000106'), + ('\U00000108', '\U00000108'), + ('\U0000010a', '\U0000010a'), + ('\U0000010c', '\U0000010c'), + ('\U0000010e', '\U0000010e'), + ('\U00000110', '\U00000110'), + ('\U00000112', '\U00000112'), + ('\U00000114', '\U00000114'), + ('\U00000116', '\U00000116'), + ('\U00000118', '\U00000118'), + ('\U0000011a', '\U0000011a'), + ('\U0000011c', '\U0000011c'), + ('\U0000011e', '\U0000011e'), + ('\U00000120', '\U00000120'), + ('\U00000122', '\U00000122'), + ('\U00000124', '\U00000124'), + ('\U00000126', '\U00000126'), + ('\U00000128', '\U00000128'), + ('\U0000012a', '\U0000012a'), + ('\U0000012c', '\U0000012c'), + ('\U0000012e', 
'\U0000012e'), + ('\U00000130', '\U00000130'), + ('\U00000132', '\U00000132'), + ('\U00000134', '\U00000134'), + ('\U00000136', '\U00000136'), + ('\U00000139', '\U00000139'), + ('\U0000013b', '\U0000013b'), + ('\U0000013d', '\U0000013d'), + ('\U0000013f', '\U0000013f'), + ('\U00000141', '\U00000141'), + ('\U00000143', '\U00000143'), + ('\U00000145', '\U00000145'), + ('\U00000147', '\U00000147'), + ('\U0000014a', '\U0000014a'), + ('\U0000014c', '\U0000014c'), + ('\U0000014e', '\U0000014e'), + ('\U00000150', '\U00000150'), + ('\U00000152', '\U00000152'), + ('\U00000154', '\U00000154'), + ('\U00000156', '\U00000156'), + ('\U00000158', '\U00000158'), + ('\U0000015a', '\U0000015a'), + ('\U0000015c', '\U0000015c'), + ('\U0000015e', '\U0000015e'), + ('\U00000160', '\U00000160'), + ('\U00000162', '\U00000162'), + ('\U00000164', '\U00000164'), + ('\U00000166', '\U00000166'), + ('\U00000168', '\U00000168'), + ('\U0000016a', '\U0000016a'), + ('\U0000016c', '\U0000016c'), + ('\U0000016e', '\U0000016e'), + ('\U00000170', '\U00000170'), + ('\U00000172', '\U00000172'), + ('\U00000174', '\U00000174'), + ('\U00000176', '\U00000176'), + ('\U00000178', '\U00000179'), + ('\U0000017b', '\U0000017b'), + ('\U0000017d', '\U0000017d'), + ('\U00000181', '\U00000182'), + ('\U00000184', '\U00000184'), + ('\U00000186', '\U00000187'), + ('\U00000189', '\U0000018b'), + ('\U0000018e', '\U00000191'), + ('\U00000193', '\U00000194'), + ('\U00000196', '\U00000198'), + ('\U0000019c', '\U0000019d'), + ('\U0000019f', '\U000001a0'), + ('\U000001a2', '\U000001a2'), + ('\U000001a4', '\U000001a4'), + ('\U000001a6', '\U000001a7'), + ('\U000001a9', '\U000001a9'), + ('\U000001ac', '\U000001ac'), + ('\U000001ae', '\U000001af'), + ('\U000001b1', '\U000001b3'), + ('\U000001b5', '\U000001b5'), + ('\U000001b7', '\U000001b8'), + ('\U000001bc', '\U000001bc'), + ('\U000001c4', '\U000001c4'), + ('\U000001c7', '\U000001c7'), + ('\U000001ca', '\U000001ca'), + ('\U000001cd', '\U000001cd'), + ('\U000001cf', '\U000001cf'), 
+ ('\U000001d1', '\U000001d1'), + ('\U000001d3', '\U000001d3'), + ('\U000001d5', '\U000001d5'), + ('\U000001d7', '\U000001d7'), + ('\U000001d9', '\U000001d9'), + ('\U000001db', '\U000001db'), + ('\U000001de', '\U000001de'), + ('\U000001e0', '\U000001e0'), + ('\U000001e2', '\U000001e2'), + ('\U000001e4', '\U000001e4'), + ('\U000001e6', '\U000001e6'), + ('\U000001e8', '\U000001e8'), + ('\U000001ea', '\U000001ea'), + ('\U000001ec', '\U000001ec'), + ('\U000001ee', '\U000001ee'), + ('\U000001f1', '\U000001f1'), + ('\U000001f4', '\U000001f4'), + ('\U000001f6', '\U000001f8'), + ('\U000001fa', '\U000001fa'), + ('\U000001fc', '\U000001fc'), + ('\U000001fe', '\U000001fe'), + ('\U00000200', '\U00000200'), + ('\U00000202', '\U00000202'), + ('\U00000204', '\U00000204'), + ('\U00000206', '\U00000206'), + ('\U00000208', '\U00000208'), + ('\U0000020a', '\U0000020a'), + ('\U0000020c', '\U0000020c'), + ('\U0000020e', '\U0000020e'), + ('\U00000210', '\U00000210'), + ('\U00000212', '\U00000212'), + ('\U00000214', '\U00000214'), + ('\U00000216', '\U00000216'), + ('\U00000218', '\U00000218'), + ('\U0000021a', '\U0000021a'), + ('\U0000021c', '\U0000021c'), + ('\U0000021e', '\U0000021e'), + ('\U00000220', '\U00000220'), + ('\U00000222', '\U00000222'), + ('\U00000224', '\U00000224'), + ('\U00000226', '\U00000226'), + ('\U00000228', '\U00000228'), + ('\U0000022a', '\U0000022a'), + ('\U0000022c', '\U0000022c'), + ('\U0000022e', '\U0000022e'), + ('\U00000230', '\U00000230'), + ('\U00000232', '\U00000232'), + ('\U0000023a', '\U0000023b'), + ('\U0000023d', '\U0000023e'), + ('\U00000241', '\U00000241'), + ('\U00000243', '\U00000246'), + ('\U00000248', '\U00000248'), + ('\U0000024a', '\U0000024a'), + ('\U0000024c', '\U0000024c'), + ('\U0000024e', '\U0000024e'), + ('\U00000370', '\U00000370'), + ('\U00000372', '\U00000372'), + ('\U00000376', '\U00000376'), + ('\U00000386', '\U00000386'), + ('\U00000388', '\U0000038a'), + ('\U0000038c', '\U0000038c'), + ('\U0000038e', '\U0000038f'), + 
('\U00000391', '\U000003a1'), + ('\U000003a3', '\U000003ab'), + ('\U000003cf', '\U000003cf'), + ('\U000003d2', '\U000003d4'), + ('\U000003d8', '\U000003d8'), + ('\U000003da', '\U000003da'), + ('\U000003dc', '\U000003dc'), + ('\U000003de', '\U000003de'), + ('\U000003e0', '\U000003e0'), + ('\U000003e2', '\U000003e2'), + ('\U000003e4', '\U000003e4'), + ('\U000003e6', '\U000003e6'), + ('\U000003e8', '\U000003e8'), + ('\U000003ea', '\U000003ea'), + ('\U000003ec', '\U000003ec'), + ('\U000003ee', '\U000003ee'), + ('\U000003f4', '\U000003f4'), + ('\U000003f7', '\U000003f7'), + ('\U000003f9', '\U000003fa'), + ('\U000003fd', '\U0000042f'), + ('\U00000460', '\U00000460'), + ('\U00000462', '\U00000462'), + ('\U00000464', '\U00000464'), + ('\U00000466', '\U00000466'), + ('\U00000468', '\U00000468'), + ('\U0000046a', '\U0000046a'), + ('\U0000046c', '\U0000046c'), + ('\U0000046e', '\U0000046e'), + ('\U00000470', '\U00000470'), + ('\U00000472', '\U00000472'), + ('\U00000474', '\U00000474'), + ('\U00000476', '\U00000476'), + ('\U00000478', '\U00000478'), + ('\U0000047a', '\U0000047a'), + ('\U0000047c', '\U0000047c'), + ('\U0000047e', '\U0000047e'), + ('\U00000480', '\U00000480'), + ('\U0000048a', '\U0000048a'), + ('\U0000048c', '\U0000048c'), + ('\U0000048e', '\U0000048e'), + ('\U00000490', '\U00000490'), + ('\U00000492', '\U00000492'), + ('\U00000494', '\U00000494'), + ('\U00000496', '\U00000496'), + ('\U00000498', '\U00000498'), + ('\U0000049a', '\U0000049a'), + ('\U0000049c', '\U0000049c'), + ('\U0000049e', '\U0000049e'), + ('\U000004a0', '\U000004a0'), + ('\U000004a2', '\U000004a2'), + ('\U000004a4', '\U000004a4'), + ('\U000004a6', '\U000004a6'), + ('\U000004a8', '\U000004a8'), + ('\U000004aa', '\U000004aa'), + ('\U000004ac', '\U000004ac'), + ('\U000004ae', '\U000004ae'), + ('\U000004b0', '\U000004b0'), + ('\U000004b2', '\U000004b2'), + ('\U000004b4', '\U000004b4'), + ('\U000004b6', '\U000004b6'), + ('\U000004b8', '\U000004b8'), + ('\U000004ba', '\U000004ba'), + ('\U000004bc', 
'\U000004bc'), + ('\U000004be', '\U000004be'), + ('\U000004c0', '\U000004c1'), + ('\U000004c3', '\U000004c3'), + ('\U000004c5', '\U000004c5'), + ('\U000004c7', '\U000004c7'), + ('\U000004c9', '\U000004c9'), + ('\U000004cb', '\U000004cb'), + ('\U000004cd', '\U000004cd'), + ('\U000004d0', '\U000004d0'), + ('\U000004d2', '\U000004d2'), + ('\U000004d4', '\U000004d4'), + ('\U000004d6', '\U000004d6'), + ('\U000004d8', '\U000004d8'), + ('\U000004da', '\U000004da'), + ('\U000004dc', '\U000004dc'), + ('\U000004de', '\U000004de'), + ('\U000004e0', '\U000004e0'), + ('\U000004e2', '\U000004e2'), + ('\U000004e4', '\U000004e4'), + ('\U000004e6', '\U000004e6'), + ('\U000004e8', '\U000004e8'), + ('\U000004ea', '\U000004ea'), + ('\U000004ec', '\U000004ec'), + ('\U000004ee', '\U000004ee'), + ('\U000004f0', '\U000004f0'), + ('\U000004f2', '\U000004f2'), + ('\U000004f4', '\U000004f4'), + ('\U000004f6', '\U000004f6'), + ('\U000004f8', '\U000004f8'), + ('\U000004fa', '\U000004fa'), + ('\U000004fc', '\U000004fc'), + ('\U000004fe', '\U000004fe'), + ('\U00000500', '\U00000500'), + ('\U00000502', '\U00000502'), + ('\U00000504', '\U00000504'), + ('\U00000506', '\U00000506'), + ('\U00000508', '\U00000508'), + ('\U0000050a', '\U0000050a'), + ('\U0000050c', '\U0000050c'), + ('\U0000050e', '\U0000050e'), + ('\U00000510', '\U00000510'), + ('\U00000512', '\U00000512'), + ('\U00000514', '\U00000514'), + ('\U00000516', '\U00000516'), + ('\U00000518', '\U00000518'), + ('\U0000051a', '\U0000051a'), + ('\U0000051c', '\U0000051c'), + ('\U0000051e', '\U0000051e'), + ('\U00000520', '\U00000520'), + ('\U00000522', '\U00000522'), + ('\U00000524', '\U00000524'), + ('\U00000526', '\U00000526'), + ('\U00000531', '\U00000556'), + ('\U000010a0', '\U000010c5'), + ('\U000010c7', '\U000010c7'), + ('\U000010cd', '\U000010cd'), + ('\U00001e00', '\U00001e00'), + ('\U00001e02', '\U00001e02'), + ('\U00001e04', '\U00001e04'), + ('\U00001e06', '\U00001e06'), + ('\U00001e08', '\U00001e08'), + ('\U00001e0a', '\U00001e0a'), 
+ ('\U00001e0c', '\U00001e0c'), + ('\U00001e0e', '\U00001e0e'), + ('\U00001e10', '\U00001e10'), + ('\U00001e12', '\U00001e12'), + ('\U00001e14', '\U00001e14'), + ('\U00001e16', '\U00001e16'), + ('\U00001e18', '\U00001e18'), + ('\U00001e1a', '\U00001e1a'), + ('\U00001e1c', '\U00001e1c'), + ('\U00001e1e', '\U00001e1e'), + ('\U00001e20', '\U00001e20'), + ('\U00001e22', '\U00001e22'), + ('\U00001e24', '\U00001e24'), + ('\U00001e26', '\U00001e26'), + ('\U00001e28', '\U00001e28'), + ('\U00001e2a', '\U00001e2a'), + ('\U00001e2c', '\U00001e2c'), + ('\U00001e2e', '\U00001e2e'), + ('\U00001e30', '\U00001e30'), + ('\U00001e32', '\U00001e32'), + ('\U00001e34', '\U00001e34'), + ('\U00001e36', '\U00001e36'), + ('\U00001e38', '\U00001e38'), + ('\U00001e3a', '\U00001e3a'), + ('\U00001e3c', '\U00001e3c'), + ('\U00001e3e', '\U00001e3e'), + ('\U00001e40', '\U00001e40'), + ('\U00001e42', '\U00001e42'), + ('\U00001e44', '\U00001e44'), + ('\U00001e46', '\U00001e46'), + ('\U00001e48', '\U00001e48'), + ('\U00001e4a', '\U00001e4a'), + ('\U00001e4c', '\U00001e4c'), + ('\U00001e4e', '\U00001e4e'), + ('\U00001e50', '\U00001e50'), + ('\U00001e52', '\U00001e52'), + ('\U00001e54', '\U00001e54'), + ('\U00001e56', '\U00001e56'), + ('\U00001e58', '\U00001e58'), + ('\U00001e5a', '\U00001e5a'), + ('\U00001e5c', '\U00001e5c'), + ('\U00001e5e', '\U00001e5e'), + ('\U00001e60', '\U00001e60'), + ('\U00001e62', '\U00001e62'), + ('\U00001e64', '\U00001e64'), + ('\U00001e66', '\U00001e66'), + ('\U00001e68', '\U00001e68'), + ('\U00001e6a', '\U00001e6a'), + ('\U00001e6c', '\U00001e6c'), + ('\U00001e6e', '\U00001e6e'), + ('\U00001e70', '\U00001e70'), + ('\U00001e72', '\U00001e72'), + ('\U00001e74', '\U00001e74'), + ('\U00001e76', '\U00001e76'), + ('\U00001e78', '\U00001e78'), + ('\U00001e7a', '\U00001e7a'), + ('\U00001e7c', '\U00001e7c'), + ('\U00001e7e', '\U00001e7e'), + ('\U00001e80', '\U00001e80'), + ('\U00001e82', '\U00001e82'), + ('\U00001e84', '\U00001e84'), + ('\U00001e86', '\U00001e86'), + 
('\U00001e88', '\U00001e88'), + ('\U00001e8a', '\U00001e8a'), + ('\U00001e8c', '\U00001e8c'), + ('\U00001e8e', '\U00001e8e'), + ('\U00001e90', '\U00001e90'), + ('\U00001e92', '\U00001e92'), + ('\U00001e94', '\U00001e94'), + ('\U00001e9e', '\U00001e9e'), + ('\U00001ea0', '\U00001ea0'), + ('\U00001ea2', '\U00001ea2'), + ('\U00001ea4', '\U00001ea4'), + ('\U00001ea6', '\U00001ea6'), + ('\U00001ea8', '\U00001ea8'), + ('\U00001eaa', '\U00001eaa'), + ('\U00001eac', '\U00001eac'), + ('\U00001eae', '\U00001eae'), + ('\U00001eb0', '\U00001eb0'), + ('\U00001eb2', '\U00001eb2'), + ('\U00001eb4', '\U00001eb4'), + ('\U00001eb6', '\U00001eb6'), + ('\U00001eb8', '\U00001eb8'), + ('\U00001eba', '\U00001eba'), + ('\U00001ebc', '\U00001ebc'), + ('\U00001ebe', '\U00001ebe'), + ('\U00001ec0', '\U00001ec0'), + ('\U00001ec2', '\U00001ec2'), + ('\U00001ec4', '\U00001ec4'), + ('\U00001ec6', '\U00001ec6'), + ('\U00001ec8', '\U00001ec8'), + ('\U00001eca', '\U00001eca'), + ('\U00001ecc', '\U00001ecc'), + ('\U00001ece', '\U00001ece'), + ('\U00001ed0', '\U00001ed0'), + ('\U00001ed2', '\U00001ed2'), + ('\U00001ed4', '\U00001ed4'), + ('\U00001ed6', '\U00001ed6'), + ('\U00001ed8', '\U00001ed8'), + ('\U00001eda', '\U00001eda'), + ('\U00001edc', '\U00001edc'), + ('\U00001ede', '\U00001ede'), + ('\U00001ee0', '\U00001ee0'), + ('\U00001ee2', '\U00001ee2'), + ('\U00001ee4', '\U00001ee4'), + ('\U00001ee6', '\U00001ee6'), + ('\U00001ee8', '\U00001ee8'), + ('\U00001eea', '\U00001eea'), + ('\U00001eec', '\U00001eec'), + ('\U00001eee', '\U00001eee'), + ('\U00001ef0', '\U00001ef0'), + ('\U00001ef2', '\U00001ef2'), + ('\U00001ef4', '\U00001ef4'), + ('\U00001ef6', '\U00001ef6'), + ('\U00001ef8', '\U00001ef8'), + ('\U00001efa', '\U00001efa'), + ('\U00001efc', '\U00001efc'), + ('\U00001efe', '\U00001efe'), + ('\U00001f08', '\U00001f0f'), + ('\U00001f18', '\U00001f1d'), + ('\U00001f28', '\U00001f2f'), + ('\U00001f38', '\U00001f3f'), + ('\U00001f48', '\U00001f4d'), + ('\U00001f59', '\U00001f59'), + ('\U00001f5b', 
'\U00001f5b'), + ('\U00001f5d', '\U00001f5d'), + ('\U00001f5f', '\U00001f5f'), + ('\U00001f68', '\U00001f6f'), + ('\U00001fb8', '\U00001fbb'), + ('\U00001fc8', '\U00001fcb'), + ('\U00001fd8', '\U00001fdb'), + ('\U00001fe8', '\U00001fec'), + ('\U00001ff8', '\U00001ffb'), + ('\U00002102', '\U00002102'), + ('\U00002107', '\U00002107'), + ('\U0000210b', '\U0000210d'), + ('\U00002110', '\U00002112'), + ('\U00002115', '\U00002115'), + ('\U00002119', '\U0000211d'), + ('\U00002124', '\U00002124'), + ('\U00002126', '\U00002126'), + ('\U00002128', '\U00002128'), + ('\U0000212a', '\U0000212d'), + ('\U00002130', '\U00002133'), + ('\U0000213e', '\U0000213f'), + ('\U00002145', '\U00002145'), + ('\U00002183', '\U00002183'), + ('\U00002c00', '\U00002c2e'), + ('\U00002c60', '\U00002c60'), + ('\U00002c62', '\U00002c64'), + ('\U00002c67', '\U00002c67'), + ('\U00002c69', '\U00002c69'), + ('\U00002c6b', '\U00002c6b'), + ('\U00002c6d', '\U00002c70'), + ('\U00002c72', '\U00002c72'), + ('\U00002c75', '\U00002c75'), + ('\U00002c7e', '\U00002c80'), + ('\U00002c82', '\U00002c82'), + ('\U00002c84', '\U00002c84'), + ('\U00002c86', '\U00002c86'), + ('\U00002c88', '\U00002c88'), + ('\U00002c8a', '\U00002c8a'), + ('\U00002c8c', '\U00002c8c'), + ('\U00002c8e', '\U00002c8e'), + ('\U00002c90', '\U00002c90'), + ('\U00002c92', '\U00002c92'), + ('\U00002c94', '\U00002c94'), + ('\U00002c96', '\U00002c96'), + ('\U00002c98', '\U00002c98'), + ('\U00002c9a', '\U00002c9a'), + ('\U00002c9c', '\U00002c9c'), + ('\U00002c9e', '\U00002c9e'), + ('\U00002ca0', '\U00002ca0'), + ('\U00002ca2', '\U00002ca2'), + ('\U00002ca4', '\U00002ca4'), + ('\U00002ca6', '\U00002ca6'), + ('\U00002ca8', '\U00002ca8'), + ('\U00002caa', '\U00002caa'), + ('\U00002cac', '\U00002cac'), + ('\U00002cae', '\U00002cae'), + ('\U00002cb0', '\U00002cb0'), + ('\U00002cb2', '\U00002cb2'), + ('\U00002cb4', '\U00002cb4'), + ('\U00002cb6', '\U00002cb6'), + ('\U00002cb8', '\U00002cb8'), + ('\U00002cba', '\U00002cba'), + ('\U00002cbc', '\U00002cbc'), 
+ ('\U00002cbe', '\U00002cbe'), + ('\U00002cc0', '\U00002cc0'), + ('\U00002cc2', '\U00002cc2'), + ('\U00002cc4', '\U00002cc4'), + ('\U00002cc6', '\U00002cc6'), + ('\U00002cc8', '\U00002cc8'), + ('\U00002cca', '\U00002cca'), + ('\U00002ccc', '\U00002ccc'), + ('\U00002cce', '\U00002cce'), + ('\U00002cd0', '\U00002cd0'), + ('\U00002cd2', '\U00002cd2'), + ('\U00002cd4', '\U00002cd4'), + ('\U00002cd6', '\U00002cd6'), + ('\U00002cd8', '\U00002cd8'), + ('\U00002cda', '\U00002cda'), + ('\U00002cdc', '\U00002cdc'), + ('\U00002cde', '\U00002cde'), + ('\U00002ce0', '\U00002ce0'), + ('\U00002ce2', '\U00002ce2'), + ('\U00002ceb', '\U00002ceb'), + ('\U00002ced', '\U00002ced'), + ('\U00002cf2', '\U00002cf2'), + ('\U0000a640', '\U0000a640'), + ('\U0000a642', '\U0000a642'), + ('\U0000a644', '\U0000a644'), + ('\U0000a646', '\U0000a646'), + ('\U0000a648', '\U0000a648'), + ('\U0000a64a', '\U0000a64a'), + ('\U0000a64c', '\U0000a64c'), + ('\U0000a64e', '\U0000a64e'), + ('\U0000a650', '\U0000a650'), + ('\U0000a652', '\U0000a652'), + ('\U0000a654', '\U0000a654'), + ('\U0000a656', '\U0000a656'), + ('\U0000a658', '\U0000a658'), + ('\U0000a65a', '\U0000a65a'), + ('\U0000a65c', '\U0000a65c'), + ('\U0000a65e', '\U0000a65e'), + ('\U0000a660', '\U0000a660'), + ('\U0000a662', '\U0000a662'), + ('\U0000a664', '\U0000a664'), + ('\U0000a666', '\U0000a666'), + ('\U0000a668', '\U0000a668'), + ('\U0000a66a', '\U0000a66a'), + ('\U0000a66c', '\U0000a66c'), + ('\U0000a680', '\U0000a680'), + ('\U0000a682', '\U0000a682'), + ('\U0000a684', '\U0000a684'), + ('\U0000a686', '\U0000a686'), + ('\U0000a688', '\U0000a688'), + ('\U0000a68a', '\U0000a68a'), + ('\U0000a68c', '\U0000a68c'), + ('\U0000a68e', '\U0000a68e'), + ('\U0000a690', '\U0000a690'), + ('\U0000a692', '\U0000a692'), + ('\U0000a694', '\U0000a694'), + ('\U0000a696', '\U0000a696'), + ('\U0000a722', '\U0000a722'), + ('\U0000a724', '\U0000a724'), + ('\U0000a726', '\U0000a726'), + ('\U0000a728', '\U0000a728'), + ('\U0000a72a', '\U0000a72a'), + 
('\U0000a72c', '\U0000a72c'), + ('\U0000a72e', '\U0000a72e'), + ('\U0000a732', '\U0000a732'), + ('\U0000a734', '\U0000a734'), + ('\U0000a736', '\U0000a736'), + ('\U0000a738', '\U0000a738'), + ('\U0000a73a', '\U0000a73a'), + ('\U0000a73c', '\U0000a73c'), + ('\U0000a73e', '\U0000a73e'), + ('\U0000a740', '\U0000a740'), + ('\U0000a742', '\U0000a742'), + ('\U0000a744', '\U0000a744'), + ('\U0000a746', '\U0000a746'), + ('\U0000a748', '\U0000a748'), + ('\U0000a74a', '\U0000a74a'), + ('\U0000a74c', '\U0000a74c'), + ('\U0000a74e', '\U0000a74e'), + ('\U0000a750', '\U0000a750'), + ('\U0000a752', '\U0000a752'), + ('\U0000a754', '\U0000a754'), + ('\U0000a756', '\U0000a756'), + ('\U0000a758', '\U0000a758'), + ('\U0000a75a', '\U0000a75a'), + ('\U0000a75c', '\U0000a75c'), + ('\U0000a75e', '\U0000a75e'), + ('\U0000a760', '\U0000a760'), + ('\U0000a762', '\U0000a762'), + ('\U0000a764', '\U0000a764'), + ('\U0000a766', '\U0000a766'), + ('\U0000a768', '\U0000a768'), + ('\U0000a76a', '\U0000a76a'), + ('\U0000a76c', '\U0000a76c'), + ('\U0000a76e', '\U0000a76e'), + ('\U0000a779', '\U0000a779'), + ('\U0000a77b', '\U0000a77b'), + ('\U0000a77d', '\U0000a77e'), + ('\U0000a780', '\U0000a780'), + ('\U0000a782', '\U0000a782'), + ('\U0000a784', '\U0000a784'), + ('\U0000a786', '\U0000a786'), + ('\U0000a78b', '\U0000a78b'), + ('\U0000a78d', '\U0000a78d'), + ('\U0000a790', '\U0000a790'), + ('\U0000a792', '\U0000a792'), + ('\U0000a7a0', '\U0000a7a0'), + ('\U0000a7a2', '\U0000a7a2'), + ('\U0000a7a4', '\U0000a7a4'), + ('\U0000a7a6', '\U0000a7a6'), + ('\U0000a7a8', '\U0000a7a8'), + ('\U0000a7aa', '\U0000a7aa'), + ('\U0000ff21', '\U0000ff3a'), + ('\U00010400', '\U00010427'), + ('\U0001d400', '\U0001d419'), + ('\U0001d434', '\U0001d44d'), + ('\U0001d468', '\U0001d481'), + ('\U0001d49c', '\U0001d49c'), + ('\U0001d49e', '\U0001d49f'), + ('\U0001d4a2', '\U0001d4a2'), + ('\U0001d4a5', '\U0001d4a6'), + ('\U0001d4a9', '\U0001d4ac'), + ('\U0001d4ae', '\U0001d4b5'), + ('\U0001d4d0', '\U0001d4e9'), + ('\U0001d504', 
'\U0001d505'), + ('\U0001d507', '\U0001d50a'), + ('\U0001d50d', '\U0001d514'), + ('\U0001d516', '\U0001d51c'), + ('\U0001d538', '\U0001d539'), + ('\U0001d53b', '\U0001d53e'), + ('\U0001d540', '\U0001d544'), + ('\U0001d546', '\U0001d546'), + ('\U0001d54a', '\U0001d550'), + ('\U0001d56c', '\U0001d585'), + ('\U0001d5a0', '\U0001d5b9'), + ('\U0001d5d4', '\U0001d5ed'), + ('\U0001d608', '\U0001d621'), + ('\U0001d63c', '\U0001d655'), + ('\U0001d670', '\U0001d689'), + ('\U0001d6a8', '\U0001d6c0'), + ('\U0001d6e2', '\U0001d6fa'), + ('\U0001d71c', '\U0001d734'), + ('\U0001d756', '\U0001d76e'), + ('\U0001d790', '\U0001d7a8'), + ('\U0001d7ca', '\U0001d7ca') + ]), +("Lycian", &[ + ('\U00010280', '\U0001029c') + ]), +("Lydian", &[ + ('\U00010920', '\U00010939'), + ('\U0001093f', '\U0001093f') + ]), +("M", &[ + ('\U00000300', '\U0000036f'), + ('\U00000483', '\U00000489'), + ('\U00000591', '\U000005bd'), + ('\U000005bf', '\U000005bf'), + ('\U000005c1', '\U000005c2'), + ('\U000005c4', '\U000005c5'), + ('\U000005c7', '\U000005c7'), + ('\U00000610', '\U0000061a'), + ('\U0000064b', '\U0000065f'), + ('\U00000670', '\U00000670'), + ('\U000006d6', '\U000006dc'), + ('\U000006df', '\U000006e4'), + ('\U000006e7', '\U000006e8'), + ('\U000006ea', '\U000006ed'), + ('\U00000711', '\U00000711'), + ('\U00000730', '\U0000074a'), + ('\U000007a6', '\U000007b0'), + ('\U000007eb', '\U000007f3'), + ('\U00000816', '\U00000819'), + ('\U0000081b', '\U00000823'), + ('\U00000825', '\U00000827'), + ('\U00000829', '\U0000082d'), + ('\U00000859', '\U0000085b'), + ('\U000008e4', '\U000008fe'), + ('\U00000900', '\U00000903'), + ('\U0000093a', '\U0000093c'), + ('\U0000093e', '\U0000094f'), + ('\U00000951', '\U00000957'), + ('\U00000962', '\U00000963'), + ('\U00000981', '\U00000983'), + ('\U000009bc', '\U000009bc'), + ('\U000009be', '\U000009c4'), + ('\U000009c7', '\U000009c8'), + ('\U000009cb', '\U000009cd'), + ('\U000009d7', '\U000009d7'), + ('\U000009e2', '\U000009e3'), + ('\U00000a01', '\U00000a03'), + 
('\U00000a3c', '\U00000a3c'), + ('\U00000a3e', '\U00000a42'), + ('\U00000a47', '\U00000a48'), + ('\U00000a4b', '\U00000a4d'), + ('\U00000a51', '\U00000a51'), + ('\U00000a70', '\U00000a71'), + ('\U00000a75', '\U00000a75'), + ('\U00000a81', '\U00000a83'), + ('\U00000abc', '\U00000abc'), + ('\U00000abe', '\U00000ac5'), + ('\U00000ac7', '\U00000ac9'), + ('\U00000acb', '\U00000acd'), + ('\U00000ae2', '\U00000ae3'), + ('\U00000b01', '\U00000b03'), + ('\U00000b3c', '\U00000b3c'), + ('\U00000b3e', '\U00000b44'), + ('\U00000b47', '\U00000b48'), + ('\U00000b4b', '\U00000b4d'), + ('\U00000b56', '\U00000b57'), + ('\U00000b62', '\U00000b63'), + ('\U00000b82', '\U00000b82'), + ('\U00000bbe', '\U00000bc2'), + ('\U00000bc6', '\U00000bc8'), + ('\U00000bca', '\U00000bcd'), + ('\U00000bd7', '\U00000bd7'), + ('\U00000c01', '\U00000c03'), + ('\U00000c3e', '\U00000c44'), + ('\U00000c46', '\U00000c48'), + ('\U00000c4a', '\U00000c4d'), + ('\U00000c55', '\U00000c56'), + ('\U00000c62', '\U00000c63'), + ('\U00000c82', '\U00000c83'), + ('\U00000cbc', '\U00000cbc'), + ('\U00000cbe', '\U00000cc4'), + ('\U00000cc6', '\U00000cc8'), + ('\U00000cca', '\U00000ccd'), + ('\U00000cd5', '\U00000cd6'), + ('\U00000ce2', '\U00000ce3'), + ('\U00000d02', '\U00000d03'), + ('\U00000d3e', '\U00000d44'), + ('\U00000d46', '\U00000d48'), + ('\U00000d4a', '\U00000d4d'), + ('\U00000d57', '\U00000d57'), + ('\U00000d62', '\U00000d63'), + ('\U00000d82', '\U00000d83'), + ('\U00000dca', '\U00000dca'), + ('\U00000dcf', '\U00000dd4'), + ('\U00000dd6', '\U00000dd6'), + ('\U00000dd8', '\U00000ddf'), + ('\U00000df2', '\U00000df3'), + ('\U00000e31', '\U00000e31'), + ('\U00000e34', '\U00000e3a'), + ('\U00000e47', '\U00000e4e'), + ('\U00000eb1', '\U00000eb1'), + ('\U00000eb4', '\U00000eb9'), + ('\U00000ebb', '\U00000ebc'), + ('\U00000ec8', '\U00000ecd'), + ('\U00000f18', '\U00000f19'), + ('\U00000f35', '\U00000f35'), + ('\U00000f37', '\U00000f37'), + ('\U00000f39', '\U00000f39'), + ('\U00000f3e', '\U00000f3f'), + ('\U00000f71', 
'\U00000f84'), + ('\U00000f86', '\U00000f87'), + ('\U00000f8d', '\U00000f97'), + ('\U00000f99', '\U00000fbc'), + ('\U00000fc6', '\U00000fc6'), + ('\U0000102b', '\U0000103e'), + ('\U00001056', '\U00001059'), + ('\U0000105e', '\U00001060'), + ('\U00001062', '\U00001064'), + ('\U00001067', '\U0000106d'), + ('\U00001071', '\U00001074'), + ('\U00001082', '\U0000108d'), + ('\U0000108f', '\U0000108f'), + ('\U0000109a', '\U0000109d'), + ('\U0000135d', '\U0000135f'), + ('\U00001712', '\U00001714'), + ('\U00001732', '\U00001734'), + ('\U00001752', '\U00001753'), + ('\U00001772', '\U00001773'), + ('\U000017b4', '\U000017d3'), + ('\U000017dd', '\U000017dd'), + ('\U0000180b', '\U0000180d'), + ('\U000018a9', '\U000018a9'), + ('\U00001920', '\U0000192b'), + ('\U00001930', '\U0000193b'), + ('\U000019b0', '\U000019c0'), + ('\U000019c8', '\U000019c9'), + ('\U00001a17', '\U00001a1b'), + ('\U00001a55', '\U00001a5e'), + ('\U00001a60', '\U00001a7c'), + ('\U00001a7f', '\U00001a7f'), + ('\U00001b00', '\U00001b04'), + ('\U00001b34', '\U00001b44'), + ('\U00001b6b', '\U00001b73'), + ('\U00001b80', '\U00001b82'), + ('\U00001ba1', '\U00001bad'), + ('\U00001be6', '\U00001bf3'), + ('\U00001c24', '\U00001c37'), + ('\U00001cd0', '\U00001cd2'), + ('\U00001cd4', '\U00001ce8'), + ('\U00001ced', '\U00001ced'), + ('\U00001cf2', '\U00001cf4'), + ('\U00001dc0', '\U00001de6'), + ('\U00001dfc', '\U00001dff'), + ('\U000020d0', '\U000020f0'), + ('\U00002cef', '\U00002cf1'), + ('\U00002d7f', '\U00002d7f'), + ('\U00002de0', '\U00002dff'), + ('\U0000302a', '\U0000302f'), + ('\U00003099', '\U0000309a'), + ('\U0000a66f', '\U0000a672'), + ('\U0000a674', '\U0000a67d'), + ('\U0000a69f', '\U0000a69f'), + ('\U0000a6f0', '\U0000a6f1'), + ('\U0000a802', '\U0000a802'), + ('\U0000a806', '\U0000a806'), + ('\U0000a80b', '\U0000a80b'), + ('\U0000a823', '\U0000a827'), + ('\U0000a880', '\U0000a881'), + ('\U0000a8b4', '\U0000a8c4'), + ('\U0000a8e0', '\U0000a8f1'), + ('\U0000a926', '\U0000a92d'), + ('\U0000a947', '\U0000a953'), 
+ ('\U0000a980', '\U0000a983'), + ('\U0000a9b3', '\U0000a9c0'), + ('\U0000aa29', '\U0000aa36'), + ('\U0000aa43', '\U0000aa43'), + ('\U0000aa4c', '\U0000aa4d'), + ('\U0000aa7b', '\U0000aa7b'), + ('\U0000aab0', '\U0000aab0'), + ('\U0000aab2', '\U0000aab4'), + ('\U0000aab7', '\U0000aab8'), + ('\U0000aabe', '\U0000aabf'), + ('\U0000aac1', '\U0000aac1'), + ('\U0000aaeb', '\U0000aaef'), + ('\U0000aaf5', '\U0000aaf6'), + ('\U0000abe3', '\U0000abea'), + ('\U0000abec', '\U0000abed'), + ('\U0000fb1e', '\U0000fb1e'), + ('\U0000fe00', '\U0000fe0f'), + ('\U0000fe20', '\U0000fe26'), + ('\U000101fd', '\U000101fd'), + ('\U00010a01', '\U00010a03'), + ('\U00010a05', '\U00010a06'), + ('\U00010a0c', '\U00010a0f'), + ('\U00010a38', '\U00010a3a'), + ('\U00010a3f', '\U00010a3f'), + ('\U00011000', '\U00011002'), + ('\U00011038', '\U00011046'), + ('\U00011080', '\U00011082'), + ('\U000110b0', '\U000110ba'), + ('\U00011100', '\U00011102'), + ('\U00011127', '\U00011134'), + ('\U00011180', '\U00011182'), + ('\U000111b3', '\U000111c0'), + ('\U000116ab', '\U000116b7'), + ('\U00016f51', '\U00016f7e'), + ('\U00016f8f', '\U00016f92'), + ('\U0001d165', '\U0001d169'), + ('\U0001d16d', '\U0001d172'), + ('\U0001d17b', '\U0001d182'), + ('\U0001d185', '\U0001d18b'), + ('\U0001d1aa', '\U0001d1ad'), + ('\U0001d242', '\U0001d244'), + ('\U000e0100', '\U000e01ef') + ]), +("Malayalam", &[ + ('\U00000d02', '\U00000d03'), + ('\U00000d05', '\U00000d0c'), + ('\U00000d0e', '\U00000d10'), + ('\U00000d12', '\U00000d3a'), + ('\U00000d3d', '\U00000d44'), + ('\U00000d46', '\U00000d48'), + ('\U00000d4a', '\U00000d4e'), + ('\U00000d57', '\U00000d57'), + ('\U00000d60', '\U00000d63'), + ('\U00000d66', '\U00000d75'), + ('\U00000d79', '\U00000d7f') + ]), +("Mandaic", &[ + ('\U00000840', '\U0000085b'), + ('\U0000085e', '\U0000085e') + ]), +("Mc", &[ + ('\U00000903', '\U00000903'), + ('\U0000093b', '\U0000093b'), + ('\U0000093e', '\U00000940'), + ('\U00000949', '\U0000094c'), + ('\U0000094e', '\U0000094f'), + ('\U00000982', 
'\U00000983'), + ('\U000009be', '\U000009c0'), + ('\U000009c7', '\U000009c8'), + ('\U000009cb', '\U000009cc'), + ('\U000009d7', '\U000009d7'), + ('\U00000a03', '\U00000a03'), + ('\U00000a3e', '\U00000a40'), + ('\U00000a83', '\U00000a83'), + ('\U00000abe', '\U00000ac0'), + ('\U00000ac9', '\U00000ac9'), + ('\U00000acb', '\U00000acc'), + ('\U00000b02', '\U00000b03'), + ('\U00000b3e', '\U00000b3e'), + ('\U00000b40', '\U00000b40'), + ('\U00000b47', '\U00000b48'), + ('\U00000b4b', '\U00000b4c'), + ('\U00000b57', '\U00000b57'), + ('\U00000bbe', '\U00000bbf'), + ('\U00000bc1', '\U00000bc2'), + ('\U00000bc6', '\U00000bc8'), + ('\U00000bca', '\U00000bcc'), + ('\U00000bd7', '\U00000bd7'), + ('\U00000c01', '\U00000c03'), + ('\U00000c41', '\U00000c44'), + ('\U00000c82', '\U00000c83'), + ('\U00000cbe', '\U00000cbe'), + ('\U00000cc0', '\U00000cc4'), + ('\U00000cc7', '\U00000cc8'), + ('\U00000cca', '\U00000ccb'), + ('\U00000cd5', '\U00000cd6'), + ('\U00000d02', '\U00000d03'), + ('\U00000d3e', '\U00000d40'), + ('\U00000d46', '\U00000d48'), + ('\U00000d4a', '\U00000d4c'), + ('\U00000d57', '\U00000d57'), + ('\U00000d82', '\U00000d83'), + ('\U00000dcf', '\U00000dd1'), + ('\U00000dd8', '\U00000ddf'), + ('\U00000df2', '\U00000df3'), + ('\U00000f3e', '\U00000f3f'), + ('\U00000f7f', '\U00000f7f'), + ('\U0000102b', '\U0000102c'), + ('\U00001031', '\U00001031'), + ('\U00001038', '\U00001038'), + ('\U0000103b', '\U0000103c'), + ('\U00001056', '\U00001057'), + ('\U00001062', '\U00001064'), + ('\U00001067', '\U0000106d'), + ('\U00001083', '\U00001084'), + ('\U00001087', '\U0000108c'), + ('\U0000108f', '\U0000108f'), + ('\U0000109a', '\U0000109c'), + ('\U000017b6', '\U000017b6'), + ('\U000017be', '\U000017c5'), + ('\U000017c7', '\U000017c8'), + ('\U00001923', '\U00001926'), + ('\U00001929', '\U0000192b'), + ('\U00001930', '\U00001931'), + ('\U00001933', '\U00001938'), + ('\U000019b0', '\U000019c0'), + ('\U000019c8', '\U000019c9'), + ('\U00001a19', '\U00001a1a'), + ('\U00001a55', '\U00001a55'), 
+ ('\U00001a57', '\U00001a57'), + ('\U00001a61', '\U00001a61'), + ('\U00001a63', '\U00001a64'), + ('\U00001a6d', '\U00001a72'), + ('\U00001b04', '\U00001b04'), + ('\U00001b35', '\U00001b35'), + ('\U00001b3b', '\U00001b3b'), + ('\U00001b3d', '\U00001b41'), + ('\U00001b43', '\U00001b44'), + ('\U00001b82', '\U00001b82'), + ('\U00001ba1', '\U00001ba1'), + ('\U00001ba6', '\U00001ba7'), + ('\U00001baa', '\U00001baa'), + ('\U00001bac', '\U00001bad'), + ('\U00001be7', '\U00001be7'), + ('\U00001bea', '\U00001bec'), + ('\U00001bee', '\U00001bee'), + ('\U00001bf2', '\U00001bf3'), + ('\U00001c24', '\U00001c2b'), + ('\U00001c34', '\U00001c35'), + ('\U00001ce1', '\U00001ce1'), + ('\U00001cf2', '\U00001cf3'), + ('\U0000302e', '\U0000302f'), + ('\U0000a823', '\U0000a824'), + ('\U0000a827', '\U0000a827'), + ('\U0000a880', '\U0000a881'), + ('\U0000a8b4', '\U0000a8c3'), + ('\U0000a952', '\U0000a953'), + ('\U0000a983', '\U0000a983'), + ('\U0000a9b4', '\U0000a9b5'), + ('\U0000a9ba', '\U0000a9bb'), + ('\U0000a9bd', '\U0000a9c0'), + ('\U0000aa2f', '\U0000aa30'), + ('\U0000aa33', '\U0000aa34'), + ('\U0000aa4d', '\U0000aa4d'), + ('\U0000aa7b', '\U0000aa7b'), + ('\U0000aaeb', '\U0000aaeb'), + ('\U0000aaee', '\U0000aaef'), + ('\U0000aaf5', '\U0000aaf5'), + ('\U0000abe3', '\U0000abe4'), + ('\U0000abe6', '\U0000abe7'), + ('\U0000abe9', '\U0000abea'), + ('\U0000abec', '\U0000abec'), + ('\U00011000', '\U00011000'), + ('\U00011002', '\U00011002'), + ('\U00011082', '\U00011082'), + ('\U000110b0', '\U000110b2'), + ('\U000110b7', '\U000110b8'), + ('\U0001112c', '\U0001112c'), + ('\U00011182', '\U00011182'), + ('\U000111b3', '\U000111b5'), + ('\U000111bf', '\U000111c0'), + ('\U000116ac', '\U000116ac'), + ('\U000116ae', '\U000116af'), + ('\U000116b6', '\U000116b6'), + ('\U00016f51', '\U00016f7e'), + ('\U0001d165', '\U0001d166'), + ('\U0001d16d', '\U0001d172') + ]), +("Me", &[ + ('\U00000488', '\U00000489'), + ('\U000020dd', '\U000020e0'), + ('\U000020e2', '\U000020e4'), + ('\U0000a670', '\U0000a672') 
+ ]), +("Meetei_Mayek", &[ + ('\U0000aae0', '\U0000aaf6'), + ('\U0000abc0', '\U0000abed'), + ('\U0000abf0', '\U0000abf9') + ]), +("Meroitic_Cursive", &[ + ('\U000109a0', '\U000109b7'), + ('\U000109be', '\U000109bf') + ]), +("Meroitic_Hieroglyphs", &[ + ('\U00010980', '\U0001099f') + ]), +("Miao", &[ + ('\U00016f00', '\U00016f44'), + ('\U00016f50', '\U00016f7e'), + ('\U00016f8f', '\U00016f9f') + ]), +("Mn", &[ + ('\U00000300', '\U0000036f'), + ('\U00000483', '\U00000487'), + ('\U00000591', '\U000005bd'), + ('\U000005bf', '\U000005bf'), + ('\U000005c1', '\U000005c2'), + ('\U000005c4', '\U000005c5'), + ('\U000005c7', '\U000005c7'), + ('\U00000610', '\U0000061a'), + ('\U0000064b', '\U0000065f'), + ('\U00000670', '\U00000670'), + ('\U000006d6', '\U000006dc'), + ('\U000006df', '\U000006e4'), + ('\U000006e7', '\U000006e8'), + ('\U000006ea', '\U000006ed'), + ('\U00000711', '\U00000711'), + ('\U00000730', '\U0000074a'), + ('\U000007a6', '\U000007b0'), + ('\U000007eb', '\U000007f3'), + ('\U00000816', '\U00000819'), + ('\U0000081b', '\U00000823'), + ('\U00000825', '\U00000827'), + ('\U00000829', '\U0000082d'), + ('\U00000859', '\U0000085b'), + ('\U000008e4', '\U000008fe'), + ('\U00000900', '\U00000902'), + ('\U0000093a', '\U0000093a'), + ('\U0000093c', '\U0000093c'), + ('\U00000941', '\U00000948'), + ('\U0000094d', '\U0000094d'), + ('\U00000951', '\U00000957'), + ('\U00000962', '\U00000963'), + ('\U00000981', '\U00000981'), + ('\U000009bc', '\U000009bc'), + ('\U000009c1', '\U000009c4'), + ('\U000009cd', '\U000009cd'), + ('\U000009e2', '\U000009e3'), + ('\U00000a01', '\U00000a02'), + ('\U00000a3c', '\U00000a3c'), + ('\U00000a41', '\U00000a42'), + ('\U00000a47', '\U00000a48'), + ('\U00000a4b', '\U00000a4d'), + ('\U00000a51', '\U00000a51'), + ('\U00000a70', '\U00000a71'), + ('\U00000a75', '\U00000a75'), + ('\U00000a81', '\U00000a82'), + ('\U00000abc', '\U00000abc'), + ('\U00000ac1', '\U00000ac5'), + ('\U00000ac7', '\U00000ac8'), + ('\U00000acd', '\U00000acd'), + ('\U00000ae2', 
'\U00000ae3'), + ('\U00000b01', '\U00000b01'), + ('\U00000b3c', '\U00000b3c'), + ('\U00000b3f', '\U00000b3f'), + ('\U00000b41', '\U00000b44'), + ('\U00000b4d', '\U00000b4d'), + ('\U00000b56', '\U00000b56'), + ('\U00000b62', '\U00000b63'), + ('\U00000b82', '\U00000b82'), + ('\U00000bc0', '\U00000bc0'), + ('\U00000bcd', '\U00000bcd'), + ('\U00000c3e', '\U00000c40'), + ('\U00000c46', '\U00000c48'), + ('\U00000c4a', '\U00000c4d'), + ('\U00000c55', '\U00000c56'), + ('\U00000c62', '\U00000c63'), + ('\U00000cbc', '\U00000cbc'), + ('\U00000cbf', '\U00000cbf'), + ('\U00000cc6', '\U00000cc6'), + ('\U00000ccc', '\U00000ccd'), + ('\U00000ce2', '\U00000ce3'), + ('\U00000d41', '\U00000d44'), + ('\U00000d4d', '\U00000d4d'), + ('\U00000d62', '\U00000d63'), + ('\U00000dca', '\U00000dca'), + ('\U00000dd2', '\U00000dd4'), + ('\U00000dd6', '\U00000dd6'), + ('\U00000e31', '\U00000e31'), + ('\U00000e34', '\U00000e3a'), + ('\U00000e47', '\U00000e4e'), + ('\U00000eb1', '\U00000eb1'), + ('\U00000eb4', '\U00000eb9'), + ('\U00000ebb', '\U00000ebc'), + ('\U00000ec8', '\U00000ecd'), + ('\U00000f18', '\U00000f19'), + ('\U00000f35', '\U00000f35'), + ('\U00000f37', '\U00000f37'), + ('\U00000f39', '\U00000f39'), + ('\U00000f71', '\U00000f7e'), + ('\U00000f80', '\U00000f84'), + ('\U00000f86', '\U00000f87'), + ('\U00000f8d', '\U00000f97'), + ('\U00000f99', '\U00000fbc'), + ('\U00000fc6', '\U00000fc6'), + ('\U0000102d', '\U00001030'), + ('\U00001032', '\U00001037'), + ('\U00001039', '\U0000103a'), + ('\U0000103d', '\U0000103e'), + ('\U00001058', '\U00001059'), + ('\U0000105e', '\U00001060'), + ('\U00001071', '\U00001074'), + ('\U00001082', '\U00001082'), + ('\U00001085', '\U00001086'), + ('\U0000108d', '\U0000108d'), + ('\U0000109d', '\U0000109d'), + ('\U0000135d', '\U0000135f'), + ('\U00001712', '\U00001714'), + ('\U00001732', '\U00001734'), + ('\U00001752', '\U00001753'), + ('\U00001772', '\U00001773'), + ('\U000017b4', '\U000017b5'), + ('\U000017b7', '\U000017bd'), + ('\U000017c6', '\U000017c6'), 
+ ('\U000017c9', '\U000017d3'), + ('\U000017dd', '\U000017dd'), + ('\U0000180b', '\U0000180d'), + ('\U000018a9', '\U000018a9'), + ('\U00001920', '\U00001922'), + ('\U00001927', '\U00001928'), + ('\U00001932', '\U00001932'), + ('\U00001939', '\U0000193b'), + ('\U00001a17', '\U00001a18'), + ('\U00001a1b', '\U00001a1b'), + ('\U00001a56', '\U00001a56'), + ('\U00001a58', '\U00001a5e'), + ('\U00001a60', '\U00001a60'), + ('\U00001a62', '\U00001a62'), + ('\U00001a65', '\U00001a6c'), + ('\U00001a73', '\U00001a7c'), + ('\U00001a7f', '\U00001a7f'), + ('\U00001b00', '\U00001b03'), + ('\U00001b34', '\U00001b34'), + ('\U00001b36', '\U00001b3a'), + ('\U00001b3c', '\U00001b3c'), + ('\U00001b42', '\U00001b42'), + ('\U00001b6b', '\U00001b73'), + ('\U00001b80', '\U00001b81'), + ('\U00001ba2', '\U00001ba5'), + ('\U00001ba8', '\U00001ba9'), + ('\U00001bab', '\U00001bab'), + ('\U00001be6', '\U00001be6'), + ('\U00001be8', '\U00001be9'), + ('\U00001bed', '\U00001bed'), + ('\U00001bef', '\U00001bf1'), + ('\U00001c2c', '\U00001c33'), + ('\U00001c36', '\U00001c37'), + ('\U00001cd0', '\U00001cd2'), + ('\U00001cd4', '\U00001ce0'), + ('\U00001ce2', '\U00001ce8'), + ('\U00001ced', '\U00001ced'), + ('\U00001cf4', '\U00001cf4'), + ('\U00001dc0', '\U00001de6'), + ('\U00001dfc', '\U00001dff'), + ('\U000020d0', '\U000020dc'), + ('\U000020e1', '\U000020e1'), + ('\U000020e5', '\U000020f0'), + ('\U00002cef', '\U00002cf1'), + ('\U00002d7f', '\U00002d7f'), + ('\U00002de0', '\U00002dff'), + ('\U0000302a', '\U0000302d'), + ('\U00003099', '\U0000309a'), + ('\U0000a66f', '\U0000a66f'), + ('\U0000a674', '\U0000a67d'), + ('\U0000a69f', '\U0000a69f'), + ('\U0000a6f0', '\U0000a6f1'), + ('\U0000a802', '\U0000a802'), + ('\U0000a806', '\U0000a806'), + ('\U0000a80b', '\U0000a80b'), + ('\U0000a825', '\U0000a826'), + ('\U0000a8c4', '\U0000a8c4'), + ('\U0000a8e0', '\U0000a8f1'), + ('\U0000a926', '\U0000a92d'), + ('\U0000a947', '\U0000a951'), + ('\U0000a980', '\U0000a982'), + ('\U0000a9b3', '\U0000a9b3'), + 
('\U0000a9b6', '\U0000a9b9'), + ('\U0000a9bc', '\U0000a9bc'), + ('\U0000aa29', '\U0000aa2e'), + ('\U0000aa31', '\U0000aa32'), + ('\U0000aa35', '\U0000aa36'), + ('\U0000aa43', '\U0000aa43'), + ('\U0000aa4c', '\U0000aa4c'), + ('\U0000aab0', '\U0000aab0'), + ('\U0000aab2', '\U0000aab4'), + ('\U0000aab7', '\U0000aab8'), + ('\U0000aabe', '\U0000aabf'), + ('\U0000aac1', '\U0000aac1'), + ('\U0000aaec', '\U0000aaed'), + ('\U0000aaf6', '\U0000aaf6'), + ('\U0000abe5', '\U0000abe5'), + ('\U0000abe8', '\U0000abe8'), + ('\U0000abed', '\U0000abed'), + ('\U0000fb1e', '\U0000fb1e'), + ('\U0000fe00', '\U0000fe0f'), + ('\U0000fe20', '\U0000fe26'), + ('\U000101fd', '\U000101fd'), + ('\U00010a01', '\U00010a03'), + ('\U00010a05', '\U00010a06'), + ('\U00010a0c', '\U00010a0f'), + ('\U00010a38', '\U00010a3a'), + ('\U00010a3f', '\U00010a3f'), + ('\U00011001', '\U00011001'), + ('\U00011038', '\U00011046'), + ('\U00011080', '\U00011081'), + ('\U000110b3', '\U000110b6'), + ('\U000110b9', '\U000110ba'), + ('\U00011100', '\U00011102'), + ('\U00011127', '\U0001112b'), + ('\U0001112d', '\U00011134'), + ('\U00011180', '\U00011181'), + ('\U000111b6', '\U000111be'), + ('\U000116ab', '\U000116ab'), + ('\U000116ad', '\U000116ad'), + ('\U000116b0', '\U000116b5'), + ('\U000116b7', '\U000116b7'), + ('\U00016f8f', '\U00016f92'), + ('\U0001d167', '\U0001d169'), + ('\U0001d17b', '\U0001d182'), + ('\U0001d185', '\U0001d18b'), + ('\U0001d1aa', '\U0001d1ad'), + ('\U0001d242', '\U0001d244'), + ('\U000e0100', '\U000e01ef') + ]), +("Mongolian", &[ + ('\U00001800', '\U00001801'), + ('\U00001804', '\U00001804'), + ('\U00001806', '\U0000180e'), + ('\U00001810', '\U00001819'), + ('\U00001820', '\U00001877'), + ('\U00001880', '\U000018aa') + ]), +("Myanmar", &[ + ('\U00001000', '\U0000109f'), + ('\U0000aa60', '\U0000aa7b') + ]), +("N", &[ + ('\U00000030', '\U00000039'), + ('\U00000660', '\U00000669'), + ('\U000006f0', '\U000006f9'), + ('\U000007c0', '\U000007c9'), + ('\U00000966', '\U0000096f'), + ('\U000009e6', 
'\U000009ef'), + ('\U00000a66', '\U00000a6f'), + ('\U00000ae6', '\U00000aef'), + ('\U00000b66', '\U00000b6f'), + ('\U00000be6', '\U00000bef'), + ('\U00000c66', '\U00000c6f'), + ('\U00000ce6', '\U00000cef'), + ('\U00000d66', '\U00000d6f'), + ('\U00000e50', '\U00000e59'), + ('\U00000ed0', '\U00000ed9'), + ('\U00000f20', '\U00000f29'), + ('\U00001040', '\U00001049'), + ('\U00001090', '\U00001099'), + ('\U000016ee', '\U000016f0'), + ('\U000017e0', '\U000017e9'), + ('\U00001810', '\U00001819'), + ('\U00001946', '\U0000194f'), + ('\U000019d0', '\U000019d9'), + ('\U00001a80', '\U00001a89'), + ('\U00001a90', '\U00001a99'), + ('\U00001b50', '\U00001b59'), + ('\U00001bb0', '\U00001bb9'), + ('\U00001c40', '\U00001c49'), + ('\U00001c50', '\U00001c59'), + ('\U00002160', '\U00002182'), + ('\U00002185', '\U00002188'), + ('\U00003007', '\U00003007'), + ('\U00003021', '\U00003029'), + ('\U00003038', '\U0000303a'), + ('\U0000a620', '\U0000a629'), + ('\U0000a6e6', '\U0000a6ef'), + ('\U0000a8d0', '\U0000a8d9'), + ('\U0000a900', '\U0000a909'), + ('\U0000a9d0', '\U0000a9d9'), + ('\U0000aa50', '\U0000aa59'), + ('\U0000abf0', '\U0000abf9'), + ('\U0000ff10', '\U0000ff19'), + ('\U00010140', '\U00010174'), + ('\U00010341', '\U00010341'), + ('\U0001034a', '\U0001034a'), + ('\U000103d1', '\U000103d5'), + ('\U000104a0', '\U000104a9'), + ('\U00011066', '\U0001106f'), + ('\U000110f0', '\U000110f9'), + ('\U00011136', '\U0001113f'), + ('\U000111d0', '\U000111d9'), + ('\U000116c0', '\U000116c9'), + ('\U00012400', '\U00012462'), + ('\U0001d7ce', '\U0001d7ff') + ]), +("Nd", &[ + ('\U00000030', '\U00000039'), + ('\U00000660', '\U00000669'), + ('\U000006f0', '\U000006f9'), + ('\U000007c0', '\U000007c9'), + ('\U00000966', '\U0000096f'), + ('\U000009e6', '\U000009ef'), + ('\U00000a66', '\U00000a6f'), + ('\U00000ae6', '\U00000aef'), + ('\U00000b66', '\U00000b6f'), + ('\U00000be6', '\U00000bef'), + ('\U00000c66', '\U00000c6f'), + ('\U00000ce6', '\U00000cef'), + ('\U00000d66', '\U00000d6f'), + ('\U00000e50', 
'\U00000e59'), + ('\U00000ed0', '\U00000ed9'), + ('\U00000f20', '\U00000f29'), + ('\U00001040', '\U00001049'), + ('\U00001090', '\U00001099'), + ('\U000017e0', '\U000017e9'), + ('\U00001810', '\U00001819'), + ('\U00001946', '\U0000194f'), + ('\U000019d0', '\U000019d9'), + ('\U00001a80', '\U00001a89'), + ('\U00001a90', '\U00001a99'), + ('\U00001b50', '\U00001b59'), + ('\U00001bb0', '\U00001bb9'), + ('\U00001c40', '\U00001c49'), + ('\U00001c50', '\U00001c59'), + ('\U0000a620', '\U0000a629'), + ('\U0000a8d0', '\U0000a8d9'), + ('\U0000a900', '\U0000a909'), + ('\U0000a9d0', '\U0000a9d9'), + ('\U0000aa50', '\U0000aa59'), + ('\U0000abf0', '\U0000abf9'), + ('\U0000ff10', '\U0000ff19'), + ('\U000104a0', '\U000104a9'), + ('\U00011066', '\U0001106f'), + ('\U000110f0', '\U000110f9'), + ('\U00011136', '\U0001113f'), + ('\U000111d0', '\U000111d9'), + ('\U000116c0', '\U000116c9'), + ('\U0001d7ce', '\U0001d7ff') + ]), +("New_Tai_Lue", &[ + ('\U00001980', '\U000019ab'), + ('\U000019b0', '\U000019c9'), + ('\U000019d0', '\U000019da'), + ('\U000019de', '\U000019df') + ]), +("Nko", &[ + ('\U000007c0', '\U000007fa') + ]), +("Nl", &[ + ('\U000016ee', '\U000016f0'), + ('\U00002160', '\U00002182'), + ('\U00002185', '\U00002188'), + ('\U00003007', '\U00003007'), + ('\U00003021', '\U00003029'), + ('\U00003038', '\U0000303a'), + ('\U0000a6e6', '\U0000a6ef'), + ('\U00010140', '\U00010174'), + ('\U00010341', '\U00010341'), + ('\U0001034a', '\U0001034a'), + ('\U000103d1', '\U000103d5'), + ('\U00012400', '\U00012462') + ]), +("No", &[ + ('\U000000b2', '\U000000b3'), + ('\U000000b9', '\U000000b9'), + ('\U000000bc', '\U000000be'), + ('\U000009f4', '\U000009f9'), + ('\U00000b72', '\U00000b77'), + ('\U00000bf0', '\U00000bf2'), + ('\U00000c78', '\U00000c7e'), + ('\U00000d70', '\U00000d75'), + ('\U00000f2a', '\U00000f33'), + ('\U00001369', '\U0000137c'), + ('\U000017f0', '\U000017f9'), + ('\U000019da', '\U000019da'), + ('\U00002070', '\U00002070'), + ('\U00002074', '\U00002079'), + ('\U00002080', 
'\U00002089'), + ('\U00002150', '\U0000215f'), + ('\U00002189', '\U00002189'), + ('\U00002460', '\U0000249b'), + ('\U000024ea', '\U000024ff'), + ('\U00002776', '\U00002793'), + ('\U00002cfd', '\U00002cfd'), + ('\U00003192', '\U00003195'), + ('\U00003220', '\U00003229'), + ('\U00003248', '\U0000324f'), + ('\U00003251', '\U0000325f'), + ('\U00003280', '\U00003289'), + ('\U000032b1', '\U000032bf'), + ('\U0000a830', '\U0000a835'), + ('\U00010107', '\U00010133'), + ('\U00010175', '\U00010178'), + ('\U0001018a', '\U0001018a'), + ('\U00010320', '\U00010323'), + ('\U00010858', '\U0001085f'), + ('\U00010916', '\U0001091b'), + ('\U00010a40', '\U00010a47'), + ('\U00010a7d', '\U00010a7e'), + ('\U00010b58', '\U00010b5f'), + ('\U00010b78', '\U00010b7f'), + ('\U00010e60', '\U00010e7e'), + ('\U00011052', '\U00011065'), + ('\U0001d360', '\U0001d371'), + ('\U0001f100', '\U0001f10a') + ]), +("Ogham", &[ + ('\U00001680', '\U0000169c') + ]), +("Ol_Chiki", &[ + ('\U00001c50', '\U00001c7f') + ]), +("Old_Italic", &[ + ('\U00010300', '\U0001031e'), + ('\U00010320', '\U00010323') + ]), +("Old_Persian", &[ + ('\U000103a0', '\U000103c3'), + ('\U000103c8', '\U000103d5') + ]), +("Old_South_Arabian", &[ + ('\U00010a60', '\U00010a7f') + ]), +("Old_Turkic", &[ + ('\U00010c00', '\U00010c48') + ]), +("Oriya", &[ + ('\U00000b01', '\U00000b03'), + ('\U00000b05', '\U00000b0c'), + ('\U00000b0f', '\U00000b10'), + ('\U00000b13', '\U00000b28'), + ('\U00000b2a', '\U00000b30'), + ('\U00000b32', '\U00000b33'), + ('\U00000b35', '\U00000b39'), + ('\U00000b3c', '\U00000b44'), + ('\U00000b47', '\U00000b48'), + ('\U00000b4b', '\U00000b4d'), + ('\U00000b56', '\U00000b57'), + ('\U00000b5c', '\U00000b5d'), + ('\U00000b5f', '\U00000b63'), + ('\U00000b66', '\U00000b77') + ]), +("Osmanya", &[ + ('\U00010480', '\U0001049d'), + ('\U000104a0', '\U000104a9') + ]), +("P", &[ + ('\U00000021', '\U00000023'), + ('\U00000025', '\U0000002a'), + ('\U0000002c', '\U0000002f'), + ('\U0000003a', '\U0000003b'), + ('\U0000003f', 
'\U00000040'), + ('\U0000005b', '\U0000005d'), + ('\U0000005f', '\U0000005f'), + ('\U0000007b', '\U0000007b'), + ('\U0000007d', '\U0000007d'), + ('\U000000a1', '\U000000a1'), + ('\U000000a7', '\U000000a7'), + ('\U000000ab', '\U000000ab'), + ('\U000000b6', '\U000000b7'), + ('\U000000bb', '\U000000bb'), + ('\U000000bf', '\U000000bf'), + ('\U0000037e', '\U0000037e'), + ('\U00000387', '\U00000387'), + ('\U0000055a', '\U0000055f'), + ('\U00000589', '\U0000058a'), + ('\U000005be', '\U000005be'), + ('\U000005c0', '\U000005c0'), + ('\U000005c3', '\U000005c3'), + ('\U000005c6', '\U000005c6'), + ('\U000005f3', '\U000005f4'), + ('\U00000609', '\U0000060a'), + ('\U0000060c', '\U0000060d'), + ('\U0000061b', '\U0000061b'), + ('\U0000061e', '\U0000061f'), + ('\U0000066a', '\U0000066d'), + ('\U000006d4', '\U000006d4'), + ('\U00000700', '\U0000070d'), + ('\U000007f7', '\U000007f9'), + ('\U00000830', '\U0000083e'), + ('\U0000085e', '\U0000085e'), + ('\U00000964', '\U00000965'), + ('\U00000970', '\U00000970'), + ('\U00000af0', '\U00000af0'), + ('\U00000df4', '\U00000df4'), + ('\U00000e4f', '\U00000e4f'), + ('\U00000e5a', '\U00000e5b'), + ('\U00000f04', '\U00000f12'), + ('\U00000f14', '\U00000f14'), + ('\U00000f3a', '\U00000f3d'), + ('\U00000f85', '\U00000f85'), + ('\U00000fd0', '\U00000fd4'), + ('\U00000fd9', '\U00000fda'), + ('\U0000104a', '\U0000104f'), + ('\U000010fb', '\U000010fb'), + ('\U00001360', '\U00001368'), + ('\U00001400', '\U00001400'), + ('\U0000166d', '\U0000166e'), + ('\U0000169b', '\U0000169c'), + ('\U000016eb', '\U000016ed'), + ('\U00001735', '\U00001736'), + ('\U000017d4', '\U000017d6'), + ('\U000017d8', '\U000017da'), + ('\U00001800', '\U0000180a'), + ('\U00001944', '\U00001945'), + ('\U00001a1e', '\U00001a1f'), + ('\U00001aa0', '\U00001aa6'), + ('\U00001aa8', '\U00001aad'), + ('\U00001b5a', '\U00001b60'), + ('\U00001bfc', '\U00001bff'), + ('\U00001c3b', '\U00001c3f'), + ('\U00001c7e', '\U00001c7f'), + ('\U00001cc0', '\U00001cc7'), + ('\U00001cd3', '\U00001cd3'), 
+ ('\U00002010', '\U00002027'), + ('\U00002030', '\U00002043'), + ('\U00002045', '\U00002051'), + ('\U00002053', '\U0000205e'), + ('\U0000207d', '\U0000207e'), + ('\U0000208d', '\U0000208e'), + ('\U00002308', '\U0000230b'), + ('\U00002329', '\U0000232a'), + ('\U00002768', '\U00002775'), + ('\U000027c5', '\U000027c6'), + ('\U000027e6', '\U000027ef'), + ('\U00002983', '\U00002998'), + ('\U000029d8', '\U000029db'), + ('\U000029fc', '\U000029fd'), + ('\U00002cf9', '\U00002cfc'), + ('\U00002cfe', '\U00002cff'), + ('\U00002d70', '\U00002d70'), + ('\U00002e00', '\U00002e2e'), + ('\U00002e30', '\U00002e3b'), + ('\U00003001', '\U00003003'), + ('\U00003008', '\U00003011'), + ('\U00003014', '\U0000301f'), + ('\U00003030', '\U00003030'), + ('\U0000303d', '\U0000303d'), + ('\U000030a0', '\U000030a0'), + ('\U000030fb', '\U000030fb'), + ('\U0000a4fe', '\U0000a4ff'), + ('\U0000a60d', '\U0000a60f'), + ('\U0000a673', '\U0000a673'), + ('\U0000a67e', '\U0000a67e'), + ('\U0000a6f2', '\U0000a6f7'), + ('\U0000a874', '\U0000a877'), + ('\U0000a8ce', '\U0000a8cf'), + ('\U0000a8f8', '\U0000a8fa'), + ('\U0000a92e', '\U0000a92f'), + ('\U0000a95f', '\U0000a95f'), + ('\U0000a9c1', '\U0000a9cd'), + ('\U0000a9de', '\U0000a9df'), + ('\U0000aa5c', '\U0000aa5f'), + ('\U0000aade', '\U0000aadf'), + ('\U0000aaf0', '\U0000aaf1'), + ('\U0000abeb', '\U0000abeb'), + ('\U0000fd3e', '\U0000fd3f'), + ('\U0000fe10', '\U0000fe19'), + ('\U0000fe30', '\U0000fe52'), + ('\U0000fe54', '\U0000fe61'), + ('\U0000fe63', '\U0000fe63'), + ('\U0000fe68', '\U0000fe68'), + ('\U0000fe6a', '\U0000fe6b'), + ('\U0000ff01', '\U0000ff03'), + ('\U0000ff05', '\U0000ff0a'), + ('\U0000ff0c', '\U0000ff0f'), + ('\U0000ff1a', '\U0000ff1b'), + ('\U0000ff1f', '\U0000ff20'), + ('\U0000ff3b', '\U0000ff3d'), + ('\U0000ff3f', '\U0000ff3f'), + ('\U0000ff5b', '\U0000ff5b'), + ('\U0000ff5d', '\U0000ff5d'), + ('\U0000ff5f', '\U0000ff65'), + ('\U00010100', '\U00010102'), + ('\U0001039f', '\U0001039f'), + ('\U000103d0', '\U000103d0'), + 
('\U00010857', '\U00010857'), + ('\U0001091f', '\U0001091f'), + ('\U0001093f', '\U0001093f'), + ('\U00010a50', '\U00010a58'), + ('\U00010a7f', '\U00010a7f'), + ('\U00010b39', '\U00010b3f'), + ('\U00011047', '\U0001104d'), + ('\U000110bb', '\U000110bc'), + ('\U000110be', '\U000110c1'), + ('\U00011140', '\U00011143'), + ('\U000111c5', '\U000111c8'), + ('\U00012470', '\U00012473') + ]), +("Pc", &[ + ('\U0000005f', '\U0000005f'), + ('\U0000203f', '\U00002040'), + ('\U00002054', '\U00002054'), + ('\U0000fe33', '\U0000fe34'), + ('\U0000fe4d', '\U0000fe4f'), + ('\U0000ff3f', '\U0000ff3f') + ]), +("Pd", &[ + ('\U0000002d', '\U0000002d'), + ('\U0000058a', '\U0000058a'), + ('\U000005be', '\U000005be'), + ('\U00001400', '\U00001400'), + ('\U00001806', '\U00001806'), + ('\U00002010', '\U00002015'), + ('\U00002e17', '\U00002e17'), + ('\U00002e1a', '\U00002e1a'), + ('\U00002e3a', '\U00002e3b'), + ('\U0000301c', '\U0000301c'), + ('\U00003030', '\U00003030'), + ('\U000030a0', '\U000030a0'), + ('\U0000fe31', '\U0000fe32'), + ('\U0000fe58', '\U0000fe58'), + ('\U0000fe63', '\U0000fe63'), + ('\U0000ff0d', '\U0000ff0d') + ]), +("Pe", &[ + ('\U00000029', '\U00000029'), + ('\U0000005d', '\U0000005d'), + ('\U0000007d', '\U0000007d'), + ('\U00000f3b', '\U00000f3b'), + ('\U00000f3d', '\U00000f3d'), + ('\U0000169c', '\U0000169c'), + ('\U00002046', '\U00002046'), + ('\U0000207e', '\U0000207e'), + ('\U0000208e', '\U0000208e'), + ('\U00002309', '\U00002309'), + ('\U0000230b', '\U0000230b'), + ('\U0000232a', '\U0000232a'), + ('\U00002769', '\U00002769'), + ('\U0000276b', '\U0000276b'), + ('\U0000276d', '\U0000276d'), + ('\U0000276f', '\U0000276f'), + ('\U00002771', '\U00002771'), + ('\U00002773', '\U00002773'), + ('\U00002775', '\U00002775'), + ('\U000027c6', '\U000027c6'), + ('\U000027e7', '\U000027e7'), + ('\U000027e9', '\U000027e9'), + ('\U000027eb', '\U000027eb'), + ('\U000027ed', '\U000027ed'), + ('\U000027ef', '\U000027ef'), + ('\U00002984', '\U00002984'), + ('\U00002986', '\U00002986'), + 
('\U00002988', '\U00002988'), + ('\U0000298a', '\U0000298a'), + ('\U0000298c', '\U0000298c'), + ('\U0000298e', '\U0000298e'), + ('\U00002990', '\U00002990'), + ('\U00002992', '\U00002992'), + ('\U00002994', '\U00002994'), + ('\U00002996', '\U00002996'), + ('\U00002998', '\U00002998'), + ('\U000029d9', '\U000029d9'), + ('\U000029db', '\U000029db'), + ('\U000029fd', '\U000029fd'), + ('\U00002e23', '\U00002e23'), + ('\U00002e25', '\U00002e25'), + ('\U00002e27', '\U00002e27'), + ('\U00002e29', '\U00002e29'), + ('\U00003009', '\U00003009'), + ('\U0000300b', '\U0000300b'), + ('\U0000300d', '\U0000300d'), + ('\U0000300f', '\U0000300f'), + ('\U00003011', '\U00003011'), + ('\U00003015', '\U00003015'), + ('\U00003017', '\U00003017'), + ('\U00003019', '\U00003019'), + ('\U0000301b', '\U0000301b'), + ('\U0000301e', '\U0000301f'), + ('\U0000fd3f', '\U0000fd3f'), + ('\U0000fe18', '\U0000fe18'), + ('\U0000fe36', '\U0000fe36'), + ('\U0000fe38', '\U0000fe38'), + ('\U0000fe3a', '\U0000fe3a'), + ('\U0000fe3c', '\U0000fe3c'), + ('\U0000fe3e', '\U0000fe3e'), + ('\U0000fe40', '\U0000fe40'), + ('\U0000fe42', '\U0000fe42'), + ('\U0000fe44', '\U0000fe44'), + ('\U0000fe48', '\U0000fe48'), + ('\U0000fe5a', '\U0000fe5a'), + ('\U0000fe5c', '\U0000fe5c'), + ('\U0000fe5e', '\U0000fe5e'), + ('\U0000ff09', '\U0000ff09'), + ('\U0000ff3d', '\U0000ff3d'), + ('\U0000ff5d', '\U0000ff5d'), + ('\U0000ff60', '\U0000ff60'), + ('\U0000ff63', '\U0000ff63') + ]), +("Pf", &[ + ('\U000000bb', '\U000000bb'), + ('\U00002019', '\U00002019'), + ('\U0000201d', '\U0000201d'), + ('\U0000203a', '\U0000203a'), + ('\U00002e03', '\U00002e03'), + ('\U00002e05', '\U00002e05'), + ('\U00002e0a', '\U00002e0a'), + ('\U00002e0d', '\U00002e0d'), + ('\U00002e1d', '\U00002e1d'), + ('\U00002e21', '\U00002e21') + ]), +("Phags_Pa", &[ + ('\U0000a840', '\U0000a877') + ]), +("Phoenician", &[ + ('\U00010900', '\U0001091b'), + ('\U0001091f', '\U0001091f') + ]), +("Pi", &[ + ('\U000000ab', '\U000000ab'), + ('\U00002018', '\U00002018'), + 
('\U0000201b', '\U0000201c'), + ('\U0000201f', '\U0000201f'), + ('\U00002039', '\U00002039'), + ('\U00002e02', '\U00002e02'), + ('\U00002e04', '\U00002e04'), + ('\U00002e09', '\U00002e09'), + ('\U00002e0c', '\U00002e0c'), + ('\U00002e1c', '\U00002e1c'), + ('\U00002e20', '\U00002e20') + ]), +("Po", &[ + ('\U00000021', '\U00000023'), + ('\U00000025', '\U00000027'), + ('\U0000002a', '\U0000002a'), + ('\U0000002c', '\U0000002c'), + ('\U0000002e', '\U0000002f'), + ('\U0000003a', '\U0000003b'), + ('\U0000003f', '\U00000040'), + ('\U0000005c', '\U0000005c'), + ('\U000000a1', '\U000000a1'), + ('\U000000a7', '\U000000a7'), + ('\U000000b6', '\U000000b7'), + ('\U000000bf', '\U000000bf'), + ('\U0000037e', '\U0000037e'), + ('\U00000387', '\U00000387'), + ('\U0000055a', '\U0000055f'), + ('\U00000589', '\U00000589'), + ('\U000005c0', '\U000005c0'), + ('\U000005c3', '\U000005c3'), + ('\U000005c6', '\U000005c6'), + ('\U000005f3', '\U000005f4'), + ('\U00000609', '\U0000060a'), + ('\U0000060c', '\U0000060d'), + ('\U0000061b', '\U0000061b'), + ('\U0000061e', '\U0000061f'), + ('\U0000066a', '\U0000066d'), + ('\U000006d4', '\U000006d4'), + ('\U00000700', '\U0000070d'), + ('\U000007f7', '\U000007f9'), + ('\U00000830', '\U0000083e'), + ('\U0000085e', '\U0000085e'), + ('\U00000964', '\U00000965'), + ('\U00000970', '\U00000970'), + ('\U00000af0', '\U00000af0'), + ('\U00000df4', '\U00000df4'), + ('\U00000e4f', '\U00000e4f'), + ('\U00000e5a', '\U00000e5b'), + ('\U00000f04', '\U00000f12'), + ('\U00000f14', '\U00000f14'), + ('\U00000f85', '\U00000f85'), + ('\U00000fd0', '\U00000fd4'), + ('\U00000fd9', '\U00000fda'), + ('\U0000104a', '\U0000104f'), + ('\U000010fb', '\U000010fb'), + ('\U00001360', '\U00001368'), + ('\U0000166d', '\U0000166e'), + ('\U000016eb', '\U000016ed'), + ('\U00001735', '\U00001736'), + ('\U000017d4', '\U000017d6'), + ('\U000017d8', '\U000017da'), + ('\U00001800', '\U00001805'), + ('\U00001807', '\U0000180a'), + ('\U00001944', '\U00001945'), + ('\U00001a1e', '\U00001a1f'), + 
('\U00001aa0', '\U00001aa6'), + ('\U00001aa8', '\U00001aad'), + ('\U00001b5a', '\U00001b60'), + ('\U00001bfc', '\U00001bff'), + ('\U00001c3b', '\U00001c3f'), + ('\U00001c7e', '\U00001c7f'), + ('\U00001cc0', '\U00001cc7'), + ('\U00001cd3', '\U00001cd3'), + ('\U00002016', '\U00002017'), + ('\U00002020', '\U00002027'), + ('\U00002030', '\U00002038'), + ('\U0000203b', '\U0000203e'), + ('\U00002041', '\U00002043'), + ('\U00002047', '\U00002051'), + ('\U00002053', '\U00002053'), + ('\U00002055', '\U0000205e'), + ('\U00002cf9', '\U00002cfc'), + ('\U00002cfe', '\U00002cff'), + ('\U00002d70', '\U00002d70'), + ('\U00002e00', '\U00002e01'), + ('\U00002e06', '\U00002e08'), + ('\U00002e0b', '\U00002e0b'), + ('\U00002e0e', '\U00002e16'), + ('\U00002e18', '\U00002e19'), + ('\U00002e1b', '\U00002e1b'), + ('\U00002e1e', '\U00002e1f'), + ('\U00002e2a', '\U00002e2e'), + ('\U00002e30', '\U00002e39'), + ('\U00003001', '\U00003003'), + ('\U0000303d', '\U0000303d'), + ('\U000030fb', '\U000030fb'), + ('\U0000a4fe', '\U0000a4ff'), + ('\U0000a60d', '\U0000a60f'), + ('\U0000a673', '\U0000a673'), + ('\U0000a67e', '\U0000a67e'), + ('\U0000a6f2', '\U0000a6f7'), + ('\U0000a874', '\U0000a877'), + ('\U0000a8ce', '\U0000a8cf'), + ('\U0000a8f8', '\U0000a8fa'), + ('\U0000a92e', '\U0000a92f'), + ('\U0000a95f', '\U0000a95f'), + ('\U0000a9c1', '\U0000a9cd'), + ('\U0000a9de', '\U0000a9df'), + ('\U0000aa5c', '\U0000aa5f'), + ('\U0000aade', '\U0000aadf'), + ('\U0000aaf0', '\U0000aaf1'), + ('\U0000abeb', '\U0000abeb'), + ('\U0000fe10', '\U0000fe16'), + ('\U0000fe19', '\U0000fe19'), + ('\U0000fe30', '\U0000fe30'), + ('\U0000fe45', '\U0000fe46'), + ('\U0000fe49', '\U0000fe4c'), + ('\U0000fe50', '\U0000fe52'), + ('\U0000fe54', '\U0000fe57'), + ('\U0000fe5f', '\U0000fe61'), + ('\U0000fe68', '\U0000fe68'), + ('\U0000fe6a', '\U0000fe6b'), + ('\U0000ff01', '\U0000ff03'), + ('\U0000ff05', '\U0000ff07'), + ('\U0000ff0a', '\U0000ff0a'), + ('\U0000ff0c', '\U0000ff0c'), + ('\U0000ff0e', '\U0000ff0f'), + ('\U0000ff1a', 
'\U0000ff1b'), + ('\U0000ff1f', '\U0000ff20'), + ('\U0000ff3c', '\U0000ff3c'), + ('\U0000ff61', '\U0000ff61'), + ('\U0000ff64', '\U0000ff65'), + ('\U00010100', '\U00010102'), + ('\U0001039f', '\U0001039f'), + ('\U000103d0', '\U000103d0'), + ('\U00010857', '\U00010857'), + ('\U0001091f', '\U0001091f'), + ('\U0001093f', '\U0001093f'), + ('\U00010a50', '\U00010a58'), + ('\U00010a7f', '\U00010a7f'), + ('\U00010b39', '\U00010b3f'), + ('\U00011047', '\U0001104d'), + ('\U000110bb', '\U000110bc'), + ('\U000110be', '\U000110c1'), + ('\U00011140', '\U00011143'), + ('\U000111c5', '\U000111c8'), + ('\U00012470', '\U00012473') + ]), +("Ps", &[ + ('\U00000028', '\U00000028'), + ('\U0000005b', '\U0000005b'), + ('\U0000007b', '\U0000007b'), + ('\U00000f3a', '\U00000f3a'), + ('\U00000f3c', '\U00000f3c'), + ('\U0000169b', '\U0000169b'), + ('\U0000201a', '\U0000201a'), + ('\U0000201e', '\U0000201e'), + ('\U00002045', '\U00002045'), + ('\U0000207d', '\U0000207d'), + ('\U0000208d', '\U0000208d'), + ('\U00002308', '\U00002308'), + ('\U0000230a', '\U0000230a'), + ('\U00002329', '\U00002329'), + ('\U00002768', '\U00002768'), + ('\U0000276a', '\U0000276a'), + ('\U0000276c', '\U0000276c'), + ('\U0000276e', '\U0000276e'), + ('\U00002770', '\U00002770'), + ('\U00002772', '\U00002772'), + ('\U00002774', '\U00002774'), + ('\U000027c5', '\U000027c5'), + ('\U000027e6', '\U000027e6'), + ('\U000027e8', '\U000027e8'), + ('\U000027ea', '\U000027ea'), + ('\U000027ec', '\U000027ec'), + ('\U000027ee', '\U000027ee'), + ('\U00002983', '\U00002983'), + ('\U00002985', '\U00002985'), + ('\U00002987', '\U00002987'), + ('\U00002989', '\U00002989'), + ('\U0000298b', '\U0000298b'), + ('\U0000298d', '\U0000298d'), + ('\U0000298f', '\U0000298f'), + ('\U00002991', '\U00002991'), + ('\U00002993', '\U00002993'), + ('\U00002995', '\U00002995'), + ('\U00002997', '\U00002997'), + ('\U000029d8', '\U000029d8'), + ('\U000029da', '\U000029da'), + ('\U000029fc', '\U000029fc'), + ('\U00002e22', '\U00002e22'), + ('\U00002e24', 
'\U00002e24'), + ('\U00002e26', '\U00002e26'), + ('\U00002e28', '\U00002e28'), + ('\U00003008', '\U00003008'), + ('\U0000300a', '\U0000300a'), + ('\U0000300c', '\U0000300c'), + ('\U0000300e', '\U0000300e'), + ('\U00003010', '\U00003010'), + ('\U00003014', '\U00003014'), + ('\U00003016', '\U00003016'), + ('\U00003018', '\U00003018'), + ('\U0000301a', '\U0000301a'), + ('\U0000301d', '\U0000301d'), + ('\U0000fd3e', '\U0000fd3e'), + ('\U0000fe17', '\U0000fe17'), + ('\U0000fe35', '\U0000fe35'), + ('\U0000fe37', '\U0000fe37'), + ('\U0000fe39', '\U0000fe39'), + ('\U0000fe3b', '\U0000fe3b'), + ('\U0000fe3d', '\U0000fe3d'), + ('\U0000fe3f', '\U0000fe3f'), + ('\U0000fe41', '\U0000fe41'), + ('\U0000fe43', '\U0000fe43'), + ('\U0000fe47', '\U0000fe47'), + ('\U0000fe59', '\U0000fe59'), + ('\U0000fe5b', '\U0000fe5b'), + ('\U0000fe5d', '\U0000fe5d'), + ('\U0000ff08', '\U0000ff08'), + ('\U0000ff3b', '\U0000ff3b'), + ('\U0000ff5b', '\U0000ff5b'), + ('\U0000ff5f', '\U0000ff5f'), + ('\U0000ff62', '\U0000ff62') + ]), +("Rejang", &[ + ('\U0000a930', '\U0000a953'), + ('\U0000a95f', '\U0000a95f') + ]), +("Runic", &[ + ('\U000016a0', '\U000016ea'), + ('\U000016ee', '\U000016f0') + ]), +("S", &[ + ('\U00000024', '\U00000024'), + ('\U0000002b', '\U0000002b'), + ('\U0000003c', '\U0000003e'), + ('\U0000005e', '\U0000005e'), + ('\U00000060', '\U00000060'), + ('\U0000007c', '\U0000007c'), + ('\U0000007e', '\U0000007e'), + ('\U000000a2', '\U000000a6'), + ('\U000000a8', '\U000000a9'), + ('\U000000ac', '\U000000ac'), + ('\U000000ae', '\U000000b1'), + ('\U000000b4', '\U000000b4'), + ('\U000000b8', '\U000000b8'), + ('\U000000d7', '\U000000d7'), + ('\U000000f7', '\U000000f7'), + ('\U000002c2', '\U000002c5'), + ('\U000002d2', '\U000002df'), + ('\U000002e5', '\U000002eb'), + ('\U000002ed', '\U000002ed'), + ('\U000002ef', '\U000002ff'), + ('\U00000375', '\U00000375'), + ('\U00000384', '\U00000385'), + ('\U000003f6', '\U000003f6'), + ('\U00000482', '\U00000482'), + ('\U0000058f', '\U0000058f'), + 
('\U00000606', '\U00000608'), + ('\U0000060b', '\U0000060b'), + ('\U0000060e', '\U0000060f'), + ('\U000006de', '\U000006de'), + ('\U000006e9', '\U000006e9'), + ('\U000006fd', '\U000006fe'), + ('\U000007f6', '\U000007f6'), + ('\U000009f2', '\U000009f3'), + ('\U000009fa', '\U000009fb'), + ('\U00000af1', '\U00000af1'), + ('\U00000b70', '\U00000b70'), + ('\U00000bf3', '\U00000bfa'), + ('\U00000c7f', '\U00000c7f'), + ('\U00000d79', '\U00000d79'), + ('\U00000e3f', '\U00000e3f'), + ('\U00000f01', '\U00000f03'), + ('\U00000f13', '\U00000f13'), + ('\U00000f15', '\U00000f17'), + ('\U00000f1a', '\U00000f1f'), + ('\U00000f34', '\U00000f34'), + ('\U00000f36', '\U00000f36'), + ('\U00000f38', '\U00000f38'), + ('\U00000fbe', '\U00000fc5'), + ('\U00000fc7', '\U00000fcc'), + ('\U00000fce', '\U00000fcf'), + ('\U00000fd5', '\U00000fd8'), + ('\U0000109e', '\U0000109f'), + ('\U00001390', '\U00001399'), + ('\U000017db', '\U000017db'), + ('\U00001940', '\U00001940'), + ('\U000019de', '\U000019ff'), + ('\U00001b61', '\U00001b6a'), + ('\U00001b74', '\U00001b7c'), + ('\U00001fbd', '\U00001fbd'), + ('\U00001fbf', '\U00001fc1'), + ('\U00001fcd', '\U00001fcf'), + ('\U00001fdd', '\U00001fdf'), + ('\U00001fed', '\U00001fef'), + ('\U00001ffd', '\U00001ffe'), + ('\U00002044', '\U00002044'), + ('\U00002052', '\U00002052'), + ('\U0000207a', '\U0000207c'), + ('\U0000208a', '\U0000208c'), + ('\U000020a0', '\U000020ba'), + ('\U00002100', '\U00002101'), + ('\U00002103', '\U00002106'), + ('\U00002108', '\U00002109'), + ('\U00002114', '\U00002114'), + ('\U00002116', '\U00002118'), + ('\U0000211e', '\U00002123'), + ('\U00002125', '\U00002125'), + ('\U00002127', '\U00002127'), + ('\U00002129', '\U00002129'), + ('\U0000212e', '\U0000212e'), + ('\U0000213a', '\U0000213b'), + ('\U00002140', '\U00002144'), + ('\U0000214a', '\U0000214d'), + ('\U0000214f', '\U0000214f'), + ('\U00002190', '\U00002307'), + ('\U0000230c', '\U00002328'), + ('\U0000232b', '\U000023f3'), + ('\U00002400', '\U00002426'), + ('\U00002440', 
'\U0000244a'), + ('\U0000249c', '\U000024e9'), + ('\U00002500', '\U000026ff'), + ('\U00002701', '\U00002767'), + ('\U00002794', '\U000027c4'), + ('\U000027c7', '\U000027e5'), + ('\U000027f0', '\U00002982'), + ('\U00002999', '\U000029d7'), + ('\U000029dc', '\U000029fb'), + ('\U000029fe', '\U00002b4c'), + ('\U00002b50', '\U00002b59'), + ('\U00002ce5', '\U00002cea'), + ('\U00002e80', '\U00002e99'), + ('\U00002e9b', '\U00002ef3'), + ('\U00002f00', '\U00002fd5'), + ('\U00002ff0', '\U00002ffb'), + ('\U00003004', '\U00003004'), + ('\U00003012', '\U00003013'), + ('\U00003020', '\U00003020'), + ('\U00003036', '\U00003037'), + ('\U0000303e', '\U0000303f'), + ('\U0000309b', '\U0000309c'), + ('\U00003190', '\U00003191'), + ('\U00003196', '\U0000319f'), + ('\U000031c0', '\U000031e3'), + ('\U00003200', '\U0000321e'), + ('\U0000322a', '\U00003247'), + ('\U00003250', '\U00003250'), + ('\U00003260', '\U0000327f'), + ('\U0000328a', '\U000032b0'), + ('\U000032c0', '\U000032fe'), + ('\U00003300', '\U000033ff'), + ('\U00004dc0', '\U00004dff'), + ('\U0000a490', '\U0000a4c6'), + ('\U0000a700', '\U0000a716'), + ('\U0000a720', '\U0000a721'), + ('\U0000a789', '\U0000a78a'), + ('\U0000a828', '\U0000a82b'), + ('\U0000a836', '\U0000a839'), + ('\U0000aa77', '\U0000aa79'), + ('\U0000fb29', '\U0000fb29'), + ('\U0000fbb2', '\U0000fbc1'), + ('\U0000fdfc', '\U0000fdfd'), + ('\U0000fe62', '\U0000fe62'), + ('\U0000fe64', '\U0000fe66'), + ('\U0000fe69', '\U0000fe69'), + ('\U0000ff04', '\U0000ff04'), + ('\U0000ff0b', '\U0000ff0b'), + ('\U0000ff1c', '\U0000ff1e'), + ('\U0000ff3e', '\U0000ff3e'), + ('\U0000ff40', '\U0000ff40'), + ('\U0000ff5c', '\U0000ff5c'), + ('\U0000ff5e', '\U0000ff5e'), + ('\U0000ffe0', '\U0000ffe6'), + ('\U0000ffe8', '\U0000ffee'), + ('\U0000fffc', '\U0000fffd'), + ('\U00010137', '\U0001013f'), + ('\U00010179', '\U00010189'), + ('\U00010190', '\U0001019b'), + ('\U000101d0', '\U000101fc'), + ('\U0001d000', '\U0001d0f5'), + ('\U0001d100', '\U0001d126'), + ('\U0001d129', '\U0001d164'), 
+ ('\U0001d16a', '\U0001d16c'), + ('\U0001d183', '\U0001d184'), + ('\U0001d18c', '\U0001d1a9'), + ('\U0001d1ae', '\U0001d1dd'), + ('\U0001d200', '\U0001d241'), + ('\U0001d245', '\U0001d245'), + ('\U0001d300', '\U0001d356'), + ('\U0001d6c1', '\U0001d6c1'), + ('\U0001d6db', '\U0001d6db'), + ('\U0001d6fb', '\U0001d6fb'), + ('\U0001d715', '\U0001d715'), + ('\U0001d735', '\U0001d735'), + ('\U0001d74f', '\U0001d74f'), + ('\U0001d76f', '\U0001d76f'), + ('\U0001d789', '\U0001d789'), + ('\U0001d7a9', '\U0001d7a9'), + ('\U0001d7c3', '\U0001d7c3'), + ('\U0001eef0', '\U0001eef1'), + ('\U0001f000', '\U0001f02b'), + ('\U0001f030', '\U0001f093'), + ('\U0001f0a0', '\U0001f0ae'), + ('\U0001f0b1', '\U0001f0be'), + ('\U0001f0c1', '\U0001f0cf'), + ('\U0001f0d1', '\U0001f0df'), + ('\U0001f110', '\U0001f12e'), + ('\U0001f130', '\U0001f16b'), + ('\U0001f170', '\U0001f19a'), + ('\U0001f1e6', '\U0001f202'), + ('\U0001f210', '\U0001f23a'), + ('\U0001f240', '\U0001f248'), + ('\U0001f250', '\U0001f251'), + ('\U0001f300', '\U0001f320'), + ('\U0001f330', '\U0001f335'), + ('\U0001f337', '\U0001f37c'), + ('\U0001f380', '\U0001f393'), + ('\U0001f3a0', '\U0001f3c4'), + ('\U0001f3c6', '\U0001f3ca'), + ('\U0001f3e0', '\U0001f3f0'), + ('\U0001f400', '\U0001f43e'), + ('\U0001f440', '\U0001f440'), + ('\U0001f442', '\U0001f4f7'), + ('\U0001f4f9', '\U0001f4fc'), + ('\U0001f500', '\U0001f53d'), + ('\U0001f540', '\U0001f543'), + ('\U0001f550', '\U0001f567'), + ('\U0001f5fb', '\U0001f640'), + ('\U0001f645', '\U0001f64f'), + ('\U0001f680', '\U0001f6c5'), + ('\U0001f700', '\U0001f773') + ]), +("Samaritan", &[ + ('\U00000800', '\U0000082d'), + ('\U00000830', '\U0000083e') + ]), +("Saurashtra", &[ + ('\U0000a880', '\U0000a8c4'), + ('\U0000a8ce', '\U0000a8d9') + ]), +("Sc", &[ + ('\U00000024', '\U00000024'), + ('\U000000a2', '\U000000a5'), + ('\U0000058f', '\U0000058f'), + ('\U0000060b', '\U0000060b'), + ('\U000009f2', '\U000009f3'), + ('\U000009fb', '\U000009fb'), + ('\U00000af1', '\U00000af1'), + ('\U00000bf9', 
'\U00000bf9'), + ('\U00000e3f', '\U00000e3f'), + ('\U000017db', '\U000017db'), + ('\U000020a0', '\U000020ba'), + ('\U0000a838', '\U0000a838'), + ('\U0000fdfc', '\U0000fdfc'), + ('\U0000fe69', '\U0000fe69'), + ('\U0000ff04', '\U0000ff04'), + ('\U0000ffe0', '\U0000ffe1'), + ('\U0000ffe5', '\U0000ffe6') + ]), +("Sharada", &[ + ('\U00011180', '\U000111c8'), + ('\U000111d0', '\U000111d9') + ]), +("Shavian", &[ + ('\U00010450', '\U0001047f') + ]), +("Sinhala", &[ + ('\U00000d82', '\U00000d83'), + ('\U00000d85', '\U00000d96'), + ('\U00000d9a', '\U00000db1'), + ('\U00000db3', '\U00000dbb'), + ('\U00000dbd', '\U00000dbd'), + ('\U00000dc0', '\U00000dc6'), + ('\U00000dca', '\U00000dca'), + ('\U00000dcf', '\U00000dd4'), + ('\U00000dd6', '\U00000dd6'), + ('\U00000dd8', '\U00000ddf'), + ('\U00000df2', '\U00000df4') + ]), +("Sk", &[ + ('\U0000005e', '\U0000005e'), + ('\U00000060', '\U00000060'), + ('\U000000a8', '\U000000a8'), + ('\U000000af', '\U000000af'), + ('\U000000b4', '\U000000b4'), + ('\U000000b8', '\U000000b8'), + ('\U000002c2', '\U000002c5'), + ('\U000002d2', '\U000002df'), + ('\U000002e5', '\U000002eb'), + ('\U000002ed', '\U000002ed'), + ('\U000002ef', '\U000002ff'), + ('\U00000375', '\U00000375'), + ('\U00000384', '\U00000385'), + ('\U00001fbd', '\U00001fbd'), + ('\U00001fbf', '\U00001fc1'), + ('\U00001fcd', '\U00001fcf'), + ('\U00001fdd', '\U00001fdf'), + ('\U00001fed', '\U00001fef'), + ('\U00001ffd', '\U00001ffe'), + ('\U0000309b', '\U0000309c'), + ('\U0000a700', '\U0000a716'), + ('\U0000a720', '\U0000a721'), + ('\U0000a789', '\U0000a78a'), + ('\U0000fbb2', '\U0000fbc1'), + ('\U0000ff3e', '\U0000ff3e'), + ('\U0000ff40', '\U0000ff40'), + ('\U0000ffe3', '\U0000ffe3') + ]), +("Sm", &[ + ('\U0000002b', '\U0000002b'), + ('\U0000003c', '\U0000003e'), + ('\U0000007c', '\U0000007c'), + ('\U0000007e', '\U0000007e'), + ('\U000000ac', '\U000000ac'), + ('\U000000b1', '\U000000b1'), + ('\U000000d7', '\U000000d7'), + ('\U000000f7', '\U000000f7'), + ('\U000003f6', '\U000003f6'), + 
('\U00000606', '\U00000608'), + ('\U00002044', '\U00002044'), + ('\U00002052', '\U00002052'), + ('\U0000207a', '\U0000207c'), + ('\U0000208a', '\U0000208c'), + ('\U00002118', '\U00002118'), + ('\U00002140', '\U00002144'), + ('\U0000214b', '\U0000214b'), + ('\U00002190', '\U00002194'), + ('\U0000219a', '\U0000219b'), + ('\U000021a0', '\U000021a0'), + ('\U000021a3', '\U000021a3'), + ('\U000021a6', '\U000021a6'), + ('\U000021ae', '\U000021ae'), + ('\U000021ce', '\U000021cf'), + ('\U000021d2', '\U000021d2'), + ('\U000021d4', '\U000021d4'), + ('\U000021f4', '\U000022ff'), + ('\U00002320', '\U00002321'), + ('\U0000237c', '\U0000237c'), + ('\U0000239b', '\U000023b3'), + ('\U000023dc', '\U000023e1'), + ('\U000025b7', '\U000025b7'), + ('\U000025c1', '\U000025c1'), + ('\U000025f8', '\U000025ff'), + ('\U0000266f', '\U0000266f'), + ('\U000027c0', '\U000027c4'), + ('\U000027c7', '\U000027e5'), + ('\U000027f0', '\U000027ff'), + ('\U00002900', '\U00002982'), + ('\U00002999', '\U000029d7'), + ('\U000029dc', '\U000029fb'), + ('\U000029fe', '\U00002aff'), + ('\U00002b30', '\U00002b44'), + ('\U00002b47', '\U00002b4c'), + ('\U0000fb29', '\U0000fb29'), + ('\U0000fe62', '\U0000fe62'), + ('\U0000fe64', '\U0000fe66'), + ('\U0000ff0b', '\U0000ff0b'), + ('\U0000ff1c', '\U0000ff1e'), + ('\U0000ff5c', '\U0000ff5c'), + ('\U0000ff5e', '\U0000ff5e'), + ('\U0000ffe2', '\U0000ffe2'), + ('\U0000ffe9', '\U0000ffec'), + ('\U0001d6c1', '\U0001d6c1'), + ('\U0001d6db', '\U0001d6db'), + ('\U0001d6fb', '\U0001d6fb'), + ('\U0001d715', '\U0001d715'), + ('\U0001d735', '\U0001d735'), + ('\U0001d74f', '\U0001d74f'), + ('\U0001d76f', '\U0001d76f'), + ('\U0001d789', '\U0001d789'), + ('\U0001d7a9', '\U0001d7a9'), + ('\U0001d7c3', '\U0001d7c3'), + ('\U0001eef0', '\U0001eef1') + ]), +("So", &[ + ('\U000000a6', '\U000000a6'), + ('\U000000a9', '\U000000a9'), + ('\U000000ae', '\U000000ae'), + ('\U000000b0', '\U000000b0'), + ('\U00000482', '\U00000482'), + ('\U0000060e', '\U0000060f'), + ('\U000006de', '\U000006de'), + 
('\U000006e9', '\U000006e9'), + ('\U000006fd', '\U000006fe'), + ('\U000007f6', '\U000007f6'), + ('\U000009fa', '\U000009fa'), + ('\U00000b70', '\U00000b70'), + ('\U00000bf3', '\U00000bf8'), + ('\U00000bfa', '\U00000bfa'), + ('\U00000c7f', '\U00000c7f'), + ('\U00000d79', '\U00000d79'), + ('\U00000f01', '\U00000f03'), + ('\U00000f13', '\U00000f13'), + ('\U00000f15', '\U00000f17'), + ('\U00000f1a', '\U00000f1f'), + ('\U00000f34', '\U00000f34'), + ('\U00000f36', '\U00000f36'), + ('\U00000f38', '\U00000f38'), + ('\U00000fbe', '\U00000fc5'), + ('\U00000fc7', '\U00000fcc'), + ('\U00000fce', '\U00000fcf'), + ('\U00000fd5', '\U00000fd8'), + ('\U0000109e', '\U0000109f'), + ('\U00001390', '\U00001399'), + ('\U00001940', '\U00001940'), + ('\U000019de', '\U000019ff'), + ('\U00001b61', '\U00001b6a'), + ('\U00001b74', '\U00001b7c'), + ('\U00002100', '\U00002101'), + ('\U00002103', '\U00002106'), + ('\U00002108', '\U00002109'), + ('\U00002114', '\U00002114'), + ('\U00002116', '\U00002117'), + ('\U0000211e', '\U00002123'), + ('\U00002125', '\U00002125'), + ('\U00002127', '\U00002127'), + ('\U00002129', '\U00002129'), + ('\U0000212e', '\U0000212e'), + ('\U0000213a', '\U0000213b'), + ('\U0000214a', '\U0000214a'), + ('\U0000214c', '\U0000214d'), + ('\U0000214f', '\U0000214f'), + ('\U00002195', '\U00002199'), + ('\U0000219c', '\U0000219f'), + ('\U000021a1', '\U000021a2'), + ('\U000021a4', '\U000021a5'), + ('\U000021a7', '\U000021ad'), + ('\U000021af', '\U000021cd'), + ('\U000021d0', '\U000021d1'), + ('\U000021d3', '\U000021d3'), + ('\U000021d5', '\U000021f3'), + ('\U00002300', '\U00002307'), + ('\U0000230c', '\U0000231f'), + ('\U00002322', '\U00002328'), + ('\U0000232b', '\U0000237b'), + ('\U0000237d', '\U0000239a'), + ('\U000023b4', '\U000023db'), + ('\U000023e2', '\U000023f3'), + ('\U00002400', '\U00002426'), + ('\U00002440', '\U0000244a'), + ('\U0000249c', '\U000024e9'), + ('\U00002500', '\U000025b6'), + ('\U000025b8', '\U000025c0'), + ('\U000025c2', '\U000025f7'), + ('\U00002600', 
'\U0000266e'), + ('\U00002670', '\U000026ff'), + ('\U00002701', '\U00002767'), + ('\U00002794', '\U000027bf'), + ('\U00002800', '\U000028ff'), + ('\U00002b00', '\U00002b2f'), + ('\U00002b45', '\U00002b46'), + ('\U00002b50', '\U00002b59'), + ('\U00002ce5', '\U00002cea'), + ('\U00002e80', '\U00002e99'), + ('\U00002e9b', '\U00002ef3'), + ('\U00002f00', '\U00002fd5'), + ('\U00002ff0', '\U00002ffb'), + ('\U00003004', '\U00003004'), + ('\U00003012', '\U00003013'), + ('\U00003020', '\U00003020'), + ('\U00003036', '\U00003037'), + ('\U0000303e', '\U0000303f'), + ('\U00003190', '\U00003191'), + ('\U00003196', '\U0000319f'), + ('\U000031c0', '\U000031e3'), + ('\U00003200', '\U0000321e'), + ('\U0000322a', '\U00003247'), + ('\U00003250', '\U00003250'), + ('\U00003260', '\U0000327f'), + ('\U0000328a', '\U000032b0'), + ('\U000032c0', '\U000032fe'), + ('\U00003300', '\U000033ff'), + ('\U00004dc0', '\U00004dff'), + ('\U0000a490', '\U0000a4c6'), + ('\U0000a828', '\U0000a82b'), + ('\U0000a836', '\U0000a837'), + ('\U0000a839', '\U0000a839'), + ('\U0000aa77', '\U0000aa79'), + ('\U0000fdfd', '\U0000fdfd'), + ('\U0000ffe4', '\U0000ffe4'), + ('\U0000ffe8', '\U0000ffe8'), + ('\U0000ffed', '\U0000ffee'), + ('\U0000fffc', '\U0000fffd'), + ('\U00010137', '\U0001013f'), + ('\U00010179', '\U00010189'), + ('\U00010190', '\U0001019b'), + ('\U000101d0', '\U000101fc'), + ('\U0001d000', '\U0001d0f5'), + ('\U0001d100', '\U0001d126'), + ('\U0001d129', '\U0001d164'), + ('\U0001d16a', '\U0001d16c'), + ('\U0001d183', '\U0001d184'), + ('\U0001d18c', '\U0001d1a9'), + ('\U0001d1ae', '\U0001d1dd'), + ('\U0001d200', '\U0001d241'), + ('\U0001d245', '\U0001d245'), + ('\U0001d300', '\U0001d356'), + ('\U0001f000', '\U0001f02b'), + ('\U0001f030', '\U0001f093'), + ('\U0001f0a0', '\U0001f0ae'), + ('\U0001f0b1', '\U0001f0be'), + ('\U0001f0c1', '\U0001f0cf'), + ('\U0001f0d1', '\U0001f0df'), + ('\U0001f110', '\U0001f12e'), + ('\U0001f130', '\U0001f16b'), + ('\U0001f170', '\U0001f19a'), + ('\U0001f1e6', '\U0001f202'), 
+ ('\U0001f210', '\U0001f23a'), + ('\U0001f240', '\U0001f248'), + ('\U0001f250', '\U0001f251'), + ('\U0001f300', '\U0001f320'), + ('\U0001f330', '\U0001f335'), + ('\U0001f337', '\U0001f37c'), + ('\U0001f380', '\U0001f393'), + ('\U0001f3a0', '\U0001f3c4'), + ('\U0001f3c6', '\U0001f3ca'), + ('\U0001f3e0', '\U0001f3f0'), + ('\U0001f400', '\U0001f43e'), + ('\U0001f440', '\U0001f440'), + ('\U0001f442', '\U0001f4f7'), + ('\U0001f4f9', '\U0001f4fc'), + ('\U0001f500', '\U0001f53d'), + ('\U0001f540', '\U0001f543'), + ('\U0001f550', '\U0001f567'), + ('\U0001f5fb', '\U0001f640'), + ('\U0001f645', '\U0001f64f'), + ('\U0001f680', '\U0001f6c5'), + ('\U0001f700', '\U0001f773') + ]), +("Sora_Sompeng", &[ + ('\U000110d0', '\U000110e8'), + ('\U000110f0', '\U000110f9') + ]), +("Sundanese", &[ + ('\U00001b80', '\U00001bbf'), + ('\U00001cc0', '\U00001cc7') + ]), +("Syloti_Nagri", &[ + ('\U0000a800', '\U0000a82b') + ]), +("Syriac", &[ + ('\U00000700', '\U0000070d'), + ('\U0000070f', '\U0000074a'), + ('\U0000074d', '\U0000074f') + ]), +("Tagalog", &[ + ('\U00001700', '\U0000170c'), + ('\U0000170e', '\U00001714') + ]), +("Tagbanwa", &[ + ('\U00001760', '\U0000176c'), + ('\U0000176e', '\U00001770'), + ('\U00001772', '\U00001773') + ]), +("Tai_Le", &[ + ('\U00001950', '\U0000196d'), + ('\U00001970', '\U00001974') + ]), +("Tai_Tham", &[ + ('\U00001a20', '\U00001a5e'), + ('\U00001a60', '\U00001a7c'), + ('\U00001a7f', '\U00001a89'), + ('\U00001a90', '\U00001a99'), + ('\U00001aa0', '\U00001aad') + ]), +("Tai_Viet", &[ + ('\U0000aa80', '\U0000aac2'), + ('\U0000aadb', '\U0000aadf') + ]), +("Takri", &[ + ('\U00011680', '\U000116b7'), + ('\U000116c0', '\U000116c9') + ]), +("Tamil", &[ + ('\U00000b82', '\U00000b83'), + ('\U00000b85', '\U00000b8a'), + ('\U00000b8e', '\U00000b90'), + ('\U00000b92', '\U00000b95'), + ('\U00000b99', '\U00000b9a'), + ('\U00000b9c', '\U00000b9c'), + ('\U00000b9e', '\U00000b9f'), + ('\U00000ba3', '\U00000ba4'), + ('\U00000ba8', '\U00000baa'), + ('\U00000bae', '\U00000bb9'), 
+ ('\U00000bbe', '\U00000bc2'), + ('\U00000bc6', '\U00000bc8'), + ('\U00000bca', '\U00000bcd'), + ('\U00000bd0', '\U00000bd0'), + ('\U00000bd7', '\U00000bd7'), + ('\U00000be6', '\U00000bfa') + ]), +("Telugu", &[ + ('\U00000c01', '\U00000c03'), + ('\U00000c05', '\U00000c0c'), + ('\U00000c0e', '\U00000c10'), + ('\U00000c12', '\U00000c28'), + ('\U00000c2a', '\U00000c33'), + ('\U00000c35', '\U00000c39'), + ('\U00000c3d', '\U00000c44'), + ('\U00000c46', '\U00000c48'), + ('\U00000c4a', '\U00000c4d'), + ('\U00000c55', '\U00000c56'), + ('\U00000c58', '\U00000c59'), + ('\U00000c60', '\U00000c63'), + ('\U00000c66', '\U00000c6f'), + ('\U00000c78', '\U00000c7f') + ]), +("Thaana", &[ + ('\U00000780', '\U000007b1') + ]), +("Thai", &[ + ('\U00000e01', '\U00000e3a'), + ('\U00000e40', '\U00000e5b') + ]), +("Tibetan", &[ + ('\U00000f00', '\U00000f47'), + ('\U00000f49', '\U00000f6c'), + ('\U00000f71', '\U00000f97'), + ('\U00000f99', '\U00000fbc'), + ('\U00000fbe', '\U00000fcc'), + ('\U00000fce', '\U00000fd4'), + ('\U00000fd9', '\U00000fda') + ]), +("Tifinagh", &[ + ('\U00002d30', '\U00002d67'), + ('\U00002d6f', '\U00002d70'), + ('\U00002d7f', '\U00002d7f') + ]), +("Ugaritic", &[ + ('\U00010380', '\U0001039d'), + ('\U0001039f', '\U0001039f') + ]), +("Vai", &[ + ('\U0000a500', '\U0000a62b') + ]), +("Yi", &[ + ('\U0000a000', '\U0000a48c'), + ('\U0000a490', '\U0000a4c6') + ]), +("Z", &[ + ('\U00000020', '\U00000020'), + ('\U000000a0', '\U000000a0'), + ('\U00001680', '\U00001680'), + ('\U00002000', '\U0000200a'), + ('\U00002028', '\U00002029'), + ('\U0000202f', '\U0000202f'), + ('\U0000205f', '\U0000205f'), + ('\U00003000', '\U00003000') + ]), +("Zl", &[ + ('\U00002028', '\U00002028') + ]), +("Zp", &[ + ('\U00002029', '\U00002029') + ]), +("Zs", &[ + ('\U00000020', '\U00000020'), + ('\U000000a0', '\U000000a0'), + ('\U00001680', '\U00001680'), + ('\U00002000', '\U0000200a'), + ('\U0000202f', '\U0000202f'), + ('\U0000205f', '\U0000205f'), + ('\U00003000', '\U00003000') + ]), + +]; + +pub 
static PERLD: Class = &[ + ('\U00000030', '\U00000039'), + ('\U00000660', '\U00000669'), + ('\U000006f0', '\U000006f9'), + ('\U000007c0', '\U000007c9'), + ('\U00000966', '\U0000096f'), + ('\U000009e6', '\U000009ef'), + ('\U00000a66', '\U00000a6f'), + ('\U00000ae6', '\U00000aef'), + ('\U00000b66', '\U00000b6f'), + ('\U00000be6', '\U00000bef'), + ('\U00000c66', '\U00000c6f'), + ('\U00000ce6', '\U00000cef'), + ('\U00000d66', '\U00000d6f'), + ('\U00000e50', '\U00000e59'), + ('\U00000ed0', '\U00000ed9'), + ('\U00000f20', '\U00000f29'), + ('\U00001040', '\U00001049'), + ('\U00001090', '\U00001099'), + ('\U000017e0', '\U000017e9'), + ('\U00001810', '\U00001819'), + ('\U00001946', '\U0000194f'), + ('\U000019d0', '\U000019d9'), + ('\U00001a80', '\U00001a89'), + ('\U00001a90', '\U00001a99'), + ('\U00001b50', '\U00001b59'), + ('\U00001bb0', '\U00001bb9'), + ('\U00001c40', '\U00001c49'), + ('\U00001c50', '\U00001c59'), + ('\U0000a620', '\U0000a629'), + ('\U0000a8d0', '\U0000a8d9'), + ('\U0000a900', '\U0000a909'), + ('\U0000a9d0', '\U0000a9d9'), + ('\U0000aa50', '\U0000aa59'), + ('\U0000abf0', '\U0000abf9'), + ('\U0000ff10', '\U0000ff19'), + ('\U000104a0', '\U000104a9'), + ('\U00011066', '\U0001106f'), + ('\U000110f0', '\U000110f9'), + ('\U00011136', '\U0001113f'), + ('\U000111d0', '\U000111d9'), + ('\U000116c0', '\U000116c9'), + ('\U0001d7ce', '\U0001d7ff') +]; + +pub static PERLS: Class = &[ + ('\U00000009', '\U0000000a'), + ('\U0000000c', '\U0000000d'), + ('\U00000020', '\U00000020'), + ('\U000000a0', '\U000000a0'), + ('\U00001680', '\U00001680'), + ('\U00002000', '\U0000200a'), + ('\U00002028', '\U00002029'), + ('\U0000202f', '\U0000202f'), + ('\U0000205f', '\U0000205f'), + ('\U00003000', '\U00003000') +]; + +pub static PERLW: Class = &[ + ('\U00000030', '\U00000039'), + ('\U00000041', '\U0000005a'), + ('\U0000005f', '\U0000005f'), + ('\U00000061', '\U0000007a'), + ('\U000000aa', '\U000000aa'), + ('\U000000b5', '\U000000b5'), + ('\U000000ba', '\U000000ba'), + ('\U000000c0', 
'\U000000d6'), + ('\U000000d8', '\U000000f6'), + ('\U000000f8', '\U000002c1'), + ('\U000002c6', '\U000002d1'), + ('\U000002e0', '\U000002e4'), + ('\U000002ec', '\U000002ec'), + ('\U000002ee', '\U000002ee'), + ('\U00000370', '\U00000374'), + ('\U00000376', '\U00000377'), + ('\U0000037a', '\U0000037d'), + ('\U00000386', '\U00000386'), + ('\U00000388', '\U0000038a'), + ('\U0000038c', '\U0000038c'), + ('\U0000038e', '\U000003a1'), + ('\U000003a3', '\U000003f5'), + ('\U000003f7', '\U00000481'), + ('\U0000048a', '\U00000527'), + ('\U00000531', '\U00000556'), + ('\U00000559', '\U00000559'), + ('\U00000561', '\U00000587'), + ('\U000005d0', '\U000005ea'), + ('\U000005f0', '\U000005f2'), + ('\U00000620', '\U0000064a'), + ('\U0000066e', '\U0000066f'), + ('\U00000671', '\U000006d3'), + ('\U000006d5', '\U000006d5'), + ('\U000006e5', '\U000006e6'), + ('\U000006ee', '\U000006ef'), + ('\U000006fa', '\U000006fc'), + ('\U000006ff', '\U000006ff'), + ('\U00000710', '\U00000710'), + ('\U00000712', '\U0000072f'), + ('\U0000074d', '\U000007a5'), + ('\U000007b1', '\U000007b1'), + ('\U000007ca', '\U000007ea'), + ('\U000007f4', '\U000007f5'), + ('\U000007fa', '\U000007fa'), + ('\U00000800', '\U00000815'), + ('\U0000081a', '\U0000081a'), + ('\U00000824', '\U00000824'), + ('\U00000828', '\U00000828'), + ('\U00000840', '\U00000858'), + ('\U000008a0', '\U000008a0'), + ('\U000008a2', '\U000008ac'), + ('\U00000904', '\U00000939'), + ('\U0000093d', '\U0000093d'), + ('\U00000950', '\U00000950'), + ('\U00000958', '\U00000961'), + ('\U00000971', '\U00000977'), + ('\U00000979', '\U0000097f'), + ('\U00000985', '\U0000098c'), + ('\U0000098f', '\U00000990'), + ('\U00000993', '\U000009a8'), + ('\U000009aa', '\U000009b0'), + ('\U000009b2', '\U000009b2'), + ('\U000009b6', '\U000009b9'), + ('\U000009bd', '\U000009bd'), + ('\U000009ce', '\U000009ce'), + ('\U000009dc', '\U000009dd'), + ('\U000009df', '\U000009e1'), + ('\U000009f0', '\U000009f1'), + ('\U00000a05', '\U00000a0a'), + ('\U00000a0f', '\U00000a10'), 
+ ('\U00000a13', '\U00000a28'), + ('\U00000a2a', '\U00000a30'), + ('\U00000a32', '\U00000a33'), + ('\U00000a35', '\U00000a36'), + ('\U00000a38', '\U00000a39'), + ('\U00000a59', '\U00000a5c'), + ('\U00000a5e', '\U00000a5e'), + ('\U00000a72', '\U00000a74'), + ('\U00000a85', '\U00000a8d'), + ('\U00000a8f', '\U00000a91'), + ('\U00000a93', '\U00000aa8'), + ('\U00000aaa', '\U00000ab0'), + ('\U00000ab2', '\U00000ab3'), + ('\U00000ab5', '\U00000ab9'), + ('\U00000abd', '\U00000abd'), + ('\U00000ad0', '\U00000ad0'), + ('\U00000ae0', '\U00000ae1'), + ('\U00000b05', '\U00000b0c'), + ('\U00000b0f', '\U00000b10'), + ('\U00000b13', '\U00000b28'), + ('\U00000b2a', '\U00000b30'), + ('\U00000b32', '\U00000b33'), + ('\U00000b35', '\U00000b39'), + ('\U00000b3d', '\U00000b3d'), + ('\U00000b5c', '\U00000b5d'), + ('\U00000b5f', '\U00000b61'), + ('\U00000b71', '\U00000b71'), + ('\U00000b83', '\U00000b83'), + ('\U00000b85', '\U00000b8a'), + ('\U00000b8e', '\U00000b90'), + ('\U00000b92', '\U00000b95'), + ('\U00000b99', '\U00000b9a'), + ('\U00000b9c', '\U00000b9c'), + ('\U00000b9e', '\U00000b9f'), + ('\U00000ba3', '\U00000ba4'), + ('\U00000ba8', '\U00000baa'), + ('\U00000bae', '\U00000bb9'), + ('\U00000bd0', '\U00000bd0'), + ('\U00000c05', '\U00000c0c'), + ('\U00000c0e', '\U00000c10'), + ('\U00000c12', '\U00000c28'), + ('\U00000c2a', '\U00000c33'), + ('\U00000c35', '\U00000c39'), + ('\U00000c3d', '\U00000c3d'), + ('\U00000c58', '\U00000c59'), + ('\U00000c60', '\U00000c61'), + ('\U00000c85', '\U00000c8c'), + ('\U00000c8e', '\U00000c90'), + ('\U00000c92', '\U00000ca8'), + ('\U00000caa', '\U00000cb3'), + ('\U00000cb5', '\U00000cb9'), + ('\U00000cbd', '\U00000cbd'), + ('\U00000cde', '\U00000cde'), + ('\U00000ce0', '\U00000ce1'), + ('\U00000cf1', '\U00000cf2'), + ('\U00000d05', '\U00000d0c'), + ('\U00000d0e', '\U00000d10'), + ('\U00000d12', '\U00000d3a'), + ('\U00000d3d', '\U00000d3d'), + ('\U00000d4e', '\U00000d4e'), + ('\U00000d60', '\U00000d61'), + ('\U00000d7a', '\U00000d7f'), + 
('\U00000d85', '\U00000d96'), + ('\U00000d9a', '\U00000db1'), + ('\U00000db3', '\U00000dbb'), + ('\U00000dbd', '\U00000dbd'), + ('\U00000dc0', '\U00000dc6'), + ('\U00000e01', '\U00000e30'), + ('\U00000e32', '\U00000e33'), + ('\U00000e40', '\U00000e46'), + ('\U00000e81', '\U00000e82'), + ('\U00000e84', '\U00000e84'), + ('\U00000e87', '\U00000e88'), + ('\U00000e8a', '\U00000e8a'), + ('\U00000e8d', '\U00000e8d'), + ('\U00000e94', '\U00000e97'), + ('\U00000e99', '\U00000e9f'), + ('\U00000ea1', '\U00000ea3'), + ('\U00000ea5', '\U00000ea5'), + ('\U00000ea7', '\U00000ea7'), + ('\U00000eaa', '\U00000eab'), + ('\U00000ead', '\U00000eb0'), + ('\U00000eb2', '\U00000eb3'), + ('\U00000ebd', '\U00000ebd'), + ('\U00000ec0', '\U00000ec4'), + ('\U00000ec6', '\U00000ec6'), + ('\U00000edc', '\U00000edf'), + ('\U00000f00', '\U00000f00'), + ('\U00000f40', '\U00000f47'), + ('\U00000f49', '\U00000f6c'), + ('\U00000f88', '\U00000f8c'), + ('\U00001000', '\U0000102a'), + ('\U0000103f', '\U0000103f'), + ('\U00001050', '\U00001055'), + ('\U0000105a', '\U0000105d'), + ('\U00001061', '\U00001061'), + ('\U00001065', '\U00001066'), + ('\U0000106e', '\U00001070'), + ('\U00001075', '\U00001081'), + ('\U0000108e', '\U0000108e'), + ('\U000010a0', '\U000010c5'), + ('\U000010c7', '\U000010c7'), + ('\U000010cd', '\U000010cd'), + ('\U000010d0', '\U000010fa'), + ('\U000010fc', '\U00001248'), + ('\U0000124a', '\U0000124d'), + ('\U00001250', '\U00001256'), + ('\U00001258', '\U00001258'), + ('\U0000125a', '\U0000125d'), + ('\U00001260', '\U00001288'), + ('\U0000128a', '\U0000128d'), + ('\U00001290', '\U000012b0'), + ('\U000012b2', '\U000012b5'), + ('\U000012b8', '\U000012be'), + ('\U000012c0', '\U000012c0'), + ('\U000012c2', '\U000012c5'), + ('\U000012c8', '\U000012d6'), + ('\U000012d8', '\U00001310'), + ('\U00001312', '\U00001315'), + ('\U00001318', '\U0000135a'), + ('\U00001380', '\U0000138f'), + ('\U000013a0', '\U000013f4'), + ('\U00001401', '\U0000166c'), + ('\U0000166f', '\U0000167f'), + ('\U00001681', 
'\U0000169a'), + ('\U000016a0', '\U000016ea'), + ('\U00001700', '\U0000170c'), + ('\U0000170e', '\U00001711'), + ('\U00001720', '\U00001731'), + ('\U00001740', '\U00001751'), + ('\U00001760', '\U0000176c'), + ('\U0000176e', '\U00001770'), + ('\U00001780', '\U000017b3'), + ('\U000017d7', '\U000017d7'), + ('\U000017dc', '\U000017dc'), + ('\U00001820', '\U00001877'), + ('\U00001880', '\U000018a8'), + ('\U000018aa', '\U000018aa'), + ('\U000018b0', '\U000018f5'), + ('\U00001900', '\U0000191c'), + ('\U00001950', '\U0000196d'), + ('\U00001970', '\U00001974'), + ('\U00001980', '\U000019ab'), + ('\U000019c1', '\U000019c7'), + ('\U00001a00', '\U00001a16'), + ('\U00001a20', '\U00001a54'), + ('\U00001aa7', '\U00001aa7'), + ('\U00001b05', '\U00001b33'), + ('\U00001b45', '\U00001b4b'), + ('\U00001b83', '\U00001ba0'), + ('\U00001bae', '\U00001baf'), + ('\U00001bba', '\U00001be5'), + ('\U00001c00', '\U00001c23'), + ('\U00001c4d', '\U00001c4f'), + ('\U00001c5a', '\U00001c7d'), + ('\U00001ce9', '\U00001cec'), + ('\U00001cee', '\U00001cf1'), + ('\U00001cf5', '\U00001cf6'), + ('\U00001d00', '\U00001dbf'), + ('\U00001e00', '\U00001f15'), + ('\U00001f18', '\U00001f1d'), + ('\U00001f20', '\U00001f45'), + ('\U00001f48', '\U00001f4d'), + ('\U00001f50', '\U00001f57'), + ('\U00001f59', '\U00001f59'), + ('\U00001f5b', '\U00001f5b'), + ('\U00001f5d', '\U00001f5d'), + ('\U00001f5f', '\U00001f7d'), + ('\U00001f80', '\U00001fb4'), + ('\U00001fb6', '\U00001fbc'), + ('\U00001fbe', '\U00001fbe'), + ('\U00001fc2', '\U00001fc4'), + ('\U00001fc6', '\U00001fcc'), + ('\U00001fd0', '\U00001fd3'), + ('\U00001fd6', '\U00001fdb'), + ('\U00001fe0', '\U00001fec'), + ('\U00001ff2', '\U00001ff4'), + ('\U00001ff6', '\U00001ffc'), + ('\U00002071', '\U00002071'), + ('\U0000207f', '\U0000207f'), + ('\U00002090', '\U0000209c'), + ('\U00002102', '\U00002102'), + ('\U00002107', '\U00002107'), + ('\U0000210a', '\U00002113'), + ('\U00002115', '\U00002115'), + ('\U00002119', '\U0000211d'), + ('\U00002124', '\U00002124'), 
+ ('\U00002126', '\U00002126'), + ('\U00002128', '\U00002128'), + ('\U0000212a', '\U0000212d'), + ('\U0000212f', '\U00002139'), + ('\U0000213c', '\U0000213f'), + ('\U00002145', '\U00002149'), + ('\U0000214e', '\U0000214e'), + ('\U00002183', '\U00002184'), + ('\U00002c00', '\U00002c2e'), + ('\U00002c30', '\U00002c5e'), + ('\U00002c60', '\U00002ce4'), + ('\U00002ceb', '\U00002cee'), + ('\U00002cf2', '\U00002cf3'), + ('\U00002d00', '\U00002d25'), + ('\U00002d27', '\U00002d27'), + ('\U00002d2d', '\U00002d2d'), + ('\U00002d30', '\U00002d67'), + ('\U00002d6f', '\U00002d6f'), + ('\U00002d80', '\U00002d96'), + ('\U00002da0', '\U00002da6'), + ('\U00002da8', '\U00002dae'), + ('\U00002db0', '\U00002db6'), + ('\U00002db8', '\U00002dbe'), + ('\U00002dc0', '\U00002dc6'), + ('\U00002dc8', '\U00002dce'), + ('\U00002dd0', '\U00002dd6'), + ('\U00002dd8', '\U00002dde'), + ('\U00002e2f', '\U00002e2f'), + ('\U00003005', '\U00003006'), + ('\U00003031', '\U00003035'), + ('\U0000303b', '\U0000303c'), + ('\U00003041', '\U00003096'), + ('\U0000309d', '\U0000309f'), + ('\U000030a1', '\U000030fa'), + ('\U000030fc', '\U000030ff'), + ('\U00003105', '\U0000312d'), + ('\U00003131', '\U0000318e'), + ('\U000031a0', '\U000031ba'), + ('\U000031f0', '\U000031ff'), + ('\U00003400', '\U00003400'), + ('\U00004db5', '\U00004db5'), + ('\U00004e00', '\U00004e00'), + ('\U00009fcc', '\U00009fcc'), + ('\U0000a000', '\U0000a48c'), + ('\U0000a4d0', '\U0000a4fd'), + ('\U0000a500', '\U0000a60c'), + ('\U0000a610', '\U0000a61f'), + ('\U0000a62a', '\U0000a62b'), + ('\U0000a640', '\U0000a66e'), + ('\U0000a67f', '\U0000a697'), + ('\U0000a6a0', '\U0000a6e5'), + ('\U0000a717', '\U0000a71f'), + ('\U0000a722', '\U0000a788'), + ('\U0000a78b', '\U0000a78e'), + ('\U0000a790', '\U0000a793'), + ('\U0000a7a0', '\U0000a7aa'), + ('\U0000a7f8', '\U0000a801'), + ('\U0000a803', '\U0000a805'), + ('\U0000a807', '\U0000a80a'), + ('\U0000a80c', '\U0000a822'), + ('\U0000a840', '\U0000a873'), + ('\U0000a882', '\U0000a8b3'), + 
('\U0000a8f2', '\U0000a8f7'), + ('\U0000a8fb', '\U0000a8fb'), + ('\U0000a90a', '\U0000a925'), + ('\U0000a930', '\U0000a946'), + ('\U0000a960', '\U0000a97c'), + ('\U0000a984', '\U0000a9b2'), + ('\U0000a9cf', '\U0000a9cf'), + ('\U0000aa00', '\U0000aa28'), + ('\U0000aa40', '\U0000aa42'), + ('\U0000aa44', '\U0000aa4b'), + ('\U0000aa60', '\U0000aa76'), + ('\U0000aa7a', '\U0000aa7a'), + ('\U0000aa80', '\U0000aaaf'), + ('\U0000aab1', '\U0000aab1'), + ('\U0000aab5', '\U0000aab6'), + ('\U0000aab9', '\U0000aabd'), + ('\U0000aac0', '\U0000aac0'), + ('\U0000aac2', '\U0000aac2'), + ('\U0000aadb', '\U0000aadd'), + ('\U0000aae0', '\U0000aaea'), + ('\U0000aaf2', '\U0000aaf4'), + ('\U0000ab01', '\U0000ab06'), + ('\U0000ab09', '\U0000ab0e'), + ('\U0000ab11', '\U0000ab16'), + ('\U0000ab20', '\U0000ab26'), + ('\U0000ab28', '\U0000ab2e'), + ('\U0000abc0', '\U0000abe2'), + ('\U0000ac00', '\U0000ac00'), + ('\U0000d7a3', '\U0000d7a3'), + ('\U0000d7b0', '\U0000d7c6'), + ('\U0000d7cb', '\U0000d7fb'), + ('\U0000f900', '\U0000fa6d'), + ('\U0000fa70', '\U0000fad9'), + ('\U0000fb00', '\U0000fb06'), + ('\U0000fb13', '\U0000fb17'), + ('\U0000fb1d', '\U0000fb1d'), + ('\U0000fb1f', '\U0000fb28'), + ('\U0000fb2a', '\U0000fb36'), + ('\U0000fb38', '\U0000fb3c'), + ('\U0000fb3e', '\U0000fb3e'), + ('\U0000fb40', '\U0000fb41'), + ('\U0000fb43', '\U0000fb44'), + ('\U0000fb46', '\U0000fbb1'), + ('\U0000fbd3', '\U0000fd3d'), + ('\U0000fd50', '\U0000fd8f'), + ('\U0000fd92', '\U0000fdc7'), + ('\U0000fdf0', '\U0000fdfb'), + ('\U0000fe70', '\U0000fe74'), + ('\U0000fe76', '\U0000fefc'), + ('\U0000ff21', '\U0000ff3a'), + ('\U0000ff41', '\U0000ff5a'), + ('\U0000ff66', '\U0000ffbe'), + ('\U0000ffc2', '\U0000ffc7'), + ('\U0000ffca', '\U0000ffcf'), + ('\U0000ffd2', '\U0000ffd7'), + ('\U0000ffda', '\U0000ffdc'), + ('\U00010000', '\U0001000b'), + ('\U0001000d', '\U00010026'), + ('\U00010028', '\U0001003a'), + ('\U0001003c', '\U0001003d'), + ('\U0001003f', '\U0001004d'), + ('\U00010050', '\U0001005d'), + ('\U00010080', 
'\U000100fa'), + ('\U00010280', '\U0001029c'), + ('\U000102a0', '\U000102d0'), + ('\U00010300', '\U0001031e'), + ('\U00010330', '\U00010340'), + ('\U00010342', '\U00010349'), + ('\U00010380', '\U0001039d'), + ('\U000103a0', '\U000103c3'), + ('\U000103c8', '\U000103cf'), + ('\U00010400', '\U0001049d'), + ('\U00010800', '\U00010805'), + ('\U00010808', '\U00010808'), + ('\U0001080a', '\U00010835'), + ('\U00010837', '\U00010838'), + ('\U0001083c', '\U0001083c'), + ('\U0001083f', '\U00010855'), + ('\U00010900', '\U00010915'), + ('\U00010920', '\U00010939'), + ('\U00010980', '\U000109b7'), + ('\U000109be', '\U000109bf'), + ('\U00010a00', '\U00010a00'), + ('\U00010a10', '\U00010a13'), + ('\U00010a15', '\U00010a17'), + ('\U00010a19', '\U00010a33'), + ('\U00010a60', '\U00010a7c'), + ('\U00010b00', '\U00010b35'), + ('\U00010b40', '\U00010b55'), + ('\U00010b60', '\U00010b72'), + ('\U00010c00', '\U00010c48'), + ('\U00011003', '\U00011037'), + ('\U00011083', '\U000110af'), + ('\U000110d0', '\U000110e8'), + ('\U00011103', '\U00011126'), + ('\U00011183', '\U000111b2'), + ('\U000111c1', '\U000111c4'), + ('\U00011680', '\U000116aa'), + ('\U00012000', '\U0001236e'), + ('\U00013000', '\U0001342e'), + ('\U00016800', '\U00016a38'), + ('\U00016f00', '\U00016f44'), + ('\U00016f50', '\U00016f50'), + ('\U00016f93', '\U00016f9f'), + ('\U0001b000', '\U0001b001'), + ('\U0001d400', '\U0001d454'), + ('\U0001d456', '\U0001d49c'), + ('\U0001d49e', '\U0001d49f'), + ('\U0001d4a2', '\U0001d4a2'), + ('\U0001d4a5', '\U0001d4a6'), + ('\U0001d4a9', '\U0001d4ac'), + ('\U0001d4ae', '\U0001d4b9'), + ('\U0001d4bb', '\U0001d4bb'), + ('\U0001d4bd', '\U0001d4c3'), + ('\U0001d4c5', '\U0001d505'), + ('\U0001d507', '\U0001d50a'), + ('\U0001d50d', '\U0001d514'), + ('\U0001d516', '\U0001d51c'), + ('\U0001d51e', '\U0001d539'), + ('\U0001d53b', '\U0001d53e'), + ('\U0001d540', '\U0001d544'), + ('\U0001d546', '\U0001d546'), + ('\U0001d54a', '\U0001d550'), + ('\U0001d552', '\U0001d6a5'), + ('\U0001d6a8', '\U0001d6c0'), 
+ ('\U0001d6c2', '\U0001d6da'), + ('\U0001d6dc', '\U0001d6fa'), + ('\U0001d6fc', '\U0001d714'), + ('\U0001d716', '\U0001d734'), + ('\U0001d736', '\U0001d74e'), + ('\U0001d750', '\U0001d76e'), + ('\U0001d770', '\U0001d788'), + ('\U0001d78a', '\U0001d7a8'), + ('\U0001d7aa', '\U0001d7c2'), + ('\U0001d7c4', '\U0001d7cb'), + ('\U0001ee00', '\U0001ee03'), + ('\U0001ee05', '\U0001ee1f'), + ('\U0001ee21', '\U0001ee22'), + ('\U0001ee24', '\U0001ee24'), + ('\U0001ee27', '\U0001ee27'), + ('\U0001ee29', '\U0001ee32'), + ('\U0001ee34', '\U0001ee37'), + ('\U0001ee39', '\U0001ee39'), + ('\U0001ee3b', '\U0001ee3b'), + ('\U0001ee42', '\U0001ee42'), + ('\U0001ee47', '\U0001ee47'), + ('\U0001ee49', '\U0001ee49'), + ('\U0001ee4b', '\U0001ee4b'), + ('\U0001ee4d', '\U0001ee4f'), + ('\U0001ee51', '\U0001ee52'), + ('\U0001ee54', '\U0001ee54'), + ('\U0001ee57', '\U0001ee57'), + ('\U0001ee59', '\U0001ee59'), + ('\U0001ee5b', '\U0001ee5b'), + ('\U0001ee5d', '\U0001ee5d'), + ('\U0001ee5f', '\U0001ee5f'), + ('\U0001ee61', '\U0001ee62'), + ('\U0001ee64', '\U0001ee64'), + ('\U0001ee67', '\U0001ee6a'), + ('\U0001ee6c', '\U0001ee72'), + ('\U0001ee74', '\U0001ee77'), + ('\U0001ee79', '\U0001ee7c'), + ('\U0001ee7e', '\U0001ee7e'), + ('\U0001ee80', '\U0001ee89'), + ('\U0001ee8b', '\U0001ee9b'), + ('\U0001eea1', '\U0001eea3'), + ('\U0001eea5', '\U0001eea9'), + ('\U0001eeab', '\U0001eebb'), + ('\U00020000', '\U00020000'), + ('\U0002a6d6', '\U0002a6d6'), + ('\U0002a700', '\U0002a700'), + ('\U0002b734', '\U0002b734'), + ('\U0002b740', '\U0002b740'), + ('\U0002b81d', '\U0002b81d'), + ('\U0002f800', '\U0002fa1d') +]; + diff --git a/src/libregex/vm.rs b/src/libregex/vm.rs new file mode 100644 index 0000000000000..6058ba6bf9210 --- /dev/null +++ b/src/libregex/vm.rs @@ -0,0 +1,587 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// FIXME: Currently, the VM simulates an NFA. It would be nice to have another +// VM that simulates a DFA. +// +// According to Russ Cox[1], a DFA performs better than an NFA, principally +// because it reuses states previously computed by the machine *and* doesn't +// keep track of capture groups. The drawback of a DFA (aside from its +// complexity) is that it can't accurately return the locations of submatches. +// The NFA *can* do that. (This is my understanding anyway.) +// +// Cox suggests that a DFA ought to be used to answer "does this match" and +// "where does it match" questions. (In the latter, the starting position of +// the match is computed by executing the regex backwards.) Cox also suggests +// that a DFA should be run when asking "where are the submatches", which can +// 1) quickly answer "no" if there's no match and 2) discover the substring +// that matches, which means running the NFA on smaller input. +// +// Currently, the NFA simulation implemented below does some dirty tricks to +// avoid tracking capture groups when they aren't needed (which only works +// for 'is_match', not 'find'). This is a half-measure, but does provide some +// perf improvement. +// +// AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
+// +// [1] - http://swtch.com/~rsc/regex/regex3.html + +use std::cmp; +use std::mem; +use std::slice::MutableVector; +use compile::{ + Program, + Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, + Save, Jump, Split, +}; +use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; +use parse::unicode::PERLW; + +pub type CaptureLocs = Vec<Option<uint>>; + +/// Indicates the type of match to be performed by the VM. +pub enum MatchKind { + /// Only checks if a match exists or not. Does not return location. + Exists, + /// Returns the start and end indices of the entire match in the input + /// given. + Location, + /// Returns the start and end indices of each submatch in the input given. + Submatches, +} + +/// Runs an NFA simulation on the compiled expression given on the search text +/// `input`. The search begins at byte index `start` and ends at byte index +/// `end`. (The range is specified here so that zero-width assertions will work +/// correctly when searching for successive non-overlapping matches.) +/// +/// The `which` parameter indicates what kind of capture information the caller +/// wants. There are three choices: match existence only, the location of the +/// entire match or the locations of the entire match in addition to the +/// locations of each submatch. +pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str, + start: uint, end: uint) -> CaptureLocs { + Nfa { + which: which, + prog: prog, + input: input, + start: start, + end: end, + ic: 0, + chars: CharReader::new(input), + }.run() +} + +struct Nfa<'r, 't> { + which: MatchKind, + prog: &'r Program, + input: &'t str, + start: uint, + end: uint, + ic: uint, + chars: CharReader<'t>, +} + +/// Indicates the next action to take after a single non-empty instruction +/// is processed. +pub enum StepState { + /// This is returned if and only if a Match instruction is reached and + /// we only care about the existence of a match. 
It instructs the VM to + /// quit early. + StepMatchEarlyReturn, + /// Indicates that a match was found. Thus, the rest of the states in the + /// *current* queue should be dropped (i.e., leftmost-first semantics). + /// States in the "next" queue can still be processed. + StepMatch, + /// No match was found. Continue with the next state in the queue. + StepContinue, +} + +impl<'r, 't> Nfa<'r, 't> { + fn run(&mut self) -> CaptureLocs { + let ncaps = match self.which { + Exists => 0, + Location => 1, + Submatches => self.prog.num_captures(), + }; + let mut matched = false; + let ninsts = self.prog.insts.len(); + let mut clist = &mut Threads::new(self.which, ninsts, ncaps); + let mut nlist = &mut Threads::new(self.which, ninsts, ncaps); + + let mut groups = Vec::from_elem(ncaps * 2, None); + + // Determine if the expression starts with a '^' so we can avoid + // simulating .*? + // Make sure multi-line mode isn't enabled for it, otherwise we can't + // drop the initial .*? + let prefix_anchor = + match *self.prog.insts.get(1) { + EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, + _ => false, + }; + + self.ic = self.start; + let mut next_ic = self.chars.set(self.start); + while self.ic <= self.end { + if clist.size == 0 { + // We have a match and we're done exploring alternatives. + // Time to quit. + if matched { + break + } + + // If there are no threads to try, then we'll have to start + // over at the beginning of the regex. + // BUT, if there's a literal prefix for the program, try to + // jump ahead quickly. If it can't be found, then we can bail + // out early. + if self.prog.prefix.len() > 0 && clist.size == 0 { + let needle = self.prog.prefix.as_slice().as_bytes(); + let haystack = self.input.as_bytes().slice_from(self.ic); + match find_prefix(needle, haystack) { + None => break, + Some(i) => { + self.ic += i; + next_ic = self.chars.set(self.ic); + } + } + } + } + + // This simulates a preceding '.*?' 
for every regex by adding + // a state starting at the current position in the input for the + // beginning of the program only if we don't already have a match. + if clist.size == 0 || (!prefix_anchor && !matched) { + self.add(clist, 0, groups.as_mut_slice()) + } + + // Now we try to read the next character. + // As a result, the 'step' method will look at the previous + // character. + self.ic = next_ic; + next_ic = self.chars.advance(); + + let mut i = 0; + while i < clist.size { + let pc = clist.pc(i); + let step_state = self.step(groups.as_mut_slice(), nlist, + clist.groups(i), pc); + match step_state { + StepMatchEarlyReturn => return vec![Some(0), Some(0)], + StepMatch => { matched = true; clist.empty() }, + StepContinue => {}, + } + i += 1; + } + mem::swap(&mut clist, &mut nlist); + nlist.empty(); + } + match self.which { + Exists if matched => vec![Some(0), Some(0)], + Exists => vec![None, None], + Location | Submatches => groups, + } + } + + fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads, + caps: &mut [Option<uint>], pc: uint) + -> StepState { + match *self.prog.insts.get(pc) { + Match => { + match self.which { + Exists => { + return StepMatchEarlyReturn + } + Location => { + groups[0] = caps[0]; + groups[1] = caps[1]; + return StepMatch + } + Submatches => { + for (slot, val) in groups.mut_iter().zip(caps.iter()) { + *slot = *val; + } + return StepMatch + } + } + } + OneChar(c, flags) => { + if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { + self.add(nlist, pc+1, caps); + } + } + CharClass(ref ranges, flags) => { + if self.chars.prev.is_some() { + let c = self.chars.prev.unwrap(); + let negate = flags & FLAG_NEGATED > 0; + let casei = flags & FLAG_NOCASE > 0; + let found = ranges.as_slice(); + let found = found.bsearch(|&rc| class_cmp(casei, c, rc)); + let found = found.is_some(); + if (found && !negate) || (!found && negate) { + self.add(nlist, pc+1, caps); + } + } + } + Any(flags) => { + if flags & FLAG_DOTNL > 0 + || 
!self.char_eq(false, self.chars.prev, '\n') { + self.add(nlist, pc+1, caps) + } + } + EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) + | Save(_) | Jump(_) | Split(_, _) => {}, + } + StepContinue + } + + fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) { + if nlist.contains(pc) { + return + } + // We have to add states to the threads list even if they're empty. + // TL;DR - It prevents cycles. + // If we didn't care about cycles, we'd *only* add threads that + // correspond to non-jumping instructions (OneChar, Any, Match, etc.). + // But, it's possible for valid regexes (like '(a*)*') to result in + // a cycle in the instruction list. e.g., we'd keep chasing the Split + // instructions forever. + // So we add these instructions to our thread queue, but in the main + // VM loop (see the no-op arm in `step`), we simply ignore them. + // Adding them to the queue prevents them from being revisited so we + // can avoid cycles (and the inevitable stack overflow). + // + // We make a minor optimization by indicating that the state is "empty" + // so that its capture groups are not filled in.
+ match *self.prog.insts.get(pc) { + EmptyBegin(flags) => { + let multi = flags & FLAG_MULTI > 0; + nlist.add(pc, groups, true); + if self.chars.is_begin() + || (multi && self.char_is(self.chars.prev, '\n')) { + self.add(nlist, pc + 1, groups) + } + } + EmptyEnd(flags) => { + let multi = flags & FLAG_MULTI > 0; + nlist.add(pc, groups, true); + if self.chars.is_end() + || (multi && self.char_is(self.chars.cur, '\n')) { + self.add(nlist, pc + 1, groups) + } + } + EmptyWordBoundary(flags) => { + nlist.add(pc, groups, true); + if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { + self.add(nlist, pc + 1, groups) + } + } + Save(slot) => { + nlist.add(pc, groups, true); + match self.which { + Location if slot <= 1 => { + let old = groups[slot]; + groups[slot] = Some(self.ic); + self.add(nlist, pc + 1, groups); + groups[slot] = old; + } + Submatches => { + let old = groups[slot]; + groups[slot] = Some(self.ic); + self.add(nlist, pc + 1, groups); + groups[slot] = old; + } + Exists | Location => self.add(nlist, pc + 1, groups), + } + } + Jump(to) => { + nlist.add(pc, groups, true); + self.add(nlist, to, groups) + } + Split(x, y) => { + nlist.add(pc, groups, true); + self.add(nlist, x, groups); + self.add(nlist, y, groups); + } + Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { + nlist.add(pc, groups, false); + } + } + } + + // FIXME: For case insensitive comparisons, it uses the uppercase + // character and tests for equality. IIUC, this does not generalize to + // all of Unicode. I believe we need to check the entire fold for each + // character. This will be easy to add if and when it gets added to Rust's + // standard library. 
+ #[inline] + fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool { + match textc { + None => false, + Some(textc) => { + regc == textc + || (casei && regc.to_uppercase() == textc.to_uppercase()) + } + } + } + + #[inline] + fn char_is(&self, textc: Option<char>, regc: char) -> bool { + textc == Some(regc) + } +} + +/// CharReader is responsible for maintaining a "previous" and a "current" +/// character. This one-character lookahead is necessary for assertions that +/// look one character before or after the current position. +pub struct CharReader<'t> { + /// The previous character read. It is None when there is no preceding + /// character (e.g. when positioned at the start of the input). + pub prev: Option<char>, + /// The current character. It is None at or past the end of the input. + pub cur: Option<char>, + input: &'t str, + next: uint, +} + +impl<'t> CharReader<'t> { + /// Returns a new CharReader that advances through the input given. + /// Note that a CharReader has no knowledge of the range in which to search + /// the input. + pub fn new(input: &'t str) -> CharReader<'t> { + CharReader { + prev: None, + cur: None, + input: input, + next: 0, + } + } + + /// Sets the previous and current character given any arbitrary byte + /// index (at a unicode codepoint boundary). Returns the byte index just past the current character, or `input.len() + 1` when there is no current character. + #[inline] + pub fn set(&mut self, ic: uint) -> uint { + self.prev = None; + self.cur = None; + self.next = 0; + + if self.input.len() == 0 { + return 1 + } + if ic > 0 { + let i = cmp::min(ic, self.input.len()); + let prev = self.input.char_range_at_reverse(i); + self.prev = Some(prev.ch); + } + if ic < self.input.len() { + let cur = self.input.char_range_at(ic); + self.cur = Some(cur.ch); + self.next = cur.next; + self.next + } else { + self.input.len() + 1 + } + } + + /// Does the same as `set`, except it always advances to the next + /// character in the input (and therefore does half as many UTF8 decodings).
+ #[inline] + pub fn advance(&mut self) -> uint { + self.prev = self.cur; + if self.next < self.input.len() { + let cur = self.input.char_range_at(self.next); + self.cur = Some(cur.ch); + self.next = cur.next; + } else { + self.cur = None; + self.next = self.input.len() + 1; + } + self.next + } + + /// Returns true if and only if this is the beginning of the input + /// (ignoring the range of the input to search). + #[inline] + pub fn is_begin(&self) -> bool { self.prev.is_none() } + + /// Returns true if and only if this is the end of the input + /// (ignoring the range of the input to search). + #[inline] + pub fn is_end(&self) -> bool { self.cur.is_none() } + + /// Returns true if and only if the current position is a word boundary. + /// (Ignoring the range of the input to search.) + pub fn is_word_boundary(&self) -> bool { + if self.is_begin() { + return is_word(self.cur) + } + if self.is_end() { + return is_word(self.prev) + } + (is_word(self.cur) && !is_word(self.prev)) + || (is_word(self.prev) && !is_word(self.cur)) + } +} + +struct Thread { + pc: uint, + groups: Vec<Option<uint>>, +} + +struct Threads { + which: MatchKind, + queue: Vec<Thread>, + sparse: Vec<uint>, + size: uint, +} + +impl Threads { + // This uses a sparse set to provide constant time lookup + // for threads in the queue. A queue of threads is + // allocated once with maximal size when the VM initializes and is reused + // throughout execution. That is, there should be zero allocation during + // the execution of a VM. + // + // See http://research.swtch.com/sparse for the details.
+ fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads { + Threads { + which: which, + queue: Vec::from_fn(num_insts, |_| { + Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) } + }), + sparse: Vec::from_elem(num_insts, 0u), + size: 0, + } + } + + fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) { + let t = self.queue.get_mut(self.size); + t.pc = pc; + match (empty, self.which) { + (_, Exists) | (true, _) => {}, + (false, Location) => { + *t.groups.get_mut(0) = groups[0]; + *t.groups.get_mut(1) = groups[1]; + } + (false, Submatches) => { + for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { + *slot = *val; + } + } + } + *self.sparse.get_mut(pc) = self.size; + self.size += 1; + } + + #[inline] + fn contains(&self, pc: uint) -> bool { + let s = *self.sparse.get(pc); + s < self.size && self.queue.get(s).pc == pc + } + + #[inline] + fn empty(&mut self) { + self.size = 0; + } + + #[inline] + fn pc(&self, i: uint) -> uint { + self.queue.get(i).pc + } + + #[inline] + fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] { + self.queue.get_mut(i).groups.as_mut_slice() + } +} + +/// Returns true if the character is a word character, according to the +/// (Unicode friendly) Perl character class '\w'. `None` is never a word character. +/// Note that this is only used for testing word boundaries. The actual '\w' +/// is encoded as a CharClass instruction. +pub fn is_word(c: Option<char>) -> bool { + let c = match c { + None => return false, + Some(c) => c, + }; + // Try the common ASCII case before invoking binary search. + match c { + '_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true, + _ => PERLW.bsearch(|&(start, end)| { + if c >= start && c <= end { + Equal + } else if start > c { + Greater + } else { + Less + } + }).is_some() + } +} + +/// Given a character and a single character class range, return an ordering +/// indicating whether the character is less than the start of the range, +/// in the range (inclusive) or greater than the end of the range.
+/// +/// If `casei` is `true`, then this ordering is computed case insensitively. +/// +/// This function is meant to be used with a binary search: it returns `Equal` when the character is inside the range, `Greater` when the range starts after the character and `Less` otherwise. +#[inline] +fn class_cmp(casei: bool, mut textc: char, + (mut start, mut end): (char, char)) -> Ordering { + if casei { + // FIXME: This is pretty ridiculous. All of this case conversion + // can be moved outside this function: + // 1) textc should be uppercased outside the bsearch. + // 2) the character class itself should be uppercased either in the + // parser or the compiler. + // FIXME: This is too simplistic for correct Unicode support. + // See also: char_eq + textc = textc.to_uppercase(); + start = start.to_uppercase(); + end = end.to_uppercase(); + } + if textc >= start && textc <= end { + Equal + } else if start > textc { + Greater + } else { + Less + } +} + +/// Returns the starting location of the first occurrence of `needle` in `haystack`. +/// If `needle` is empty or is not in `haystack`, then `None` is returned. +/// +/// Note that this is using a naive substring algorithm. +#[inline] +pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> { + let (hlen, nlen) = (haystack.len(), needle.len()); + if nlen > hlen || nlen == 0 { + return None + } + let mut hayi = 0u; + 'HAYSTACK: loop { + if hayi > hlen - nlen { + break + } + let mut nedi = 0; + while nedi < nlen { + if haystack[hayi+nedi] != needle[nedi] { + hayi += 1; + continue 'HAYSTACK + } + nedi += 1; + } + return Some(hayi) + } + None +} diff --git a/src/libregex_macros/lib.rs b/src/libregex_macros/lib.rs new file mode 100644 index 0000000000000..72e00deba4d9c --- /dev/null +++ b/src/libregex_macros/lib.rs @@ -0,0 +1,684 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT.
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This crate provides the `regex!` macro. Its use is documented in the +//! `regex` crate. + +#![crate_id = "regex_macros#0.11-pre"] +#![crate_type = "dylib"] +#![experimental] +#![license = "MIT/ASL2"] +#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", + html_favicon_url = "http://www.rust-lang.org/favicon.ico", + html_root_url = "http://static.rust-lang.org/doc/master")] + +#![feature(macro_registrar, managed_boxes, quote)] + +extern crate regex; +extern crate syntax; + +use syntax::ast; +use syntax::codemap; +use syntax::ext::base::{ + SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult, + NormalTT, BasicMacroExpander, +}; +use syntax::parse; +use syntax::parse::token; +use syntax::print::pprust; + +use regex::Regex; +use regex::native::{ + OneChar, CharClass, Any, Save, Jump, Split, + Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, + Program, Dynamic, Native, + FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, +}; + +/// For the `regex!` syntax extension. Do not use. +#[macro_registrar] +#[doc(hidden)] +pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) { + let expander = ~BasicMacroExpander { expander: native, span: None }; + register(token::intern("regex"), NormalTT(expander, None)) +} + +/// Generates specialized code for the Pike VM for a particular regular +/// expression. +/// +/// There are two primary differences between the code generated here and the +/// general code in vm.rs. +/// +/// 1. All heap allocation is removed. Sized vector types are used instead. +/// Care must be taken to make sure that these vectors are not copied +/// gratuitously. (If you're not sure, run the benchmarks. 
They will yell +/// at you if you do.) +/// 2. The main `match instruction { ... }` expressions are replaced with more +/// direct `match pc { ... }`. The generators can be found in +/// `step_insts` and `add_insts`. +/// +/// Other more minor changes include eliding code when possible (although this +/// isn't completely thorough at the moment), and translating character class +/// matching from using a binary search to a simple `match` expression (see +/// `match_class`). +/// +/// It is strongly recommended to read the dynamic implementation in vm.rs +/// first before trying to understand the code generator. The implementation +/// strategy is identical and vm.rs has comments and will be easier to follow. +fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) + -> ~MacResult { + let regex = match parse(cx, tts) { + Some(r) => r, + // error is logged in 'parse' with cx.span_err + None => return DummyResult::any(sp), + }; + let re = match Regex::new(regex.to_owned()) { + Ok(re) => re, + Err(err) => { + cx.span_err(sp, err.to_str()); + return DummyResult::any(sp) + } + }; + let prog = match re.p { + Dynamic(ref prog) => prog.clone(), + Native(_) => unreachable!(), + }; + + let mut gen = NfaGen { + cx: &*cx, sp: sp, prog: prog, + names: re.names.clone(), original: re.original.clone(), + }; + MacExpr::new(gen.code()) +} + +struct NfaGen<'a> { + cx: &'a ExtCtxt<'a>, + sp: codemap::Span, + prog: Program, + names: ~[Option<~str>], + original: ~str, +} + +impl<'a> NfaGen<'a> { + fn code(&mut self) -> @ast::Expr { + // Most or all of the following things are used in the quasiquoted + // expression returned. 
+ let num_cap_locs = 2 * self.prog.num_captures(); + let num_insts = self.prog.insts.len(); + let cap_names = self.vec_expr(self.names, + |cx, name| match name { + &Some(ref name) => { + let name = name.as_slice(); + quote_expr!(cx, Some(~$name)) + } + &None => quote_expr!(cx, None), + } + ); + let prefix_anchor = + match self.prog.insts.as_slice()[1] { + EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, + _ => false, + }; + let init_groups = self.vec_from_fn(num_cap_locs, + |cx| quote_expr!(cx, None)); + let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(), + |cx, b| quote_expr!(cx, $b)); + let check_prefix = self.check_prefix(); + let step_insts = self.step_insts(); + let add_insts = self.add_insts(); + let regex = self.original.as_slice(); + + quote_expr!(self.cx, { +fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, + start: uint, end: uint) -> Vec<Option<uint>> { + #![allow(unused_imports)] + use regex::native::{ + MatchKind, Exists, Location, Submatches, + StepState, StepMatchEarlyReturn, StepMatch, StepContinue, + CharReader, find_prefix, + }; + + return Nfa { + which: which, + input: input, + ic: 0, + chars: CharReader::new(input), + }.run(start, end); + + type Captures = [Option<uint>, ..$num_cap_locs]; + + struct Nfa<'t> { + which: MatchKind, + input: &'t str, + ic: uint, + chars: CharReader<'t>, + } + + impl<'t> Nfa<'t> { + #[allow(unused_variable)] + fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> { + let mut matched = false; + let prefix_bytes: &[u8] = &$prefix_bytes; + let mut clist = &mut Threads::new(self.which); + let mut nlist = &mut Threads::new(self.which); + + let mut groups = $init_groups; + + self.ic = start; + let mut next_ic = self.chars.set(start); + while self.ic <= end { + if clist.size == 0 { + if matched { + break + } + $check_prefix + } + if clist.size == 0 || (!$prefix_anchor && !matched) { + self.add(clist, 0, &mut groups) + } + + self.ic = next_ic; + next_ic = self.chars.advance(); 
+ + let mut i = 0; + while i < clist.size { + let pc = clist.pc(i); + let step_state = self.step(&mut groups, nlist, + clist.groups(i), pc); + match step_state { + StepMatchEarlyReturn => + return vec![Some(0u), Some(0u)], + StepMatch => { matched = true; clist.empty() }, + StepContinue => {}, + } + i += 1; + } + ::std::mem::swap(&mut clist, &mut nlist); + nlist.empty(); + } + match self.which { + Exists if matched => vec![Some(0u), Some(0u)], + Exists => vec![None, None], + Location | Submatches => groups.iter().map(|x| *x).collect(), + } + } + + // Sometimes `nlist` is never used (for empty regexes). + #[allow(unused_variable)] + #[inline] + fn step(&self, groups: &mut Captures, nlist: &mut Threads, + caps: &mut Captures, pc: uint) -> StepState { + $step_insts + StepContinue + } + + fn add(&self, nlist: &mut Threads, pc: uint, + groups: &mut Captures) { + if nlist.contains(pc) { + return + } + $add_insts + } + } + + struct Thread { + pc: uint, + groups: Captures, + } + + struct Threads { + which: MatchKind, + queue: [Thread, ..$num_insts], + sparse: [uint, ..$num_insts], + size: uint, + } + + impl Threads { + fn new(which: MatchKind) -> Threads { + Threads { + which: which, + // These unsafe blocks are used for performance reasons, as they + // give us a zero-cost initialization of a sparse set. The + // trick is described in more detail here: + // http://research.swtch.com/sparse + // The idea here is to avoid initializing threads that never + // need to be initialized, particularly for larger regexes with + // a lot of instructions.
+ queue: unsafe { ::std::mem::uninit() }, + sparse: unsafe { ::std::mem::uninit() }, + size: 0, + } + } + + #[inline] + fn add(&mut self, pc: uint, groups: &Captures) { + let t = &mut self.queue[self.size]; + t.pc = pc; + match self.which { + Exists => {}, + Location => { + t.groups[0] = groups[0]; + t.groups[1] = groups[1]; + } + Submatches => { + for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { + *slot = *val; + } + } + } + self.sparse[pc] = self.size; + self.size += 1; + } + + #[inline] + fn add_empty(&mut self, pc: uint) { + self.queue[self.size].pc = pc; + self.sparse[pc] = self.size; + self.size += 1; + } + + #[inline] + fn contains(&self, pc: uint) -> bool { + let s = self.sparse[pc]; + s < self.size && self.queue[s].pc == pc + } + + #[inline] + fn empty(&mut self) { + self.size = 0; + } + + #[inline] + fn pc(&self, i: uint) -> uint { + self.queue[i].pc + } + + #[inline] + fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures { + &'r mut self.queue[i].groups + } + } +} + +::regex::Regex { + original: ~$regex, + names: ~$cap_names, + p: ::regex::native::Native(exec), +} + }) + } + + // Generates code for the `add` method, which is responsible for adding + // zero-width states to the next queue of states to visit. 
+ fn add_insts(&self) -> @ast::Expr { + let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let nextpc = pc + 1; + let body = match *inst { + EmptyBegin(flags) => { + let nl = '\n'; + let cond = + if flags & FLAG_MULTI > 0 { + quote_expr!(self.cx, + self.chars.is_begin() + || self.chars.prev == Some($nl) + ) + } else { + quote_expr!(self.cx, self.chars.is_begin()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + EmptyEnd(flags) => { + let nl = '\n'; + let cond = + if flags & FLAG_MULTI > 0 { + quote_expr!(self.cx, + self.chars.is_end() + || self.chars.cur == Some($nl) + ) + } else { + quote_expr!(self.cx, self.chars.is_end()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + EmptyWordBoundary(flags) => { + let cond = + if flags & FLAG_NEGATED > 0 { + quote_expr!(self.cx, !self.chars.is_word_boundary()) + } else { + quote_expr!(self.cx, self.chars.is_word_boundary()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + Save(slot) => { + let save = quote_expr!(self.cx, { + let old = groups[$slot]; + groups[$slot] = Some(self.ic); + self.add(nlist, $nextpc, &mut *groups); + groups[$slot] = old; + }); + let add = quote_expr!(self.cx, { + self.add(nlist, $nextpc, &mut *groups); + }); + // If this is saving a submatch location but we request + // existence or only full match location, then we can skip + // right over it every time. 
+ if slot > 1 { + quote_expr!(self.cx, { + nlist.add_empty($pc); + match self.which { + Submatches => $save, + Exists | Location => $add, + } + }) + } else { + quote_expr!(self.cx, { + nlist.add_empty($pc); + match self.which { + Submatches | Location => $save, + Exists => $add, + } + }) + } + } + Jump(to) => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + self.add(nlist, $to, &mut *groups); + }) + } + Split(x, y) => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + self.add(nlist, $x, &mut *groups); + self.add(nlist, $y, &mut *groups); + }) + } + // For Match, OneChar, CharClass, Any + _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), + }; + self.arm_inst(pc, body) + }).collect::<Vec<ast::Arm>>(); + + self.match_insts(arms) + } + + // Generates the code for the `step` method, which processes all states + // in the current queue that consume a single character. + fn step_insts(&self) -> @ast::Expr { + let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let nextpc = pc + 1; + let body = match *inst { + Match => { + quote_expr!(self.cx, { + match self.which { + Exists => { + return StepMatchEarlyReturn + } + Location => { + groups[0] = caps[0]; + groups[1] = caps[1]; + return StepMatch + } + Submatches => { + for (slot, val) in groups.mut_iter().zip(caps.iter()) { + *slot = *val; + } + return StepMatch + } + } + }) + } + OneChar(c, flags) => { + if flags & FLAG_NOCASE > 0 { + let upc = c.to_uppercase(); + quote_expr!(self.cx, { + let upc = self.chars.prev.map(|c| c.to_uppercase()); + if upc == Some($upc) { + self.add(nlist, $nextpc, caps); + } + }) + } else { + quote_expr!(self.cx, { + if self.chars.prev == Some($c) { + self.add(nlist, $nextpc, caps); + } + }) + } + } + CharClass(ref ranges, flags) => { + let negate = flags & FLAG_NEGATED > 0; + let casei = flags & FLAG_NOCASE > 0; + let get_char = + if casei { + quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase()) + } else { + quote_expr!(self.cx, self.chars.prev.unwrap()) + }; + 
let negcond = + if negate { + quote_expr!(self.cx, !found) + } else { + quote_expr!(self.cx, found) + }; + let mranges = self.match_class(casei, ranges.as_slice()); + quote_expr!(self.cx, { + if self.chars.prev.is_some() { + let c = $get_char; + let found = $mranges; + if $negcond { + self.add(nlist, $nextpc, caps); + } + } + }) + } + Any(flags) => { + if flags & FLAG_DOTNL > 0 { + quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) + } else { + let nl = '\n'; // NOTE(review): bound to a local, presumably because a char literal can't be written inline in quote_expr! -- confirm + quote_expr!(self.cx, { + if self.chars.prev != Some($nl) { + self.add(nlist, $nextpc, caps) + } + }) + } + } + // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split + _ => quote_expr!(self.cx, {}), + }; + self.arm_inst(pc, body) + }).collect::<Vec<ast::Arm>>(); + + self.match_insts(arms) + } + + // Translates a character class into a match expression. + // This avoids a binary search (and is hopefully replaced by a jump + // table). + fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr { + let mut arms = ranges.iter().map(|&(mut start, mut end)| { + if casei { + start = start.to_uppercase(); + end = end.to_uppercase(); + } + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatRange(quote_expr!(self.cx, $start), + quote_expr!(self.cx, $end)), + }), + guard: None, + body: quote_expr!(self.cx, true), + } + }).collect::<Vec<ast::Arm>>(); + + arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); + + let match_on = quote_expr!(self.cx, c); + self.dummy_expr(ast::ExprMatch(match_on, arms)) + } + + // Generates code for checking a literal prefix of the search string. + // The code is only generated if the regex *has* a literal prefix. + // Otherwise, a no-op is returned.
+ fn check_prefix(&self) -> @ast::Expr { + if self.prog.prefix.len() == 0 { + quote_expr!(self.cx, {}) + } else { + quote_expr!(self.cx, + if clist.size == 0 { + let haystack = self.input.as_bytes().slice_from(self.ic); + match find_prefix(prefix_bytes, haystack) { + None => break, + Some(i) => { + self.ic += i; + next_ic = self.chars.set(self.ic); + } + } + } + ) + } + } + + // Builds a `match pc { ... }` expression from a list of arms, specifically + // for matching the current program counter with an instruction. + // A wild-card arm is automatically added that executes a no-op. It will + // never be used, but is added to satisfy the compiler complaining about + // non-exhaustive patterns. + fn match_insts(&self, mut arms: Vec<ast::Arm>) -> @ast::Expr { + let mat_pc = quote_expr!(self.cx, pc); + arms.push(self.wild_arm_expr(quote_expr!(self.cx, {}))); + self.dummy_expr(ast::ExprMatch(mat_pc, arms)) + } + + // Creates a match arm for the instruction at `pc` with the expression + // `body`. + fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm { + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatLit(quote_expr!(self.cx, $pc)), + }), + guard: None, + body: body, + } + } + + // Creates a wild-card match arm with the expression `body`. + fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm { + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatWild, + }), + guard: None, + body: body, + } + } + + // Builds a `[a, b, .., len]` expression where each element is the result + // of executing `to_expr`. + fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr) + -> @ast::Expr { + self.vec_expr(Vec::from_elem(len, ()).as_slice(), + |cx, _| to_expr(cx)) + } + + // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` + // on each element in `xs`. 
+ fn vec_expr<T>(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr) + -> @ast::Expr { + let mut exprs = vec!(); + for x in xs.iter() { + exprs.push(to_expr(self.cx, x)) + } + let vec_exprs = self.dummy_expr(ast::ExprVec(exprs)); + quote_expr!(self.cx, $vec_exprs) + } + + // Creates an expression with a dummy node ID given an underlying + // `ast::Expr_`. + fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr { + @ast::Expr { + id: ast::DUMMY_NODE_ID, + node: e, + span: self.sp, + } + } +} + +// This trait is defined in the quote module in the syntax crate, but I +// don't think it's exported. +// Interestingly, quote_expr! only requires that a 'to_tokens' method be +// defined rather than satisfying a particular trait. +#[doc(hidden)] +trait ToTokens { + fn to_tokens(&self, cx: &ExtCtxt) -> Vec<ast::TokenTree>; +} + +impl ToTokens for char { + fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> { + vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32))) + } +} + +impl ToTokens for bool { + fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> { + let ident = token::IDENT(token::str_to_ident(self.to_str()), false); + vec!(ast::TTTok(codemap::DUMMY_SP, ident)) + } +} + +/// Looks for a single string literal and returns it. +/// Otherwise, logs an error with cx.span_err and returns None. 
+fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> { + let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), + Vec::from_slice(tts)); + let entry = cx.expand_expr(parser.parse_expr()); + let regex = match entry.node { + ast::ExprLit(lit) => { + match lit.node { + ast::LitStr(ref s, _) => s.to_str(), + _ => { + cx.span_err(entry.span, format!( + "expected string literal but got `{}`", + pprust::lit_to_str(lit))); + return None + } + } + } + _ => { + cx.span_err(entry.span, format!( + "expected string literal but got `{}`", + pprust::expr_to_str(entry))); + return None + } + }; + if !parser.eat(&token::EOF) { + cx.span_err(parser.span, "only one string literal allowed"); + return None; + } + Some(regex) +} diff --git a/src/test/bench/shootout-regex-dna.rs b/src/test/bench/shootout-regex-dna.rs new file mode 100644 index 0000000000000..0f86b8043a02d --- /dev/null +++ b/src/test/bench/shootout-regex-dna.rs @@ -0,0 +1,96 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// FIXME(#13725) windows needs fixing. 
+// ignore-win32 +// ignore-stage1 +// ignore-cross-compile #12102 + +#![feature(macro_rules, phase)] + +extern crate regex; +#[phase(syntax)]extern crate regex_macros; +extern crate sync; + +use std::io; +use regex::{NoExpand, Regex}; +use sync::Arc; + +fn count_matches(seq: &str, variant: &Regex) -> int { + let mut n = 0; + for _ in variant.find_iter(seq) { + n += 1; + } + n +} + +fn main() { + let mut rdr = if std::os::getenv("RUST_BENCH").is_some() { + let fd = io::File::open(&Path::new("shootout-k-nucleotide.data")); + ~io::BufferedReader::new(fd) as ~io::Reader + } else { + ~io::stdin() as ~io::Reader + }; + let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap()); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand("")); + let seq_arc = Arc::new(seq.clone()); // copy before it moves + let clen = seq.len(); + + let mut seqlen = sync::Future::spawn(proc() { + let substs = ~[ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs.move_iter() { + seq = re.replace_all(seq.as_slice(), NoExpand(replacement)); + } + seq.len() + }); + + let variants = ~[ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let (mut variant_strs, mut counts) = (vec!(), vec!()); + for variant in variants.move_iter() { + let seq_arc_copy = seq_arc.clone(); + variant_strs.push(variant.to_str().to_owned()); + counts.push(sync::Future::spawn(proc() { + 
count_matches(seq_arc_copy.as_slice(), &variant) + })); + } + + for (i, variant) in variant_strs.iter().enumerate() { + println!("{} {}", variant, counts.get_mut(i).get()); + } + println!(""); + println!("{}", ilen); + println!("{}", clen); + println!("{}", seqlen.get()); +} diff --git a/src/test/compile-fail/syntax-extension-regex-invalid.rs b/src/test/compile-fail/syntax-extension-regex-invalid.rs new file mode 100644 index 0000000000000..0e072dc1c060b --- /dev/null +++ b/src/test/compile-fail/syntax-extension-regex-invalid.rs @@ -0,0 +1,28 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +// FIXME(#13725) windows needs fixing. +// ignore-win32 +// ignore-stage1 + +#![feature(phase)] + +extern crate regex; +#[phase(syntax)] extern crate regex_macros; + +// Tests to make sure that `regex!` will produce a compile error when given +// an invalid regular expression. +// More exhaustive failure tests for the parser are done with the traditional +// unit testing infrastructure, since both dynamic and native regexes use the +// same parser. + +fn main() { + let _ = regex!("("); //~ ERROR Regex syntax error +}