Skip to content

Commit

Permalink
auto merge of #10621 : Florob/rust/unicode63, r=cmr
Browse files Browse the repository at this point in the history
This updates the unicode.rs file to the latest Unicode version released 2013-09-30.
  • Loading branch information
bors committed Nov 28, 2013
2 parents d2c405e + dfe38db commit 503e5df
Show file tree
Hide file tree
Showing 5 changed files with 1,479 additions and 814 deletions.
32 changes: 17 additions & 15 deletions src/etc/unicode.py
Expand Up @@ -5,7 +5,7 @@
# code covering the core properties. Since this is a pretty rare event we
# just store this out-of-line and check the unicode.rs file into git.
#
# The emitted code is "the minimum we think is necessary for libcore", that
# The emitted code is "the minimum we think is necessary for libstd", that
# is, to support basic operations of the compiler and "most nontrivial rust
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.
Expand Down Expand Up @@ -41,7 +41,7 @@ def load_unicode_data(f):
continue
[code, name, gencat, combine, bidi,
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcsae, titlecase ] = fields
old, iso, upcase, lowcase, titlecase ] = fields

code = int(code, 16)

Expand Down Expand Up @@ -89,11 +89,9 @@ def load_unicode_data(f):

return (canon_decomp, compat_decomp, gencats, combines)


def load_derived_core_properties(f):
def load_properties(f, interestingprops):
fetch(f)
derivedprops = {}
interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
props = {}
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")

Expand All @@ -118,10 +116,10 @@ def load_derived_core_properties(f):
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if prop not in derivedprops:
derivedprops[prop] = []
derivedprops[prop].append((d_lo, d_hi))
return derivedprops
if prop not in props:
props[prop] = []
props[prop].append((d_lo, d_hi))
return props

def escape_char(c):
if c <= 0xff:
Expand All @@ -144,7 +142,7 @@ def emit_bsearch_range_table(f):
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use option::None;
(do r.bsearch |&(lo,hi)| {
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
Expand Down Expand Up @@ -302,14 +300,14 @@ def emit_decomp_module(f, canon, compat, combine):
ix += 1
f.write("\n ];\n")

f.write(" pub fn canonical(c: char, i: &fn(char)) "
f.write(" pub fn canonical(c: char, i: |char|) "
+ "{ d(c, i, false); }\n\n")
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
f.write(" pub fn compatibility(c: char, i: |char|) "
+"{ d(c, i, true); }\n\n")
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n\n")
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
f.write(" fn d(c: char, i: |char|, k: bool) {\n")
f.write(" use iter::Iterator;\n");

f.write(" if c <= '\\x7f' { i(c); return; }\n")
Expand Down Expand Up @@ -376,5 +374,9 @@ def emit_decomp_module(f, canon, compat, combine):

emit_decomp_module(rf, canon_decomp, compat_decomp, combines)

derived = load_derived_core_properties("DerivedCoreProperties.txt")
derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
emit_property_module(rf, "derived_property", derived)

props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
18 changes: 8 additions & 10 deletions src/libstd/char.rs
Expand Up @@ -14,7 +14,7 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, general_category, decompose};
use unicode::{derived_property, property, general_category, decompose};
use to_str::ToStr;
use str;

Expand Down Expand Up @@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }

///
/// Indicates whether a character is in lower case, defined
/// in terms of the Unicode General Category 'Ll'
/// in terms of the Unicode Derived Core Property 'Lowercase'.
///
#[inline]
pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }

///
/// Indicates whether a character is in upper case, defined
/// in terms of the Unicode General Category 'Lu'.
/// in terms of the Unicode Derived Core Property 'Uppercase'.
///
#[inline]
pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }

///
/// Indicates whether a character is whitespace. Whitespace is defined in
/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
/// terms of the Unicode Property 'White_Space'.
///
#[inline]
pub fn is_whitespace(c: char) -> bool {
// As an optimization ASCII whitespace characters are checked separately
c == ' '
|| ('\x09' <= c && c <= '\x0d')
|| general_category::Zs(c)
|| general_category::Zl(c)
|| general_category::Zp(c)
|| property::White_Space(c)
}

///
Expand Down

0 comments on commit 503e5df

Please sign in to comment.