auto merge of #10621 : Florob/rust/unicode63, r=cmr

This updates the unicode.rs file to the latest Unicode version released 2013-09-30.
rust-lang · Nov 28, 2013 · 503e5df · 503e5df
2 parents d2c405e + dfe38db
commit 503e5df
Show file tree

Hide file tree

Showing 5 changed files with 1,479 additions and 814 deletions.
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
@@ -5,7 +5,7 @@
 # code covering the core properties. Since this is a pretty rare event we
 # just store this out-of-line and check the unicode.rs file into git.
 #
-# The emitted code is "the minimum we think is necessary for libcore", that
+# The emitted code is "the minimum we think is necessary for libstd", that
 # is, to support basic operations of the compiler and "most nontrivial rust
 # programs". It is not meant to be a complete implementation of unicode.
 # For that we recommend you use a proper binding to libicu.
@@ -41,7 +41,7 @@ def load_unicode_data(f):
             continue
         [code, name, gencat, combine, bidi,
          decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcsae, titlecase ] = fields
+         old, iso, upcase, lowcase, titlecase ] = fields
 
         code = int(code, 16)
 
@@ -89,11 +89,9 @@ def load_unicode_data(f):
 
     return (canon_decomp, compat_decomp, gencats, combines)
 
-
-def load_derived_core_properties(f):
+def load_properties(f, interestingprops):
     fetch(f)
-    derivedprops = {}
-    interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
+    props = {}
     re1 = re.compile("^([0-9A-F]+) +; (\w+)")
     re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
 
@@ -118,10 +116,10 @@ def load_derived_core_properties(f):
             continue
         d_lo = int(d_lo, 16)
         d_hi = int(d_hi, 16)
-        if prop not in derivedprops:
-            derivedprops[prop] = []
-        derivedprops[prop].append((d_lo, d_hi))
-    return derivedprops
+        if prop not in props:
+            props[prop] = []
+        props[prop].append((d_lo, d_hi))
+    return props
 
 def escape_char(c):
     if c <= 0xff:
@@ -144,7 +142,7 @@ def emit_bsearch_range_table(f):
         use cmp::{Equal, Less, Greater};
         use vec::ImmutableVector;
         use option::None;
-        (do r.bsearch |&(lo,hi)| {
+        r.bsearch(|&(lo,hi)| {
             if lo <= c && c <= hi { Equal }
             else if hi < c { Less }
             else { Greater }
@@ -302,14 +300,14 @@ def emit_decomp_module(f, canon, compat, combine):
         ix += 1
     f.write("\n    ];\n")
 
-    f.write("    pub fn canonical(c: char, i: &fn(char)) "
+    f.write("    pub fn canonical(c: char, i: |char|) "
         + "{ d(c, i, false); }\n\n")
-    f.write("    pub fn compatibility(c: char, i: &fn(char)) "
+    f.write("    pub fn compatibility(c: char, i: |char|) "
             +"{ d(c, i, true); }\n\n")
     f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
         + "        bsearch_range_value_table(c, combining_class_table)\n"
         + "    }\n\n")
-    f.write("    fn d(c: char, i: &fn(char), k: bool) {\n")
+    f.write("    fn d(c: char, i: |char|, k: bool) {\n")
     f.write("        use iter::Iterator;\n");
 
     f.write("        if c <= '\\x7f' { i(c); return; }\n")
@@ -376,5 +374,9 @@ def emit_decomp_module(f, canon, compat, combine):
 
 emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
 
-derived = load_derived_core_properties("DerivedCoreProperties.txt")
+derived = load_properties("DerivedCoreProperties.txt",
+        ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
 emit_property_module(rf, "derived_property", derived)
+
+props = load_properties("PropList.txt", ["White_Space"])
+emit_property_module(rf, "property", props)
diff --git a/src/libstd/char.rs b/src/libstd/char.rs
@@ -14,7 +14,7 @@ use cast::transmute;
 use option::{None, Option, Some};
 use iter::{Iterator, range_step};
 use str::StrSlice;
-use unicode::{derived_property, general_category, decompose};
+use unicode::{derived_property, property, general_category, decompose};
 use to_str::ToStr;
 use str;
 
@@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
 
 ///
 /// Indicates whether a character is in lower case, defined
-/// in terms of the Unicode General Category 'Ll'
+/// in terms of the Unicode Derived Core Property 'Lowercase'.
 ///
 #[inline]
-pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
+pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
 
 ///
 /// Indicates whether a character is in upper case, defined
-/// in terms of the Unicode General Category 'Lu'.
+/// in terms of the Unicode Derived Core Property 'Uppercase'.
 ///
 #[inline]
-pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
+pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
 
 ///
 /// Indicates whether a character is whitespace. Whitespace is defined in
-/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
-/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
+/// terms of the Unicode Property 'White_Space'.
 ///
 #[inline]
 pub fn is_whitespace(c: char) -> bool {
+    // As an optimization ASCII whitespace characters are checked separately
     c == ' '
         || ('\x09' <= c && c <= '\x0d')
-        || general_category::Zs(c)
-        || general_category::Zl(c)
-        || general_category::Zp(c)
+        || property::White_Space(c)
 }
 
 ///