Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement upper, lower case conversion for char #12561

Merged
merged 3 commits into from Mar 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
118 changes: 79 additions & 39 deletions src/etc/unicode.py
Expand Up @@ -19,7 +19,7 @@
# programs". It is not meant to be a complete implementation of unicode.
# For that we recommend you use a proper binding to libicu.

import fileinput, re, os, sys
import fileinput, re, os, sys, operator


def fetch(f):
Expand All @@ -35,6 +35,8 @@ def fetch(f):
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
combines = []
canon_decomp = {}
compat_decomp = {}
Expand All @@ -44,6 +46,7 @@ def load_unicode_data(f):
c_hi = 0
com_lo = 0
com_hi = 0

for line in fileinput.input(f):
fields = line.split(";")
if len(fields) != 15:
Expand All @@ -52,7 +55,17 @@ def load_unicode_data(f):
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase ] = fields

code = int(code, 16)
code_org = code
code = int(code, 16)

# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
upperlower[code] = int(lowcase, 16)

# lowercase to uppercase
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)

if decomp != "":
if decomp.startswith('<'):
Expand Down Expand Up @@ -96,7 +109,7 @@ def load_unicode_data(f):
com_lo = code
com_hi = code

return (canon_decomp, compat_decomp, gencats, combines)
return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)

def load_properties(f, interestingprops):
fetch(f)
Expand Down Expand Up @@ -147,25 +160,28 @@ def ch_prefix(ix):

def emit_bsearch_range_table(f):
f.write("""
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use option::None;
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) != None
}\n\n
fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use option::None;
r.bsearch(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) != None
}\n\n
""");

def emit_property_module(f, mod, tbl):
f.write("pub mod %s {\n" % mod)
keys = tbl.keys()
keys.sort()
emit_bsearch_range_table(f);

for cat in keys:
if cat == "Cs": continue
if cat not in ["Nd", "Nl", "No", "Cc",
"XID_Start", "XID_Continue", "Alphabetic",
"Lowercase", "Uppercase", "White_Space"]:
continue
f.write(" static %s_table : &'static [(char,char)] = &[\n" % cat)
ix = 0
for pair in tbl[cat]:
Expand All @@ -175,35 +191,55 @@ def emit_property_module(f, mod, tbl):
f.write("\n ];\n\n")

f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" bsearch_range_table(c, %s_table)\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n")


def emit_property_module_old(f, mod, tbl):
f.write("mod %s {\n" % mod)
keys = tbl.keys()
keys.sort()
for cat in keys:
f.write(" fn %s(c: char) -> bool {\n" % cat)
f.write(" ret alt c {\n")
prefix = ' '
for pair in tbl[cat]:
if pair[0] == pair[1]:
f.write(" %c %s\n" %
(prefix, escape_char(pair[0])))
else:
f.write(" %c %s to %s\n" %
(prefix,
escape_char(pair[0]),
escape_char(pair[1])))
prefix = '|'
f.write(" { true }\n")
f.write(" _ { false }\n")
f.write(" };\n")
f.write(" }\n\n")
def emit_conversions_module(f, lowerupper, upperlower):
    """Write the Rust `conversions` module to the open file object `f`.

    The module consists of a fixed block of Rust source (`to_lower`,
    `to_upper` and the binary-search helper they share), followed by the
    two case tables those functions look up.

    f          -- writable file object receiving the generated Rust code
    lowerupper -- dict mapping a lowercase code point to its uppercase one
    upperlower -- dict mapping an uppercase code point to its lowercase one
    """
    # Rust source emitted verbatim; it refers to the LuLl_table and
    # LlLu_table statics that are written right after it.
    rust_helpers = """
use cmp::{Equal, Less, Greater};
use vec::ImmutableVector;
use tuple::Tuple2;
use option::{Option, Some, None};

pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
None => c,
Some(index) => LuLl_table[index].val1()
}
}

pub fn to_upper(c: char) -> char {
match bsearch_case_table(c, LlLu_table) {
None => c,
Some(index) => LlLu_table[index].val1()
}
}

fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<uint> {
table.bsearch(|&(key, _)| {
if c == key { Equal }
else if key < c { Less }
else { Greater }
})
}
"""
    f.write("pub mod conversions {\n")
    f.write(rust_helpers)
    # Upper->lower table first, then lower->upper, matching the names the
    # Rust helpers above expect.
    for table_name, mapping in (("LuLl", upperlower), ("LlLu", lowerupper)):
        emit_caseconversion_table(f, table_name, mapping)
    f.write("}\n")

def emit_caseconversion_table(f, name, table):
    """Write one static Rust table of (char, char) pairs to `f`.

    f     -- writable file object receiving the generated Rust code
    name  -- table name prefix; the static is emitted as `<name>_table`
    table -- dict mapping a source code point to its converted code point

    Entries are written in ascending order of the source code point; the
    generated lookup code binary-searches the table on that first element.
    """
    f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
    entries = sorted(table.iteritems(), key=lambda entry: entry[0])
    for position, (src, dst) in enumerate(entries):
        # ch_prefix supplies the text that precedes the entry at this index.
        f.write(ch_prefix(position))
        f.write("(%s, %s)" % (escape_char(src), escape_char(dst)))
    f.write("\n ];\n\n")

def format_table_content(f, content, indent):
line = " "*indent
first = True
Expand Down Expand Up @@ -359,7 +395,8 @@ def emit_decomp_module(f, canon, compat, combine):
os.remove(i);
rf = open(r, "w")

(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt")
(canon_decomp, compat_decomp, gencats,
combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")

# Preamble
rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
Expand All @@ -379,13 +416,16 @@ def emit_decomp_module(f, canon, compat, combine):

''')

emit_bsearch_range_table(rf);
emit_property_module(rf, "general_category", gencats)

emit_decomp_module(rf, canon_decomp, compat_decomp, combines)

derived = load_properties("DerivedCoreProperties.txt",
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])

emit_property_module(rf, "derived_property", derived)

props = load_properties("PropList.txt", ["White_Space"])
emit_property_module(rf, "property", props)
emit_conversions_module(rf, lowerupper, upperlower)
73 changes: 72 additions & 1 deletion src/libstd/char.rs
Expand Up @@ -28,7 +28,7 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, property, general_category, decompose};
use unicode::{derived_property, property, general_category, decompose, conversions};

#[cfg(test)] use str::OwnedStr;

Expand Down Expand Up @@ -225,6 +225,38 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
else { None }
}

/// Converts a char to its uppercase equivalent.
///
/// The conversion performed is the common or simple case mapping:
/// it maps one Unicode codepoint (one `char` in Rust) to a single uppercase
/// codepoint according to the Unicode database at
/// ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
/// The additional SpecialCasing.txt is not considered here, as it expands to
/// multiple codepoints in some cases.
///
/// A full reference can be found here:
/// http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
///
/// # Arguments
///
/// * `c` - the char to convert
///
/// # Return value
///
/// Returns the uppercase char, or the char itself if no conversion was made.
#[inline]
pub fn to_uppercase(c: char) -> char {
// Delegates to the lookup in the `unicode::conversions` module.
conversions::to_upper(c)
}

/// Converts a char to its lowercase equivalent.
///
/// The conversion performed is the common or simple case mapping;
/// see `to_uppercase` for references and more information.
///
/// # Arguments
///
/// * `c` - the char to convert
///
/// # Return value
///
/// Returns the lowercase char, or the char itself if no conversion is
/// possible.
#[inline]
pub fn to_lowercase(c: char) -> char {
// Delegates to the lookup in the `unicode::conversions` module.
conversions::to_lower(c)
}

///
/// Converts a number to the character representing it
///
Expand Down Expand Up @@ -385,6 +417,8 @@ pub trait Char {
fn is_digit(&self) -> bool;
fn is_digit_radix(&self, radix: uint) -> bool;
fn to_digit(&self, radix: uint) -> Option<uint>;
fn to_lowercase(&self) -> char;
fn to_uppercase(&self) -> char;
fn from_digit(num: uint, radix: uint) -> Option<char>;
fn escape_unicode(&self, f: |char|);
fn escape_default(&self, f: |char|);
Expand Down Expand Up @@ -421,6 +455,10 @@ impl Char for char {

fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }

fn to_lowercase(&self) -> char { to_lowercase(*self) }

fn to_uppercase(&self) -> char { to_uppercase(*self) }

fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }

fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
Expand Down Expand Up @@ -516,6 +554,39 @@ fn test_to_digit() {
assert_eq!('$'.to_digit(36u), None);
}

// Checks `to_lowercase` on Latin and Greek capitals, and on characters that
// must come back unchanged.
#[test]
fn test_to_lowercase() {
// Latin capitals, ASCII and non-ASCII.
assert_eq!('A'.to_lowercase(), 'a');
assert_eq!('Ö'.to_lowercase(), 'ö');
// Already lowercase: expected to be returned unchanged.
assert_eq!('ß'.to_lowercase(), 'ß');
assert_eq!('Ü'.to_lowercase(), 'ü');
// A symbol is expected to be returned unchanged.
assert_eq!('💩'.to_lowercase(), '💩');
// Greek capitals.
assert_eq!('Σ'.to_lowercase(), 'σ');
assert_eq!('Τ'.to_lowercase(), 'τ');
assert_eq!('Ι'.to_lowercase(), 'ι');
assert_eq!('Γ'.to_lowercase(), 'γ');
assert_eq!('Μ'.to_lowercase(), 'μ');
assert_eq!('Α'.to_lowercase(), 'α');
// NOTE(review): repeats the 'Σ' assertion made earlier in this test.
assert_eq!('Σ'.to_lowercase(), 'σ');
}

// Checks `to_uppercase` on Latin and Greek lowercase letters, and on
// characters that must come back unchanged.
#[test]
fn test_to_uppercase() {
// Latin lowercase letters, ASCII and non-ASCII.
assert_eq!('a'.to_uppercase(), 'A');
assert_eq!('ö'.to_uppercase(), 'Ö');
// Expected to be returned unchanged by the single-char conversion.
assert_eq!('ß'.to_uppercase(), 'ß'); // not ẞ: Latin capital letter sharp s
assert_eq!('ü'.to_uppercase(), 'Ü');
// A symbol is expected to be returned unchanged.
assert_eq!('💩'.to_uppercase(), '💩');

// Greek lowercase letters.
assert_eq!('σ'.to_uppercase(), 'Σ');
assert_eq!('τ'.to_uppercase(), 'Τ');
assert_eq!('ι'.to_uppercase(), 'Ι');
assert_eq!('γ'.to_uppercase(), 'Γ');
assert_eq!('μ'.to_uppercase(), 'Μ');
assert_eq!('α'.to_uppercase(), 'Α');
// Final sigma is expected to map to the same capital as regular sigma.
assert_eq!('ς'.to_uppercase(), 'Σ');
}

#[test]
fn test_is_control() {
assert!('\u0000'.is_control());
Expand Down