From eab351ef3e04290d8df17e64f2c3a442bdffd71f Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 4 Jan 2016 17:13:19 +0100 Subject: [PATCH 1/5] Cleanup unicode.py The methods related to char width are dead code since 464cdff102993ff1900eebbf65209e0a3c0be0d5; remove them. --- src/etc/unicode.py | 115 --------------------------------------------- 1 file changed, 115 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 3c1659ba2e0c4..4452be10bc274 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -271,43 +271,6 @@ def load_properties(f, interestingprops): return props -# load all widths of want_widths, except those in except_cats -def load_east_asian_width(want_widths, except_cats): - f = "EastAsianWidth.txt" - fetch(f) - widths = {} - re1 = re.compile("^([0-9A-F]+);(\w+) +# (\w+)") - re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)") - - for line in fileinput.input(f): - width = None - d_lo = 0 - d_hi = 0 - cat = None - m = re1.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(1) - width = m.group(2) - cat = m.group(3) - else: - m = re2.match(line) - if m: - d_lo = m.group(1) - d_hi = m.group(2) - width = m.group(3) - cat = m.group(4) - else: - continue - if cat in except_cats or width not in want_widths: - continue - d_lo = int(d_lo, 16) - d_hi = int(d_hi, 16) - if width not in widths: - widths[width] = [] - widths[width].append((d_lo, d_hi)) - return widths - def escape_char(c): return "'\\u{%x}'" % c if c != 0 else "'\\0'" @@ -398,47 +361,6 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): is_pub=False, t_type = t_type, pfun=pfun) f.write("}\n\n") -def emit_charwidth_module(f, width_table): - f.write("pub mod charwidth {\n") - f.write(" use core::option::Option;\n") - f.write(" use core::option::Option::{Some, None};\n") - f.write(" use core::result::Result::{Ok, Err};\n") - f.write(""" - fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 { - use 
core::cmp::Ordering::{Equal, Less, Greater}; - match r.binary_search_by(|&(lo, hi, _, _)| { - if lo <= c && c <= hi { Equal } - else if hi < c { Less } - else { Greater } - }) { - Ok(idx) => { - let (_, _, r_ncjk, r_cjk) = r[idx]; - if is_cjk { r_cjk } else { r_ncjk } - } - Err(_) => 1 - } - } -""") - - f.write(""" - pub fn width(c: char, is_cjk: bool) -> Option { - match c as usize { - _c @ 0 => Some(0), // null is zero width - cu if cu < 0x20 => None, // control sequences have no width - cu if cu < 0x7F => Some(1), // ASCII - cu if cu < 0xA0 => None, // more control sequences - _ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize) - } - } - -""") - - f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n") - f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n") - emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False, - pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3])) - f.write("}\n\n") - def emit_norm_module(f, canon, compat, combine, norm_props): canon_keys = canon.keys() canon_keys.sort() @@ -459,43 +381,6 @@ def emit_norm_module(f, canon, compat, combine, norm_props): canon_comp_keys = canon_comp.keys() canon_comp_keys.sort() -def remove_from_wtable(wtable, val): - wtable_out = [] - while wtable: - if wtable[0][1] < val: - wtable_out.append(wtable.pop(0)) - elif wtable[0][0] > val: - break - else: - (wt_lo, wt_hi, width, width_cjk) = wtable.pop(0) - if wt_lo == wt_hi == val: - continue - elif wt_lo == val: - wtable_out.append((wt_lo+1, wt_hi, width, width_cjk)) - elif wt_hi == val: - wtable_out.append((wt_lo, wt_hi-1, width, width_cjk)) - else: - wtable_out.append((wt_lo, val-1, width, width_cjk)) - wtable_out.append((val+1, wt_hi, width, width_cjk)) - if wtable: - wtable_out.extend(wtable) - return wtable_out - - - -def optimize_width_table(wtable): - wtable_out = [] - w_this = wtable.pop(0) - while wtable: - if 
w_this[1] == wtable[0][0] - 1 and w_this[2:3] == wtable[0][2:3]: - w_tmp = wtable.pop(0) - w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3]) - else: - wtable_out.append(w_this) - w_this = wtable.pop(0) - wtable_out.append(w_this) - return wtable_out - if __name__ == "__main__": r = "tables.rs" if os.path.exists(r): From b081436ca4e20c864436bf12a0b0fa4bb82c049a Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 4 Jan 2016 17:27:51 +0100 Subject: [PATCH 2/5] Improve formatting of tables.rs Make unicode.py generate a tables.rs which is more conformant to usual Rust formatting (as per `rustfmt`). --- src/etc/unicode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 4452be10bc274..5a9fbdd324043 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -326,15 +326,15 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { - None => [c, '\\0', '\\0'], - Some(index) => to_lowercase_table[index].1 + None => [c, '\\0', '\\0'], + Some(index) => to_lowercase_table[index].1, } } pub fn to_upper(c: char) -> [char; 3] { match bsearch_case_table(c, to_uppercase_table) { None => [c, '\\0', '\\0'], - Some(index) => to_uppercase_table[index].1 + Some(index) => to_uppercase_table[index].1, } } From cf3fcf7758f4e568a5b97bdeffe7131a167e918f Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 4 Jan 2016 17:29:41 +0100 Subject: [PATCH 3/5] Reuse standard methods Do not hand-code `Result::ok` or `cmp` in tables.rs. 
--- src/etc/unicode.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 5a9fbdd324043..57bb36ce994c3 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -319,10 +319,8 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): f.write("pub mod conversions {") f.write(""" - use core::cmp::Ordering::{Equal, Less, Greater}; use core::option::Option; use core::option::Option::{Some, None}; - use core::result::Result::{Ok, Err}; pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { @@ -339,14 +337,7 @@ def emit_conversions_module(f, to_upper, to_lower, to_title): } fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option { - match table.binary_search_by(|&(key, _)| { - if c == key { Equal } - else if key < c { Less } - else { Greater } - }) { - Ok(i) => Some(i), - Err(_) => None, - } + table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() } """) From aa77f39ccf2786b864206f24e85661eee63022a8 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 4 Jan 2016 17:35:06 +0100 Subject: [PATCH 4/5] Improve the range comparison As mentioned in #29734, the range comparison closure can be improved. The LLVM IR and the assembly from the new version are much simpler, and unfortunately we cannot rely on the compiler to optimise this much, as it would need to know that `lo <= hi`. Besides the simpler code, there might also be a performance advantage, although it is unlikely to appear on benchmarks, as we are doing a binary search, which should always involve few comparisons.
The code is available on the playpen for ease of comparison: http://is.gd/4raMmH --- src/etc/unicode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index 57bb36ce994c3..10b864a902dc0 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -279,12 +279,12 @@ def emit_bsearch_range_table(f): fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool { use core::cmp::Ordering::{Equal, Less, Greater}; r.binary_search_by(|&(lo, hi)| { - if lo <= c && c <= hi { - Equal + if c < lo { + Greater } else if hi < c { Less } else { - Greater + Equal } }) .is_ok() From 3fff63400b3032f64cdd3d0743d9a9c2cab019b4 Mon Sep 17 00:00:00 2001 From: Andrea Canciani Date: Mon, 4 Jan 2016 17:49:16 +0100 Subject: [PATCH 5/5] Update librustc_unicode/tables.rs with a new version generated by src/etc/unicode.py. --- src/librustc_unicode/tables.rs | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/librustc_unicode/tables.rs b/src/librustc_unicode/tables.rs index cf75cf5257714..a147bea791c47 100644 --- a/src/librustc_unicode/tables.rs +++ b/src/librustc_unicode/tables.rs @@ -16,13 +16,18 @@ /// that the unicode parts of `CharExt` and `UnicodeStrPrelude` traits are based on. 
pub const UNICODE_VERSION: (u64, u64, u64) = (8, 0, 0); -fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { +fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool { use core::cmp::Ordering::{Equal, Less, Greater}; - r.binary_search_by(|&(lo,hi)| { - if lo <= c && c <= hi { Equal } - else if hi < c { Less } - else { Greater } - }).is_ok() + r.binary_search_by(|&(lo, hi)| { + if c < lo { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() } pub mod general_category { @@ -1188,34 +1193,25 @@ pub mod property { } pub mod conversions { - use core::cmp::Ordering::{Equal, Less, Greater}; use core::option::Option; use core::option::Option::{Some, None}; - use core::result::Result::{Ok, Err}; pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { - None => [c, '\0', '\0'], - Some(index) => to_lowercase_table[index].1 + None => [c, '\0', '\0'], + Some(index) => to_lowercase_table[index].1, } } pub fn to_upper(c: char) -> [char; 3] { match bsearch_case_table(c, to_uppercase_table) { None => [c, '\0', '\0'], - Some(index) => to_uppercase_table[index].1 + Some(index) => to_uppercase_table[index].1, } } fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option { - match table.binary_search_by(|&(key, _)| { - if c == key { Equal } - else if key < c { Less } - else { Greater } - }) { - Ok(i) => Some(i), - Err(_) => None, - } + table.binary_search_by(|&(key, _)| key.cmp(&c)).ok() } const to_lowercase_table: &'static [(char, [char; 3])] = &[